From bd38b56774208ea305b8817204a720c87dc371a7 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Sat, 22 Apr 2006 18:09:08 -0400 Subject: [PATCH 01/50] Move TLB faults into the normal fault classes. Now they are executed when the fault is invoked. --HG-- extra : convert_revision : b5f00fff277e863b3fe43422bc39d0487c482e60 --- arch/alpha/faults.cc | 45 +++++++++++++++++++ arch/alpha/faults.hh | 97 +++++++++++++++++++++++++++++++++++++--- arch/alpha/tlb.cc | 104 +++++++++++++------------------------------ arch/alpha/tlb.hh | 7 +-- 4 files changed, 167 insertions(+), 86 deletions(-) diff --git a/arch/alpha/faults.cc b/arch/alpha/faults.cc index e0918da21..0083aa9f3 100644 --- a/arch/alpha/faults.cc +++ b/arch/alpha/faults.cc @@ -30,6 +30,9 @@ #include "cpu/exec_context.hh" #include "cpu/base.hh" #include "base/trace.hh" +#if FULL_SYSTEM +#include "arch/alpha/ev5.hh" +#endif namespace AlphaISA { @@ -70,6 +73,10 @@ FaultName DtbAcvFault::_name = "dfault"; FaultVect DtbAcvFault::_vect = 0x0381; FaultStat DtbAcvFault::_count; +FaultName DtbAlignmentFault::_name = "unalign"; +FaultVect DtbAlignmentFault::_vect = 0x0301; +FaultStat DtbAlignmentFault::_count; + FaultName ItbMissFault::_name = "itbmiss"; FaultVect ItbMissFault::_vect = 0x0181; FaultStat ItbMissFault::_count; @@ -125,6 +132,44 @@ void ArithmeticFault::invoke(ExecContext * xc) panic("Arithmetic traps are unimplemented!"); } +void DtbFault::invoke(ExecContext * xc) +{ + // Set fault address and flags. Even though we're modeling an + // EV5, we use the EV6 technique of not latching fault registers + // on VPTE loads (instead of locking the registers until IPR_VA is + // read, like the EV5). The EV6 approach is cleaner and seems to + // work with EV5 PAL code, but not the other way around. 
+ if (!xc->misspeculating() + && !(reqFlags & VPTE) && !(reqFlags & NO_FAULT)) { + // set VA register with faulting address + xc->setMiscReg(AlphaISA::IPR_VA, vaddr); + + // set MM_STAT register flags + xc->setMiscReg(AlphaISA::IPR_MM_STAT, + (((EV5::Opcode(xc->getInst()) & 0x3f) << 11) + | ((EV5::Ra(xc->getInst()) & 0x1f) << 6) + | (flags & 0x3f))); + + // set VA_FORM register with faulting formatted address + xc->setMiscReg(AlphaISA::IPR_VA_FORM, + xc->readMiscReg(AlphaISA::IPR_MVPTBR) | (vaddr.vpn() << 3)); + } + + AlphaFault::invoke(xc); +} + +void ItbFault::invoke(ExecContext * xc) +{ + if (!xc->misspeculating()) { + xc->setMiscReg(AlphaISA::IPR_ITB_TAG, pc); + xc->setMiscReg(AlphaISA::IPR_IFAULT_VA_FORM, + xc->readMiscReg(AlphaISA::IPR_IVPTBR) | + (AlphaISA::VAddr(pc).vpn() << 3)); + } + + AlphaFault::invoke(xc); +} + #endif } // namespace AlphaISA diff --git a/arch/alpha/faults.hh b/arch/alpha/faults.hh index 1a196cc94..e8ccc6b79 100644 --- a/arch/alpha/faults.hh +++ b/arch/alpha/faults.hh @@ -29,6 +29,7 @@ #ifndef __ALPHA_FAULTS_HH__ #define __ALPHA_FAULTS_HH__ +#include "arch/alpha/isa_traits.hh" #include "sim/faults.hh" // The design of the "name" and "vect" functions is in sim/faults.hh @@ -130,85 +131,167 @@ class InterruptFault : public AlphaFault FaultStat & countStat() {return _count;} }; -class NDtbMissFault : public AlphaFault +class DtbFault : public AlphaFault +{ +#if FULL_SYSTEM + private: + AlphaISA::VAddr vaddr; + uint32_t reqFlags; + uint64_t flags; + public: + DtbFault(AlphaISA::VAddr _vaddr, uint32_t _reqFlags, uint64_t _flags) + : vaddr(_vaddr), reqFlags(_reqFlags), flags(_flags) + { } +#endif + FaultName name() = 0; + FaultVect vect() = 0; + FaultStat & countStat() = 0; +#if FULL_SYSTEM + void invoke(ExecContext * xc); +#endif +}; + +class NDtbMissFault : public DtbFault { private: static FaultName _name; static FaultVect _vect; static FaultStat _count; public: +#if FULL_SYSTEM + NDtbMissFault(AlphaISA::VAddr vaddr, uint32_t reqFlags, 
uint64_t flags) + : DtbFault(vaddr, reqFlags, flags) + { } +#endif FaultName name() {return _name;} FaultVect vect() {return _vect;} FaultStat & countStat() {return _count;} }; -class PDtbMissFault : public AlphaFault +class PDtbMissFault : public DtbFault { private: static FaultName _name; static FaultVect _vect; static FaultStat _count; public: +#if FULL_SYSTEM + PDtbMissFault(AlphaISA::VAddr vaddr, uint32_t reqFlags, uint64_t flags) + : DtbFault(vaddr, reqFlags, flags) + { } +#endif FaultName name() {return _name;} FaultVect vect() {return _vect;} FaultStat & countStat() {return _count;} }; -class DtbPageFault : public AlphaFault +class DtbPageFault : public DtbFault { private: static FaultName _name; static FaultVect _vect; static FaultStat _count; public: +#if FULL_SYSTEM + DtbPageFault(AlphaISA::VAddr vaddr, uint32_t reqFlags, uint64_t flags) + : DtbFault(vaddr, reqFlags, flags) + { } +#endif FaultName name() {return _name;} FaultVect vect() {return _vect;} FaultStat & countStat() {return _count;} }; -class DtbAcvFault : public AlphaFault +class DtbAcvFault : public DtbFault { private: static FaultName _name; static FaultVect _vect; static FaultStat _count; public: +#if FULL_SYSTEM + DtbAcvFault(AlphaISA::VAddr vaddr, uint32_t reqFlags, uint64_t flags) + : DtbFault(vaddr, reqFlags, flags) + { } +#endif FaultName name() {return _name;} FaultVect vect() {return _vect;} FaultStat & countStat() {return _count;} }; -class ItbMissFault : public AlphaFault +class DtbAlignmentFault : public DtbFault { private: static FaultName _name; static FaultVect _vect; static FaultStat _count; public: +#if FULL_SYSTEM + DtbAlignmentFault(AlphaISA::VAddr vaddr, uint32_t reqFlags, uint64_t flags) + : DtbFault(vaddr, reqFlags, flags) + { } +#endif FaultName name() {return _name;} FaultVect vect() {return _vect;} FaultStat & countStat() {return _count;} }; -class ItbPageFault : public AlphaFault +class ItbFault : public AlphaFault +{ + private: + Addr pc; + public: + ItbFault(Addr 
_pc) + : pc(_pc) + { } + FaultName name() = 0; + FaultVect vect() = 0; + FaultStat & countStat() = 0; +#if FULL_SYSTEM + void invoke(ExecContext * xc); +#endif +}; + +class ItbMissFault : public ItbFault { private: static FaultName _name; static FaultVect _vect; static FaultStat _count; public: + ItbMissFault(Addr pc) + : ItbFault(pc) + { } FaultName name() {return _name;} FaultVect vect() {return _vect;} FaultStat & countStat() {return _count;} }; -class ItbAcvFault : public AlphaFault +class ItbPageFault : public ItbFault { private: static FaultName _name; static FaultVect _vect; static FaultStat _count; public: + ItbPageFault(Addr pc) + : ItbFault(pc) + { } + FaultName name() {return _name;} + FaultVect vect() {return _vect;} + FaultStat & countStat() {return _count;} +}; + +class ItbAcvFault : public ItbFault +{ + private: + static FaultName _name; + static FaultVect _vect; + static FaultStat _count; + public: + ItbAcvFault(Addr pc) + : ItbFault(pc) + { } FaultName name() {return _name;} FaultVect vect() {return _vect;} FaultStat & countStat() {return _count;} diff --git a/arch/alpha/tlb.cc b/arch/alpha/tlb.cc index e30a8e595..562235ef8 100644 --- a/arch/alpha/tlb.cc +++ b/arch/alpha/tlb.cc @@ -290,17 +290,6 @@ AlphaITB::regStats() accesses = hits + misses; } -void -AlphaITB::fault(Addr pc, ExecContext *xc) const -{ - if (!xc->misspeculating()) { - xc->setMiscReg(AlphaISA::IPR_ITB_TAG, pc); - xc->setMiscReg(AlphaISA::IPR_IFAULT_VA_FORM, - xc->readMiscReg(AlphaISA::IPR_IVPTBR) | - (AlphaISA::VAddr(pc).vpn() << 3)); - } -} - Fault AlphaITB::translate(MemReqPtr &req) const @@ -319,9 +308,8 @@ AlphaITB::translate(MemReqPtr &req) const } else { // verify that this is a good virtual address if (!validVirtualAddress(req->vaddr)) { - fault(req->vaddr, req->xc); acv++; - return new ItbAcvFault; + return new ItbAcvFault(req->vaddr); } @@ -336,9 +324,8 @@ AlphaITB::translate(MemReqPtr &req) const // only valid in kernel mode if (ICM_CM(xc->readMiscReg(AlphaISA::IPR_ICM)) 
!= AlphaISA::mode_kernel) { - fault(req->vaddr, req->xc); acv++; - return new ItbAcvFault; + return new ItbAcvFault(req->vaddr); } req->paddr = req->vaddr & PAddrImplMask; @@ -358,9 +345,8 @@ AlphaITB::translate(MemReqPtr &req) const asn); if (!pte) { - fault(req->vaddr, req->xc); misses++; - return new ItbPageFault; + return new ItbPageFault(req->vaddr); } req->paddr = (pte->ppn << AlphaISA::PageShift) + @@ -370,9 +356,8 @@ AlphaITB::translate(MemReqPtr &req) const if (!(pte->xre & (1 << ICM_CM(xc->readMiscReg(AlphaISA::IPR_ICM))))) { // instruction access fault - fault(req->vaddr, req->xc); acv++; - return new ItbAcvFault; + return new ItbAcvFault(req->vaddr); } hits++; @@ -465,34 +450,6 @@ AlphaDTB::regStats() accesses = read_accesses + write_accesses; } -void -AlphaDTB::fault(MemReqPtr &req, uint64_t flags) const -{ - ExecContext *xc = req->xc; - AlphaISA::VAddr vaddr = req->vaddr; - - // Set fault address and flags. Even though we're modeling an - // EV5, we use the EV6 technique of not latching fault registers - // on VPTE loads (instead of locking the registers until IPR_VA is - // read, like the EV5). The EV6 approach is cleaner and seems to - // work with EV5 PAL code, but not the other way around. - if (!xc->misspeculating() - && !(req->flags & VPTE) && !(req->flags & NO_FAULT)) { - // set VA register with faulting address - xc->setMiscReg(AlphaISA::IPR_VA, req->vaddr); - - // set MM_STAT register flags - xc->setMiscReg(AlphaISA::IPR_MM_STAT, - (((Opcode(xc->getInst()) & 0x3f) << 11) - | ((Ra(xc->getInst()) & 0x1f) << 6) - | (flags & 0x3f))); - - // set VA_FORM register with faulting formatted address - xc->setMiscReg(AlphaISA::IPR_VA_FORM, - xc->readMiscReg(AlphaISA::IPR_MVPTBR) | (vaddr.vpn() << 3)); - } -} - Fault AlphaDTB::translate(MemReqPtr &req, bool write) const { @@ -507,10 +464,10 @@ AlphaDTB::translate(MemReqPtr &req, bool write) const * Check for alignment faults */ if (req->vaddr & (req->size - 1)) { - fault(req, write ? 
MM_STAT_WR_MASK : 0); DPRINTF(TLB, "Alignment Fault on %#x, size = %d", req->vaddr, req->size); - return genAlignmentFault(); + uint64_t flags = write ? MM_STAT_WR_MASK : 0; + return new DtbAlignmentFault(req->vaddr, req->flags, flags); } if (pc & 0x1) { @@ -525,12 +482,11 @@ AlphaDTB::translate(MemReqPtr &req, bool write) const } else { // verify that this is a good virtual address if (!validVirtualAddress(req->vaddr)) { - fault(req, (write ? MM_STAT_WR_MASK : 0) | - MM_STAT_BAD_VA_MASK | - MM_STAT_ACV_MASK); - if (write) { write_acv++; } else { read_acv++; } - return new DtbPageFault; + uint64_t flags = (write ? MM_STAT_WR_MASK : 0) | + MM_STAT_BAD_VA_MASK | + MM_STAT_ACV_MASK; + return new DtbPageFault(req->vaddr, req->flags, flags); } // Check for "superpage" mapping @@ -544,10 +500,10 @@ AlphaDTB::translate(MemReqPtr &req, bool write) const // only valid in kernel mode if (DTB_CM_CM(xc->readMiscReg(AlphaISA::IPR_DTB_CM)) != AlphaISA::mode_kernel) { - fault(req, ((write ? MM_STAT_WR_MASK : 0) | - MM_STAT_ACV_MASK)); if (write) { write_acv++; } else { read_acv++; } - return new DtbAcvFault; + uint64_t flags = ((write ? MM_STAT_WR_MASK : 0) | + MM_STAT_ACV_MASK); + return new DtbAcvFault(req->vaddr, req->flags, flags); } req->paddr = req->vaddr & PAddrImplMask; @@ -574,12 +530,14 @@ AlphaDTB::translate(MemReqPtr &req, bool write) const if (!pte) { // page fault - fault(req, (write ? MM_STAT_WR_MASK : 0) | - MM_STAT_DTB_MISS_MASK); if (write) { write_misses++; } else { read_misses++; } + uint64_t flags = (write ? MM_STAT_WR_MASK : 0) | + MM_STAT_DTB_MISS_MASK; return (req->flags & VPTE) ? 
- (Fault)(new PDtbMissFault) : - (Fault)(new NDtbMissFault); + (Fault)(new PDtbMissFault(req->vaddr, req->flags, + flags)) : + (Fault)(new NDtbMissFault(req->vaddr, req->flags, + flags)); } req->paddr = (pte->ppn << AlphaISA::PageShift) + @@ -588,29 +546,29 @@ AlphaDTB::translate(MemReqPtr &req, bool write) const if (write) { if (!(pte->xwe & MODE2MASK(mode))) { // declare the instruction access fault - fault(req, MM_STAT_WR_MASK | - MM_STAT_ACV_MASK | - (pte->fonw ? MM_STAT_FONW_MASK : 0)); write_acv++; - return new DtbPageFault; + uint64_t flags = MM_STAT_WR_MASK | + MM_STAT_ACV_MASK | + (pte->fonw ? MM_STAT_FONW_MASK : 0); + return new DtbPageFault(req->vaddr, req->flags, flags); } if (pte->fonw) { - fault(req, MM_STAT_WR_MASK | - MM_STAT_FONW_MASK); write_acv++; - return new DtbPageFault; + uint64_t flags = MM_STAT_WR_MASK | + MM_STAT_FONW_MASK; + return new DtbPageFault(req->vaddr, req->flags, flags); } } else { if (!(pte->xre & MODE2MASK(mode))) { - fault(req, MM_STAT_ACV_MASK | - (pte->fonr ? MM_STAT_FONR_MASK : 0)); read_acv++; - return new DtbAcvFault; + uint64_t flags = MM_STAT_ACV_MASK | + (pte->fonr ? 
MM_STAT_FONR_MASK : 0); + return new DtbAcvFault(req->vaddr, req->flags, flags); } if (pte->fonr) { - fault(req, MM_STAT_FONR_MASK); read_acv++; - return new DtbPageFault; + uint64_t flags = MM_STAT_FONR_MASK; + return new DtbPageFault(req->vaddr, req->flags, flags); } } } diff --git a/arch/alpha/tlb.hh b/arch/alpha/tlb.hh index de955fa46..676345f01 100644 --- a/arch/alpha/tlb.hh +++ b/arch/alpha/tlb.hh @@ -31,6 +31,7 @@ #include +#include "arch/alpha/ev5.hh" #include "arch/alpha/isa_traits.hh" #include "arch/alpha/faults.hh" #include "base/statistics.hh" @@ -87,9 +88,6 @@ class AlphaITB : public AlphaTLB mutable Stats::Scalar<> acv; mutable Stats::Formula accesses; - protected: - void fault(Addr pc, ExecContext *xc) const; - public: AlphaITB(const std::string &name, int size); virtual void regStats(); @@ -113,9 +111,6 @@ class AlphaDTB : public AlphaTLB Stats::Formula acv; Stats::Formula accesses; - protected: - void fault(MemReqPtr &req, uint64_t flags) const; - public: AlphaDTB(const std::string &name, int size); virtual void regStats(); From de8baeb58aa2b86c56b0edd6af7541d8fcb4efdb Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Sat, 22 Apr 2006 18:11:54 -0400 Subject: [PATCH 02/50] Move quiesce event to its own class. SConscript: Move quiesce event to its own file/class. 
--HG-- extra : convert_revision : 6aa7863adb529fc03142666213c3ec348825bd3b --- SConscript | 1 + cpu/cpu_exec_context.cc | 39 ++++++++++++++++----------------------- cpu/cpu_exec_context.hh | 18 ++---------------- cpu/quiesce_event.cc | 20 ++++++++++++++++++++ cpu/quiesce_event.hh | 23 +++++++++++++++++++++++ 5 files changed, 62 insertions(+), 39 deletions(-) create mode 100644 cpu/quiesce_event.cc create mode 100644 cpu/quiesce_event.hh diff --git a/SConscript b/SConscript index 8f2ae761d..062661557 100644 --- a/SConscript +++ b/SConscript @@ -85,6 +85,7 @@ base_sources = Split(''' cpu/cpu_exec_context.cc cpu/exetrace.cc cpu/pc_event.cc + cpu/quiesce_event.cc cpu/static_inst.cc cpu/sampler/sampler.cc cpu/trace/reader/mem_trace_reader.cc diff --git a/cpu/cpu_exec_context.cc b/cpu/cpu_exec_context.cc index b7238e73a..363244e60 100644 --- a/cpu/cpu_exec_context.cc +++ b/cpu/cpu_exec_context.cc @@ -38,6 +38,7 @@ #include "base/output.hh" #include "base/trace.hh" #include "cpu/profile.hh" +#include "cpu/quiesce_event.hh" #include "kern/kernel_stats.hh" #include "sim/serialize.hh" #include "sim/sim_exit.hh" @@ -57,10 +58,12 @@ CPUExecContext::CPUExecContext(BaseCPU *_cpu, int _thread_num, System *_sys, : _status(ExecContext::Unallocated), cpu(_cpu), thread_num(_thread_num), cpu_id(-1), lastActivate(0), lastSuspend(0), mem(_mem), itb(_itb), dtb(_dtb), system(_sys), memctrl(_sys->memctrl), physmem(_sys->physmem), - profile(NULL), quiesceEvent(this), func_exe_inst(0), storeCondFailures(0) + profile(NULL), func_exe_inst(0), storeCondFailures(0) { proxy = new ProxyExecContext(this); + quiesceEvent = new EndQuiesceEvent(proxy); + memset(®s, 0, sizeof(RegFile)); if (cpu->params->profile) { @@ -82,7 +85,7 @@ CPUExecContext::CPUExecContext(BaseCPU *_cpu, int _thread_num, Process *_process, int _asid) : _status(ExecContext::Unallocated), cpu(_cpu), thread_num(_thread_num), cpu_id(-1), lastActivate(0), - lastSuspend(0), process(_process), mem(process->getMemory()), asid(_asid), + 
lastSuspend(0), process(_process), mem(NULL), asid(_asid), func_exe_inst(0), storeCondFailures(0) { memset(®s, 0, sizeof(RegFile)); @@ -91,7 +94,7 @@ CPUExecContext::CPUExecContext(BaseCPU *_cpu, int _thread_num, CPUExecContext::CPUExecContext(BaseCPU *_cpu, int _thread_num, FunctionalMemory *_mem, int _asid) - : cpu(_cpu), thread_num(_thread_num), process(0), mem(_mem), asid(_asid), + : cpu(_cpu), thread_num(_thread_num), process(0), mem(NULL), asid(_asid), func_exe_inst(0), storeCondFailures(0) { memset(®s, 0, sizeof(RegFile)); @@ -121,23 +124,6 @@ CPUExecContext::dumpFuncProfile() profile->dump(proxy, *os); } -CPUExecContext::EndQuiesceEvent::EndQuiesceEvent(CPUExecContext *_cpuXC) - : Event(&mainEventQueue), cpuXC(_cpuXC) -{ -} - -void -CPUExecContext::EndQuiesceEvent::process() -{ - cpuXC->activate(); -} - -const char* -CPUExecContext::EndQuiesceEvent::description() -{ - return "End Quiesce Event."; -} - void CPUExecContext::profileClear() { @@ -189,8 +175,8 @@ CPUExecContext::serialize(ostream &os) #if FULL_SYSTEM Tick quiesceEndTick = 0; - if (quiesceEvent.scheduled()) - quiesceEndTick = quiesceEvent.when(); + if (quiesceEvent->scheduled()) + quiesceEndTick = quiesceEvent->when(); SERIALIZE_SCALAR(quiesceEndTick); #endif @@ -210,7 +196,7 @@ CPUExecContext::unserialize(Checkpoint *cp, const std::string §ion) Tick quiesceEndTick; UNSERIALIZE_SCALAR(quiesceEndTick); if (quiesceEndTick) - quiesceEvent.schedule(quiesceEndTick); + quiesceEvent->schedule(quiesceEndTick); #endif } @@ -223,7 +209,14 @@ CPUExecContext::activate(int delay) lastActivate = curTick; + if (status() == ExecContext::Unallocated) { + cpu->activateWhenReady(thread_num); + return; + } + _status = ExecContext::Active; + + // status() == Suspended cpu->activateContext(thread_num, delay); } diff --git a/cpu/cpu_exec_context.hh b/cpu/cpu_exec_context.hh index beaf67352..40153ff08 100644 --- a/cpu/cpu_exec_context.hh +++ b/cpu/cpu_exec_context.hh @@ -135,23 +135,9 @@ class CPUExecContext Addr 
profilePC; void dumpFuncProfile(); - /** Event for timing out quiesce instruction */ - struct EndQuiesceEvent : public Event - { - /** A pointer to the execution context that is quiesced */ - CPUExecContext *cpuXC; + Event *quiesceEvent; - EndQuiesceEvent(CPUExecContext *_cpuXC); - - /** Event process to occur at interrupt*/ - virtual void process(); - - /** Event description */ - virtual const char *description(); - }; - EndQuiesceEvent quiesceEvent; - - Event *getQuiesceEvent() { return &quiesceEvent; } + Event *getQuiesceEvent() { return quiesceEvent; } Tick readLastActivate() { return lastActivate; } diff --git a/cpu/quiesce_event.cc b/cpu/quiesce_event.cc new file mode 100644 index 000000000..37814ae09 --- /dev/null +++ b/cpu/quiesce_event.cc @@ -0,0 +1,20 @@ + +#include "cpu/exec_context.hh" +#include "cpu/quiesce_event.hh" + +EndQuiesceEvent::EndQuiesceEvent(ExecContext *_xc) + : Event(&mainEventQueue), xc(_xc) +{ +} + +void +EndQuiesceEvent::process() +{ + xc->activate(); +} + +const char* +EndQuiesceEvent::description() +{ + return "End Quiesce Event."; +} diff --git a/cpu/quiesce_event.hh b/cpu/quiesce_event.hh new file mode 100644 index 000000000..18e88ecce --- /dev/null +++ b/cpu/quiesce_event.hh @@ -0,0 +1,23 @@ +#ifndef __CPU_QUIESCE_EVENT_HH__ +#define __CPU_QUIESCE_EVENT_HH__ + +#include "sim/eventq.hh" + +class ExecContext; + +/** Event for timing out quiesce instruction */ +struct EndQuiesceEvent : public Event +{ + /** A pointer to the execution context that is quiesced */ + ExecContext *xc; + + EndQuiesceEvent(ExecContext *_xc); + + /** Event process to occur at interrupt*/ + virtual void process(); + + /** Event description */ + virtual const char *description(); +}; + +#endif // __CPU_QUIESCE_EVENT_HH__ From c30f91c2f634a0b55a9b9b9145b1fbe605bb1a02 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Sat, 22 Apr 2006 18:16:18 -0400 Subject: [PATCH 03/50] Namespace fix. base/timebuf.hh: namespace fix. 
--HG-- extra : convert_revision : 38e880b9394cf2923e2fb9775368cd93d719f950 --- base/timebuf.hh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/base/timebuf.hh b/base/timebuf.hh index 435803fae..f6b5b2781 100644 --- a/base/timebuf.hh +++ b/base/timebuf.hh @@ -31,8 +31,6 @@ #include -using namespace std; - template class TimeBuffer { @@ -42,7 +40,7 @@ class TimeBuffer int size; char *data; - vector index; + std::vector index; int base; void valid(int idx) From a8b03e4d017b66d7b5502a101ea5b7115827a107 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Sat, 22 Apr 2006 18:26:48 -0400 Subject: [PATCH 04/50] Updates for O3 model. arch/alpha/isa/decoder.isa: Make IPR accessing instructions serializing so they are not issued incorrectly in the O3 model. arch/alpha/isa/pal.isa: Allow IPR instructions to have flags. base/traceflags.py: Include new trace flags from the two new CPU models. cpu/SConscript: Create the templates for the split mem accessor methods. Also include the new files from the new models (the Ozone model will be checked in next). cpu/base_dyn_inst.cc: cpu/base_dyn_inst.hh: Update to the BaseDynInst for the new models. 
--HG-- extra : convert_revision : cc82db9c72ec3e29cea4c3fdff74a3843e287a35 --- arch/alpha/isa/decoder.isa | 32 +- arch/alpha/isa/pal.isa | 6 +- base/traceflags.py | 16 +- cpu/SConscript | 32 +- cpu/base_dyn_inst.cc | 167 +++- cpu/base_dyn_inst.hh | 352 ++++++-- cpu/o3/2bit_local_pred.cc | 21 +- cpu/o3/2bit_local_pred.hh | 21 +- cpu/o3/alpha_cpu.hh | 302 +++++-- cpu/o3/alpha_cpu_builder.cc | 256 +++--- cpu/o3/alpha_cpu_impl.hh | 742 +++++++++++++---- cpu/o3/alpha_dyn_inst.hh | 70 +- cpu/o3/alpha_dyn_inst_impl.hh | 110 ++- cpu/o3/alpha_impl.hh | 17 +- cpu/o3/alpha_params.hh | 58 +- cpu/o3/bpred_unit.cc | 4 + cpu/o3/bpred_unit.hh | 144 +++- cpu/o3/bpred_unit_impl.hh | 181 ++-- cpu/o3/btb.cc | 26 +- cpu/o3/btb.hh | 59 +- cpu/o3/comm.hh | 110 ++- cpu/o3/commit.cc | 2 +- cpu/o3/commit.hh | 301 ++++++- cpu/o3/commit_impl.hh | 1142 ++++++++++++++++++++++---- cpu/o3/cpu.cc | 981 ++++++++++++++++++---- cpu/o3/cpu.hh | 356 ++++++-- cpu/o3/cpu_policy.hh | 33 +- cpu/o3/decode.cc | 2 +- cpu/o3/decode.hh | 170 +++- cpu/o3/decode_impl.hh | 665 ++++++++++----- cpu/o3/fetch.cc | 2 +- cpu/o3/fetch.hh | 236 +++++- cpu/o3/fetch_impl.hh | 1049 ++++++++++++++++++------ cpu/o3/free_list.cc | 36 +- cpu/o3/free_list.hh | 82 +- cpu/o3/fu_pool.cc | 281 +++++++ cpu/o3/fu_pool.hh | 159 ++++ cpu/o3/iew.cc | 2 +- cpu/o3/iew.hh | 328 ++++++-- cpu/o3/iew_impl.hh | 1454 ++++++++++++++++++++++++--------- cpu/o3/inst_queue.hh | 354 ++++++-- cpu/o3/inst_queue_impl.hh | 1252 +++++++++++++++++----------- cpu/o3/lsq.cc | 36 + cpu/o3/lsq.hh | 307 +++++++ cpu/o3/lsq_impl.hh | 645 +++++++++++++++ cpu/o3/lsq_unit.cc | 36 + cpu/o3/lsq_unit.hh | 703 ++++++++++++++++ cpu/o3/lsq_unit_impl.hh | 893 ++++++++++++++++++++ cpu/o3/mem_dep_unit.cc | 10 + cpu/o3/mem_dep_unit.hh | 216 +++-- cpu/o3/mem_dep_unit_impl.hh | 550 ++++++++----- cpu/o3/ras.cc | 18 +- cpu/o3/ras.hh | 35 +- cpu/o3/regfile.hh | 127 +-- cpu/o3/rename.cc | 2 +- cpu/o3/rename.hh | 350 ++++++-- cpu/o3/rename_impl.hh | 1422 
+++++++++++++++++++++----------- cpu/o3/rename_map.cc | 216 ++--- cpu/o3/rename_map.hh | 61 +- cpu/o3/rob.hh | 214 ++++- cpu/o3/rob_impl.hh | 636 +++++++++++--- cpu/o3/sat_counter.cc | 6 +- cpu/o3/sat_counter.hh | 10 +- cpu/o3/scoreboard.cc | 105 +++ cpu/o3/scoreboard.hh | 114 +++ cpu/o3/store_set.cc | 151 ++-- cpu/o3/store_set.hh | 47 +- cpu/o3/thread_state.hh | 143 ++++ cpu/o3/tournament_pred.cc | 47 +- cpu/o3/tournament_pred.hh | 33 +- cpu/thread_state.hh | 92 +++ python/m5/objects/FUPool.py | 8 + 72 files changed, 14628 insertions(+), 4218 deletions(-) create mode 100644 cpu/o3/fu_pool.cc create mode 100644 cpu/o3/fu_pool.hh create mode 100644 cpu/o3/lsq.cc create mode 100644 cpu/o3/lsq.hh create mode 100644 cpu/o3/lsq_impl.hh create mode 100644 cpu/o3/lsq_unit.cc create mode 100644 cpu/o3/lsq_unit.hh create mode 100644 cpu/o3/lsq_unit_impl.hh create mode 100644 cpu/o3/scoreboard.cc create mode 100644 cpu/o3/scoreboard.hh create mode 100644 cpu/o3/thread_state.hh create mode 100644 cpu/thread_state.hh create mode 100644 python/m5/objects/FUPool.py diff --git a/arch/alpha/isa/decoder.isa b/arch/alpha/isa/decoder.isa index e09673269..905ace4e1 100644 --- a/arch/alpha/isa/decoder.isa +++ b/arch/alpha/isa/decoder.isa @@ -73,7 +73,7 @@ decode OPCODE default Unknown::unknown() { uint64_t tmp = write_result; // see stq_c Ra = (tmp == 0 || tmp == 1) ? tmp : Ra; - }}, mem_flags = LOCKED); + }}, mem_flags = LOCKED, inst_flags = IsNonSpeculative); 0x2f: stq_c({{ Mem.uq = Ra; }}, {{ uint64_t tmp = write_result; @@ -85,7 +85,7 @@ decode OPCODE default Unknown::unknown() { // mailbox access, and we don't update the // result register at all. Ra = (tmp == 0 || tmp == 1) ? tmp : Ra; - }}, mem_flags = LOCKED); + }}, mem_flags = LOCKED, inst_flags = IsNonSpeculative); } format IntegerOperate { @@ -591,8 +591,8 @@ decode OPCODE default Unknown::unknown() { 0x02e: fcmovle({{ Fc = (Fa <= 0) ? Fb : Fc; }}); 0x02f: fcmovgt({{ Fc = (Fa > 0) ? 
Fb : Fc; }}); - 0x024: mt_fpcr({{ FPCR = Fa.uq; }}); - 0x025: mf_fpcr({{ Fa.uq = FPCR; }}); + 0x024: mt_fpcr({{ FPCR = Fa.uq; }}, IsSerializing, IsSerializeBefore); + 0x025: mf_fpcr({{ Fa.uq = FPCR; }}, IsSerializing, IsSerializeBefore); } } @@ -623,7 +623,7 @@ decode OPCODE default Unknown::unknown() { #else Ra = curTick; #endif - }}); + }}, IsNonSpeculative); // All of the barrier instructions below do nothing in // their execute() methods (hence the empty code blocks). @@ -641,8 +641,8 @@ decode OPCODE default Unknown::unknown() { // a barrier on integer and FP traps. "EXCB is thus a // superset of TRAPB." (Alpha ARM, Sec 4.11.4) We treat // them the same though. - 0x0000: trapb({{ }}, IsSerializing, No_OpClass); - 0x0400: excb({{ }}, IsSerializing, No_OpClass); + 0x0000: trapb({{ }}, IsSerializing, IsSerializeBefore, No_OpClass); + 0x0400: excb({{ }}, IsSerializing, IsSerializeBefore, No_OpClass); 0x4000: mb({{ }}, IsMemBarrier, MemReadOp); 0x4400: wmb({{ }}, IsWriteBarrier, MemWriteOp); } @@ -694,11 +694,11 @@ decode OPCODE default Unknown::unknown() { }}, IsNonSpeculative); 0x83: callsys({{ xc->syscall(); - }}, IsNonSpeculative); + }}, IsNonSpeculative, IsSerializeAfter); // Read uniq reg into ABI return value register (r0) - 0x9e: rduniq({{ R0 = Runiq; }}); + 0x9e: rduniq({{ R0 = Runiq; }}, IsSerializing, IsSerializeBefore); // Write uniq reg with value from ABI arg register (r16) - 0x9f: wruniq({{ Runiq = R16; }}); + 0x9f: wruniq({{ Runiq = R16; }}, IsSerializing, IsSerializeBefore); } } #endif @@ -735,7 +735,7 @@ decode OPCODE default Unknown::unknown() { format HwMoveIPR { 1: hw_mfpr({{ Ra = xc->readMiscRegWithEffect(ipr_index, fault); - }}); + }}, IsSerializing, IsSerializeBefore); } } @@ -745,14 +745,14 @@ decode OPCODE default Unknown::unknown() { 1: hw_mtpr({{ xc->setMiscRegWithEffect(ipr_index, Ra); if (traceData) { traceData->setData(Ra); } - }}); + }}, IsSerializing, IsSerializeBefore); } } format BasicOperate { 0x1e: decode PALMODE { 0: 
OpcdecFault::hw_rei(); - 1:hw_rei({{ xc->hwrei(); }}, IsSerializing); + 1:hw_rei({{ xc->hwrei(); }}, IsSerializing, IsSerializeBefore); } // M5 special opcodes use the reserved 0x01 opcode space @@ -762,13 +762,13 @@ decode OPCODE default Unknown::unknown() { }}, IsNonSpeculative); 0x01: quiesce({{ AlphaPseudo::quiesce(xc->xcBase()); - }}, IsNonSpeculative); + }}, IsNonSpeculative, IsQuiesce); 0x02: quiesceNs({{ AlphaPseudo::quiesceNs(xc->xcBase(), R16); - }}, IsNonSpeculative); + }}, IsNonSpeculative, IsQuiesce); 0x03: quiesceCycles({{ AlphaPseudo::quiesceCycles(xc->xcBase(), R16); - }}, IsNonSpeculative); + }}, IsNonSpeculative, IsQuiesce); 0x04: quiesceTime({{ R0 = AlphaPseudo::quiesceTime(xc->xcBase()); }}, IsNonSpeculative); diff --git a/arch/alpha/isa/pal.isa b/arch/alpha/isa/pal.isa index e07bea5a8..63af56359 100644 --- a/arch/alpha/isa/pal.isa +++ b/arch/alpha/isa/pal.isa @@ -259,9 +259,11 @@ output decoder {{ } }}; -def format HwMoveIPR(code) {{ +def format HwMoveIPR(code, *flags) {{ + all_flags = ['IprAccessOp'] + all_flags += flags iop = InstObjParams(name, Name, 'HwMoveIPR', CodeBlock(code), - ['IprAccessOp']) + all_flags) header_output = BasicDeclare.subst(iop) decoder_output = BasicConstructor.subst(iop) decode_block = BasicDecode.subst(iop) diff --git a/base/traceflags.py b/base/traceflags.py index e814a00fb..bd0f258a0 100644 --- a/base/traceflags.py +++ b/base/traceflags.py @@ -133,15 +133,24 @@ baseFlags = [ 'ROB', 'FreeList', 'RenameMap', - 'LDSTQ', + 'LSQ', + 'LSQUnit', 'StoreSet', 'MemDepUnit', 'DynInst', 'FullCPU', 'CommitRate', - 'OoOCPU', + 'OzoneCPU', + 'FE', + 'IBE', + 'BE', + 'OzoneLSQ', 'HWPrefetch', 'Stack', + 'DependGraph', + 'Activity', + 'Scoreboard', + 'Writeback' ] # @@ -159,7 +168,8 @@ compoundFlagMap = { 'EthernetAll' : [ 'Ethernet', 'EthernetPIO', 'EthernetDMA', 'EthernetData' , 'EthernetDesc', 'EthernetIntr', 'EthernetSM', 'EthernetCksum' ], 'EthernetNoData' : [ 'Ethernet', 'EthernetPIO', 'EthernetDesc', 'EthernetIntr', 
'EthernetSM', 'EthernetCksum' ], 'IdeAll' : [ 'IdeCtrl', 'IdeDisk' ], - 'FullCPUAll' : [ 'Fetch', 'Decode', 'Rename', 'IEW', 'Commit', 'IQ', 'ROB', 'FreeList', 'RenameMap', 'LDSTQ', 'StoreSet', 'MemDepUnit', 'DynInst', 'FullCPU'] + 'FullCPUAll' : [ 'Fetch', 'Decode', 'Rename', 'IEW', 'Commit', 'IQ', 'ROB', 'FreeList', 'RenameMap', 'LSQ', 'LSQUnit', 'StoreSet', 'MemDepUnit', 'DynInst', 'FullCPU', 'Activity','Scoreboard','Writeback'], + 'OzoneCPUAll' : [ 'BE', 'FE', 'IBE', 'OzoneLSQ', 'OzoneCPU'] } ############################################################# diff --git a/cpu/SConscript b/cpu/SConscript index af6bab4eb..888dbdc22 100644 --- a/cpu/SConscript +++ b/cpu/SConscript @@ -53,6 +53,14 @@ exec_sig_template = ''' virtual Fault execute(%s *xc, Trace::InstRecord *traceData) const = 0; ''' +mem_ini_sig_template = ''' +virtual Fault initiateAcc(%s *xc, Trace::InstRecord *traceData) const { panic("Not defined!"); }; +''' + +mem_comp_sig_template = ''' +virtual Fault completeAcc(uint8_t *data, %s *xc, Trace::InstRecord *traceData) const { panic("Not defined!"); return NoFault; }; +''' + # Generate header. 
def gen_cpu_exec_signatures(target, source, env): f = open(str(target[0]), 'w') @@ -63,6 +71,8 @@ def gen_cpu_exec_signatures(target, source, env): for cpu in env['CPU_MODELS']: xc_type = CpuModel.dict[cpu].strings['CPU_exec_context'] print >> f, exec_sig_template % xc_type + print >> f, mem_ini_sig_template % xc_type + print >> f, mem_comp_sig_template % xc_type print >> f, ''' #endif // __CPU_STATIC_INST_EXEC_SIGS_HH__ ''' @@ -104,20 +114,40 @@ if 'AlphaFullCPU' in env['CPU_MODELS']: o3/decode.cc o3/fetch.cc o3/free_list.cc + o3/fu_pool.cc o3/cpu.cc o3/iew.cc o3/inst_queue.cc - o3/ldstq.cc + o3/lsq_unit.cc + o3/lsq.cc o3/mem_dep_unit.cc o3/ras.cc o3/rename.cc o3/rename_map.cc o3/rob.cc o3/sat_counter.cc + o3/scoreboard.cc o3/store_set.cc o3/tournament_pred.cc ''') +if 'OzoneSimpleCPU' in env['CPU_MODELS']: + sources += Split(''' + ozone/cpu.cc + ozone/cpu_builder.cc + ozone/dyn_inst.cc + ozone/front_end.cc + ozone/inorder_back_end.cc + ozone/inst_queue.cc + ozone/rename_table.cc + ''') + +if 'OzoneCPU' in env['CPU_MODELS']: + sources += Split(''' + ozone/back_end.cc + ozone/lsq_unit.cc + ''') + # FullCPU sources are included from m5/SConscript since they're not # below this point in the file hierarchy. diff --git a/cpu/base_dyn_inst.cc b/cpu/base_dyn_inst.cc index bf7c35cad..6ce9b4455 100644 --- a/cpu/base_dyn_inst.cc +++ b/cpu/base_dyn_inst.cc @@ -26,10 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __CPU_BASE_DYN_INST_CC__ -#define __CPU_BASE_DYN_INST_CC__ - #include +#include #include #include @@ -43,6 +41,8 @@ #include "cpu/base_dyn_inst.hh" #include "cpu/o3/alpha_impl.hh" #include "cpu/o3/alpha_cpu.hh" +#include "cpu/ozone/simple_impl.hh" +#include "cpu/ozone/ozone_impl.hh" using namespace std; using namespace TheISA; @@ -54,21 +54,23 @@ using namespace TheISA; unsigned int MyHashFunc(const BaseDynInst *addr) { - unsigned a = (unsigned)addr; - unsigned hash = (((a >> 14) ^ ((a >> 2) & 0xffff))) & 0x7FFFFFFF; + unsigned a = (unsigned)addr; + unsigned hash = (((a >> 14) ^ ((a >> 2) & 0xffff))) & 0x7FFFFFFF; - return hash; + return hash; } -typedef m5::hash_map my_hash_t; +typedef m5::hash_map +my_hash_t; + my_hash_t thishash; #endif template -BaseDynInst::BaseDynInst(MachInst machInst, Addr inst_PC, +BaseDynInst::BaseDynInst(ExtMachInst machInst, Addr inst_PC, Addr pred_PC, InstSeqNum seq_num, FullCPU *cpu) - : staticInst(machInst), traceData(NULL), cpu(cpu), cpuXC(cpu->cpuXCBase()) + : staticInst(machInst), traceData(NULL), cpu(cpu)/*, xc(cpu->xcBase())*/ { seqNum = seq_num; @@ -83,6 +85,7 @@ template BaseDynInst::BaseDynInst(StaticInstPtr &_staticInst) : staticInst(_staticInst), traceData(NULL) { + seqNum = 0; initVars(); } @@ -90,8 +93,10 @@ template void BaseDynInst::initVars() { + req = NULL; effAddr = MemReq::inval_addr; physEffAddr = MemReq::inval_addr; + storeSize = 0; readyRegs = 0; @@ -100,13 +105,27 @@ BaseDynInst::initVars() issued = false; executed = false; canCommit = false; + committed = false; squashed = false; squashedInIQ = false; + squashedInLSQ = false; + squashedInROB = false; eaCalcDone = false; + memOpDone = false; + lqIdx = -1; + sqIdx = -1; + reachedCommit = false; blockingInst = false; recoverInst = false; + iqEntry = false; + robEntry = false; + + serializeBefore = false; + serializeAfter = false; + serializeHandled = false; + // Eventually make this a parameter. 
threadNumber = 0; @@ -114,22 +133,63 @@ BaseDynInst::initVars() asid = 0; // Initialize the fault to be unimplemented opcode. - fault = new UnimplementedOpcodeFault; +// fault = new UnimplementedOpcodeFault; + fault = NoFault; ++instcount; - DPRINTF(FullCPU, "DynInst: Instruction created. Instcount=%i\n", - instcount); + if (instcount > 1500) { + cpu->dumpInsts(); +#ifdef DEBUG + dumpSNList(); +#endif + assert(instcount <= 1500); + } + + DPRINTF(DynInst, "DynInst: [sn:%lli] Instruction created. Instcount=%i\n", + seqNum, instcount); + +#ifdef DEBUG + cpu->snList.insert(seqNum); +#endif } template BaseDynInst::~BaseDynInst() { + if (req) { + req = NULL; + } + + if (traceData) { + delete traceData; + } + --instcount; - DPRINTF(FullCPU, "DynInst: Instruction destroyed. Instcount=%i\n", - instcount); + + DPRINTF(DynInst, "DynInst: [sn:%lli] Instruction destroyed. Instcount=%i\n", + seqNum, instcount); +#ifdef DEBUG + cpu->snList.erase(seqNum); +#endif } +#ifdef DEBUG +template +void +BaseDynInst::dumpSNList() +{ + std::set::iterator sn_it = cpu->snList.begin(); + + int count = 0; + while (sn_it != cpu->snList.end()) { + cprintf("%i: [sn:%lli] not destroyed\n", count, (*sn_it)); + count++; + sn_it++; + } +} +#endif + template void BaseDynInst::prefetch(Addr addr, unsigned flags) @@ -139,14 +199,14 @@ BaseDynInst::prefetch(Addr addr, unsigned flags) // state. // Generate a MemReq so we can translate the effective address. - MemReqPtr req = new MemReq(addr, cpuXC->getProxy(), 1, flags); + MemReqPtr req = new MemReq(addr, thread->getXCProxy(), 1, flags); req->asid = asid; // Prefetches never cause faults. fault = NoFault; // note this is a local, not BaseDynInst::fault - Fault trans_fault = cpuXC->translateDataReadReq(req); + Fault trans_fault = cpu->translateDataReadReq(req); if (trans_fault == NoFault && !(req->flags & UNCACHEABLE)) { // It's a valid address to cacheable space. 
Record key MemReq @@ -162,15 +222,6 @@ BaseDynInst::prefetch(Addr addr, unsigned flags) effAddr = physEffAddr = MemReq::inval_addr; } - /** - * @todo - * Replace the disjoint functional memory with a unified one and remove - * this hack. - */ -#if !FULL_SYSTEM - req->paddr = req->vaddr; -#endif - if (traceData) { traceData->setAddr(addr); } @@ -184,10 +235,10 @@ BaseDynInst::writeHint(Addr addr, int size, unsigned flags) // will casue a TLB miss trap if necessary... not sure whether // that's the best thing to do or not. We don't really need the // MemReq otherwise, since wh64 has no functional effect. - MemReqPtr req = new MemReq(addr, cpuXC->getProxy(), size, flags); + MemReqPtr req = new MemReq(addr, thread->getXCProxy(), size, flags); req->asid = asid; - fault = cpuXC->translateDataWriteReq(req); + fault = cpu->translateDataWriteReq(req); if (fault == NoFault && !(req->flags & UNCACHEABLE)) { // Record key MemReq parameters so we can generate another one @@ -212,18 +263,18 @@ template Fault BaseDynInst::copySrcTranslate(Addr src) { - MemReqPtr req = new MemReq(src, cpuXC->getProxy(), 64); + MemReqPtr req = new MemReq(src, thread->getXCProxy(), 64); req->asid = asid; // translate to physical address - Fault fault = cpuXC->translateDataReadReq(req); + Fault fault = cpu->translateDataReadReq(req); if (fault == NoFault) { - cpuXC->copySrcAddr = src; - cpuXC->copySrcPhysAddr = req->paddr; + thread->copySrcAddr = src; + thread->copySrcPhysAddr = req->paddr; } else { - cpuXC->copySrcAddr = 0; - cpuXC->copySrcPhysAddr = 0; + thread->copySrcAddr = 0; + thread->copySrcPhysAddr = 0; } return fault; } @@ -236,18 +287,18 @@ Fault BaseDynInst::copy(Addr dest) { uint8_t data[64]; - FunctionalMemory *mem = cpuXC->mem; - assert(cpuXC->copySrcPhysAddr || cpuXC->misspeculating()); - MemReqPtr req = new MemReq(dest, cpuXC->getProxy(), 64); + FunctionalMemory *mem = thread->mem; + assert(thread->copySrcPhysAddr || thread->misspeculating()); + MemReqPtr req = new MemReq(dest, 
thread->getXCProxy(), 64); req->asid = asid; // translate to physical address - Fault fault = cpuXC->translateDataWriteReq(req); + Fault fault = cpu->translateDataWriteReq(req); if (fault == NoFault) { Addr dest_addr = req->paddr; // Need to read straight from memory since we have more than 8 bytes. - req->paddr = cpuXC->copySrcPhysAddr; + req->paddr = thread->copySrcPhysAddr; mem->read(req, data); req->paddr = dest_addr; mem->write(req, data); @@ -275,7 +326,6 @@ BaseDynInst::dump(std::string &outstring) outstring = s.str(); } - #if 0 template Fault @@ -337,6 +387,28 @@ BaseDynInst::mem_access(mem_cmd cmd, Addr addr, void *p, int nbytes) #endif +template +void +BaseDynInst::markSrcRegReady() +{ + if (++readyRegs == numSrcRegs()) { + canIssue = true; + } +} + +template +void +BaseDynInst::markSrcRegReady(RegIndex src_idx) +{ + ++readyRegs; + + _readySrcRegIdx[src_idx] = true; + + if (readyRegs == numSrcRegs()) { + canIssue = true; + } +} + template bool BaseDynInst::eaSrcsReady() @@ -345,8 +417,7 @@ BaseDynInst::eaSrcsReady() // EA calc depends on. (i.e. 
src reg 0 is the source of the data to be // stored) - for (int i = 1; i < numSrcRegs(); ++i) - { + for (int i = 1; i < numSrcRegs(); ++i) { if (!_readySrcRegIdx[i]) return false; } @@ -361,4 +432,16 @@ template <> int BaseDynInst::instcount = 0; -#endif // __CPU_BASE_DYN_INST_CC__ +// Forward declaration +template class BaseDynInst; + +template <> +int +BaseDynInst::instcount = 0; + +// Forward declaration +template class BaseDynInst; + +template <> +int +BaseDynInst::instcount = 0; diff --git a/cpu/base_dyn_inst.hh b/cpu/base_dyn_inst.hh index 3a7852f79..ecad6ad64 100644 --- a/cpu/base_dyn_inst.hh +++ b/cpu/base_dyn_inst.hh @@ -29,21 +29,24 @@ #ifndef __CPU_BASE_DYN_INST_HH__ #define __CPU_BASE_DYN_INST_HH__ +#include #include -#include #include "base/fast_alloc.hh" #include "base/trace.hh" #include "config/full_system.hh" #include "cpu/exetrace.hh" #include "cpu/inst_seq.hh" -#include "cpu/o3/comm.hh" #include "cpu/static_inst.hh" -#include "encumbered/cpu/full/bpred_update.hh" #include "encumbered/cpu/full/op_class.hh" +#include "mem/functional/memory_control.hh" +#include "sim/system.hh" +/* +#include "encumbered/cpu/full/bpred_update.hh" #include "encumbered/cpu/full/spec_memory.hh" #include "encumbered/cpu/full/spec_state.hh" #include "encumbered/mem/functional/main.hh" +*/ /** * @file @@ -59,20 +62,29 @@ class BaseDynInst : public FastAlloc, public RefCounted public: // Typedef for the CPU. typedef typename Impl::FullCPU FullCPU; + typedef typename FullCPU::ImplState ImplState; - /// Binary machine instruction type. + // Binary machine instruction type. typedef TheISA::MachInst MachInst; - /// Logical register index type. + // Extended machine instruction type + typedef TheISA::ExtMachInst ExtMachInst; + // Logical register index type. typedef TheISA::RegIndex RegIndex; - /// Integer register index type. + // Integer register index type. typedef TheISA::IntReg IntReg; + // The DynInstPtr type. 
+ typedef typename Impl::DynInstPtr DynInstPtr; + + // The list of instructions iterator type. + typedef typename std::list::iterator ListIt; + enum { - MaxInstSrcRegs = TheISA::MaxInstSrcRegs, //< Max source regs - MaxInstDestRegs = TheISA::MaxInstDestRegs, //< Max dest regs + MaxInstSrcRegs = TheISA::MaxInstSrcRegs, /// Max source regs + MaxInstDestRegs = TheISA::MaxInstDestRegs, /// Max dest regs }; - /** The static inst used by this dyn inst. */ + /** The StaticInst used by this BaseDynInst. */ StaticInstPtr staticInst; //////////////////////////////////////////// @@ -80,11 +92,27 @@ class BaseDynInst : public FastAlloc, public RefCounted // INSTRUCTION EXECUTION // //////////////////////////////////////////// + /** InstRecord that tracks this instructions. */ Trace::InstRecord *traceData; + /** + * Does a read to a given address. + * @param addr The address to read. + * @param data The read's data is written into this parameter. + * @param flags The request's flags. + * @return Returns any fault due to the read. + */ template Fault read(Addr addr, T &data, unsigned flags); + /** + * Does a write to a given address. + * @param data The data to be written. + * @param addr The address to write to. + * @param flags The request's flags. + * @param res The result of the write (for load locked/store conditionals). + * @return Returns any fault due to the write. + */ template Fault write(T data, Addr addr, unsigned flags, uint64_t *res); @@ -96,14 +124,17 @@ class BaseDynInst : public FastAlloc, public RefCounted /** @todo: Consider making this private. */ public: - /** Is this instruction valid. */ - bool valid; - /** The sequence number of the instruction. */ InstSeqNum seqNum; - /** How many source registers are ready. */ - unsigned readyRegs; + /** Is the instruction in the IQ */ + bool iqEntry; + + /** Is the instruction in the ROB */ + bool robEntry; + + /** Is the instruction in the LSQ */ + bool lsqEntry; /** Is the instruction completed. 
*/ bool completed; @@ -120,12 +151,21 @@ class BaseDynInst : public FastAlloc, public RefCounted /** Can this instruction commit. */ bool canCommit; + /** Is this instruction committed. */ + bool committed; + /** Is this instruction squashed. */ bool squashed; /** Is this instruction squashed in the instruction queue. */ bool squashedInIQ; + /** Is this instruction squashed in the instruction queue. */ + bool squashedInLSQ; + + /** Is this instruction squashed in the instruction queue. */ + bool squashedInROB; + /** Is this a recover instruction. */ bool recoverInst; @@ -141,15 +181,21 @@ class BaseDynInst : public FastAlloc, public RefCounted /** data address space ID, for loads & stores. */ short asid; + /** How many source registers are ready. */ + unsigned readyRegs; + /** Pointer to the FullCPU object. */ FullCPU *cpu; /** Pointer to the exec context. Will not exist in the final version. */ - CPUExecContext *cpuXC; + ImplState *thread; /** The kind of fault this instruction has generated. */ Fault fault; + /** The memory request. */ + MemReqPtr req; + /** The effective virtual address (lds & stores only). */ Addr effAddr; @@ -197,17 +243,29 @@ class BaseDynInst : public FastAlloc, public RefCounted /** Count of total number of dynamic instructions. */ static int instcount; - /** Whether or not the source register is ready. Not sure this should be - * here vs. the derived class. +#ifdef DEBUG + void dumpSNList(); +#endif + + /** Whether or not the source register is ready. + * @todo: Not sure this should be here vs the derived class. */ bool _readySrcRegIdx[MaxInstSrcRegs]; public: - /** BaseDynInst constructor given a binary instruction. */ - BaseDynInst(MachInst inst, Addr PC, Addr Pred_PC, InstSeqNum seq_num, + /** BaseDynInst constructor given a binary instruction. + * @param inst The binary instruction. + * @param PC The PC of the instruction. + * @param pred_PC The predicted next PC. + * @param seq_num The sequence number of the instruction. 
+ * @param cpu Pointer to the instruction's CPU. + */ + BaseDynInst(ExtMachInst inst, Addr PC, Addr pred_PC, InstSeqNum seq_num, FullCPU *cpu); - /** BaseDynInst constructor given a static inst pointer. */ + /** BaseDynInst constructor given a StaticInst pointer. + * @param _staticInst The StaticInst for this BaseDynInst. + */ BaseDynInst(StaticInstPtr &_staticInst); /** BaseDynInst destructor. */ @@ -218,12 +276,20 @@ class BaseDynInst : public FastAlloc, public RefCounted void initVars(); public: + /** + * @todo: Make this function work; currently it is a dummy function. + * @param fault Last fault. + * @param cmd Last command. + * @param addr Virtual address of access. + * @param p Memory accessed. + * @param nbytes Access size. + */ void - trace_mem(Fault fault, // last fault - MemCmd cmd, // last command - Addr addr, // virtual address of access - void *p, // memory accessed - int nbytes); // access size + trace_mem(Fault fault, + MemCmd cmd, + Addr addr, + void *p, + int nbytes); /** Dumps out contents of this BaseDynInst. */ void dump(); @@ -237,6 +303,7 @@ class BaseDynInst : public FastAlloc, public RefCounted /** Checks whether or not this instruction has had its branch target * calculated yet. For now it is not utilized and is hacked to be * always false. + * @todo: Actually use this instruction. */ bool doneTargCalc() { return false; } @@ -252,12 +319,10 @@ class BaseDynInst : public FastAlloc, public RefCounted Addr readPredTarg() { return predPC; } /** Returns whether the instruction was predicted taken or not. */ - bool predTaken() { - return( predPC != (PC + sizeof(MachInst) ) ); - } + bool predTaken() { return predPC != (PC + sizeof(MachInst)); } /** Returns whether the instruction mispredicted. */ - bool mispredicted() { return (predPC != nextPC); } + bool mispredicted() { return predPC != nextPC; } // // Instruction types. Forward checks to StaticInst object. 
@@ -280,9 +345,51 @@ class BaseDynInst : public FastAlloc, public RefCounted bool isUncondCtrl() const { return staticInst->isUncondCtrl(); } bool isThreadSync() const { return staticInst->isThreadSync(); } bool isSerializing() const { return staticInst->isSerializing(); } + bool isSerializeBefore() const + { return staticInst->isSerializeBefore() || serializeBefore; } + bool isSerializeAfter() const + { return staticInst->isSerializeAfter() || serializeAfter; } bool isMemBarrier() const { return staticInst->isMemBarrier(); } bool isWriteBarrier() const { return staticInst->isWriteBarrier(); } bool isNonSpeculative() const { return staticInst->isNonSpeculative(); } + bool isQuiesce() const { return staticInst->isQuiesce(); } + + /** Temporarily sets this instruction as a serialize before instruction. */ + void setSerializeBefore() { serializeBefore = true; } + + /** Clears the serializeBefore part of this instruction. */ + void clearSerializeBefore() { serializeBefore = false; } + + /** Checks if this serializeBefore is only temporarily set. */ + bool isTempSerializeBefore() { return serializeBefore; } + + /** Tracks if instruction has been externally set as serializeBefore. */ + bool serializeBefore; + + /** Temporarily sets this instruction as a serialize after instruction. */ + void setSerializeAfter() { serializeAfter = true; } + + /** Clears the serializeAfter part of this instruction.*/ + void clearSerializeAfter() { serializeAfter = false; } + + /** Checks if this serializeAfter is only temporarily set. */ + bool isTempSerializeAfter() { return serializeAfter; } + + /** Tracks if instruction has been externally set as serializeAfter. */ + bool serializeAfter; + + /** Checks if the serialization part of this instruction has been + * handled. This does not apply to the temporary serializing + * state; it only applies to this instruction's own permanent + * serializing state. 
+ */ + bool isSerializeHandled() { return serializeHandled; } + + /** Sets the serialization part of this instruction as handled. */ + void setSerializeHandled() { serializeHandled = true; } + + /** Whether or not the serialization of this instruction has been handled. */ + bool serializeHandled; /** Returns the opclass of this instruction. */ OpClass opClass() const { return staticInst->opClass(); } @@ -290,10 +397,10 @@ class BaseDynInst : public FastAlloc, public RefCounted /** Returns the branch target address. */ Addr branchTarget() const { return staticInst->branchTarget(PC); } - /** Number of source registers. */ - int8_t numSrcRegs() const { return staticInst->numSrcRegs(); } + /** Returns the number of source registers. */ + int8_t numSrcRegs() const { return staticInst->numSrcRegs(); } - /** Number of destination registers. */ + /** Returns the number of destination registers. */ int8_t numDestRegs() const { return staticInst->numDestRegs(); } // the following are used to track physical register usage @@ -302,16 +409,10 @@ class BaseDynInst : public FastAlloc, public RefCounted int8_t numIntDestRegs() const { return staticInst->numIntDestRegs(); } /** Returns the logical register index of the i'th destination register. */ - RegIndex destRegIdx(int i) const - { - return staticInst->destRegIdx(i); - } + RegIndex destRegIdx(int i) const { return staticInst->destRegIdx(i); } /** Returns the logical register index of the i'th source register. */ - RegIndex srcRegIdx(int i) const - { - return staticInst->srcRegIdx(i); - } + RegIndex srcRegIdx(int i) const { return staticInst->srcRegIdx(i); } /** Returns the result of an integer instruction. */ uint64_t readIntResult() { return instResult.integer; } @@ -324,27 +425,12 @@ class BaseDynInst : public FastAlloc, public RefCounted //Push to .cc file. /** Records that one of the source registers is ready. 
*/ - void markSrcRegReady() - { - ++readyRegs; - if(readyRegs == numSrcRegs()) { - canIssue = true; - } - } + void markSrcRegReady(); /** Marks a specific register as ready. * @todo: Move this to .cc file. */ - void markSrcRegReady(RegIndex src_idx) - { - ++readyRegs; - - _readySrcRegIdx[src_idx] = 1; - - if(readyRegs == numSrcRegs()) { - canIssue = true; - } - } + void markSrcRegReady(RegIndex src_idx); /** Returns if a source register is ready. */ bool isReadySrcRegIdx(int idx) const @@ -355,7 +441,7 @@ class BaseDynInst : public FastAlloc, public RefCounted /** Sets this instruction as completed. */ void setCompleted() { completed = true; } - /** Returns whethe or not this instruction is completed. */ + /** Returns whether or not this instruction is completed. */ bool isCompleted() const { return completed; } /** Sets this instruction as ready to issue. */ @@ -385,34 +471,94 @@ class BaseDynInst : public FastAlloc, public RefCounted /** Returns whether or not this instruction is ready to commit. */ bool readyToCommit() const { return canCommit; } + /** Sets this instruction as committed. */ + void setCommitted() { committed = true; } + + /** Returns whether or not this instruction is committed. */ + bool isCommitted() const { return committed; } + /** Sets this instruction as squashed. */ void setSquashed() { squashed = true; } /** Returns whether or not this instruction is squashed. */ bool isSquashed() const { return squashed; } + //Instruction Queue Entry + //----------------------- + /** Sets this instruction as a entry the IQ. */ + void setInIQ() { iqEntry = true; } + + /** Sets this instruction as a entry the IQ. */ + void removeInIQ() { iqEntry = false; } + /** Sets this instruction as squashed in the IQ. */ - void setSquashedInIQ() { squashedInIQ = true; } + void setSquashedInIQ() { squashedInIQ = true; squashed = true;} /** Returns whether or not this instruction is squashed in the IQ. 
*/ bool isSquashedInIQ() const { return squashedInIQ; } + /** Returns whether or not this instruction has issued. */ + bool isInIQ() const { return iqEntry; } + + + //Load / Store Queue Functions + //----------------------- + /** Sets this instruction as a entry the LSQ. */ + void setInLSQ() { lsqEntry = true; } + + /** Sets this instruction as a entry the LSQ. */ + void removeInLSQ() { lsqEntry = false; } + + /** Sets this instruction as squashed in the LSQ. */ + void setSquashedInLSQ() { squashedInLSQ = true;} + + /** Returns whether or not this instruction is squashed in the LSQ. */ + bool isSquashedInLSQ() const { return squashedInLSQ; } + + /** Returns whether or not this instruction is in the LSQ. */ + bool isInLSQ() const { return lsqEntry; } + + + //Reorder Buffer Functions + //----------------------- + /** Sets this instruction as a entry the ROB. */ + void setInROB() { robEntry = true; } + + /** Sets this instruction as a entry the ROB. */ + void removeInROB() { robEntry = false; } + + /** Sets this instruction as squashed in the ROB. */ + void setSquashedInROB() { squashedInROB = true; } + + /** Returns whether or not this instruction is squashed in the ROB. */ + bool isSquashedInROB() const { return squashedInROB; } + + /** Returns whether or not this instruction is in the ROB. */ + bool isInROB() const { return robEntry; } + /** Read the PC of this instruction. */ const Addr readPC() const { return PC; } /** Set the next PC of this instruction (its actual target). */ void setNextPC(uint64_t val) { nextPC = val; } + void setASID(short addr_space_id) { asid = addr_space_id; } + + void setThread(unsigned tid) { threadNumber = tid; } + + void setState(ImplState *state) { thread = state; } + /** Returns the exec context. * @todo: Remove this once the ExecContext is no longer used. */ - ExecContext *xcBase() { return cpuXC->getProxy(); } + ExecContext *xcBase() { return thread->getXCProxy(); } private: /** Instruction effective address. 
* @todo: Consider if this is necessary or not. */ Addr instEffAddr; + /** Whether or not the effective address calculation is completed. * @todo: Consider if this is necessary or not. */ @@ -423,7 +569,7 @@ class BaseDynInst : public FastAlloc, public RefCounted void setEA(Addr &ea) { instEffAddr = ea; eaCalcDone = true; } /** Returns the effective address. */ - const Addr &getEA() const { return instEffAddr; } + const Addr &getEA() const { return req->vaddr; } /** Returns whether or not the eff. addr. calculation has been completed. */ bool doneEACalc() { return eaCalcDone; } @@ -431,12 +577,26 @@ class BaseDynInst : public FastAlloc, public RefCounted /** Returns whether or not the eff. addr. source registers are ready. */ bool eaSrcsReady(); + /** Whether or not the memory operation is done. */ + bool memOpDone; + public: /** Load queue index. */ int16_t lqIdx; /** Store queue index. */ int16_t sqIdx; + + bool reachedCommit; + + /** Iterator pointing to this BaseDynInst in the list of all insts. */ + ListIt instListIt; + + /** Returns iterator to this instruction in the list of all insts. */ + ListIt &getInstListIt() { return instListIt; } + + /** Sets iterator for this instruction in the list of all insts. 
*/ + void setInstListIt(ListIt _instListIt) { instListIt = _instListIt; } }; template @@ -444,34 +604,47 @@ template inline Fault BaseDynInst::read(Addr addr, T &data, unsigned flags) { - MemReqPtr req = new MemReq(addr, cpuXC->getProxy(), sizeof(T), flags); + if (executed) { + fault = cpu->read(req, data, lqIdx); + return fault; + } + + req = new MemReq(addr, thread->getXCProxy(), sizeof(T), flags); req->asid = asid; + req->thread_num = threadNumber; + req->pc = this->PC; + + if ((req->vaddr & (TheISA::VMPageSize - 1)) + req->size > + TheISA::VMPageSize) { + return TheISA::genAlignmentFault(); + } fault = cpu->translateDataReadReq(req); - // Record key MemReq parameters so we can generate another one - // just like it for the timing access without calling translate() - // again (which might mess up the TLB). - // Do I ever really need this? -KTL 3/05 effAddr = req->vaddr; physEffAddr = req->paddr; memReqFlags = req->flags; - /** - * @todo - * Replace the disjoint functional memory with a unified one and remove - * this hack. - */ -#if !FULL_SYSTEM - req->paddr = req->vaddr; -#endif - if (fault == NoFault) { +#if FULL_SYSTEM + if (cpu->system->memctrl->badaddr(physEffAddr)) { + fault = TheISA::genMachineCheckFault(); + data = (T)-1; + this->setExecuted(); + } else { + fault = cpu->read(req, data, lqIdx); + } +#else fault = cpu->read(req, data, lqIdx); +#endif } else { // Return a fixed value to keep simulation deterministic even // along misspeculated paths. data = (T)-1; + + // Commit will have to clean up whatever happened. Set this + // instruction as executed. 
+ this->setExecuted(); } if (traceData) { @@ -492,30 +665,33 @@ BaseDynInst::write(T data, Addr addr, unsigned flags, uint64_t *res) traceData->setData(data); } - MemReqPtr req = new MemReq(addr, cpuXC->getProxy(), sizeof(T), flags); + req = new MemReq(addr, thread->getXCProxy(), sizeof(T), flags); req->asid = asid; + req->thread_num = threadNumber; + req->pc = this->PC; + + if ((req->vaddr & (TheISA::VMPageSize - 1)) + req->size > + TheISA::VMPageSize) { + return TheISA::genAlignmentFault(); + } fault = cpu->translateDataWriteReq(req); - // Record key MemReq parameters so we can generate another one - // just like it for the timing access without calling translate() - // again (which might mess up the TLB). effAddr = req->vaddr; physEffAddr = req->paddr; memReqFlags = req->flags; - /** - * @todo - * Replace the disjoint functional memory with a unified one and remove - * this hack. - */ -#if !FULL_SYSTEM - req->paddr = req->vaddr; -#endif - if (fault == NoFault) { +#if FULL_SYSTEM + if (cpu->system->memctrl->badaddr(physEffAddr)) { + fault = TheISA::genMachineCheckFault(); + } else { + fault = cpu->write(req, data, sqIdx); + } +#else fault = cpu->write(req, data, sqIdx); +#endif } if (res) { diff --git a/cpu/o3/2bit_local_pred.cc b/cpu/o3/2bit_local_pred.cc index d9744eec7..458fbd663 100644 --- a/cpu/o3/2bit_local_pred.cc +++ b/cpu/o3/2bit_local_pred.cc @@ -26,6 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include "base/intmath.hh" #include "base/trace.hh" #include "cpu/o3/2bit_local_pred.hh" @@ -36,17 +37,25 @@ DefaultBP::DefaultBP(unsigned _localPredictorSize, localCtrBits(_localCtrBits), instShiftAmt(_instShiftAmt) { - // Should do checks here to make sure sizes are correct (powers of 2). 
+ if (!isPowerOf2(localPredictorSize)) { + fatal("Invalid local predictor size!\n"); + } + + localPredictorSets = localPredictorSize / localCtrBits; + + if (!isPowerOf2(localPredictorSets)) { + fatal("Invalid number of local predictor sets! Check localCtrBits.\n"); + } // Setup the index mask. - indexMask = localPredictorSize - 1; + indexMask = localPredictorSets - 1; DPRINTF(Fetch, "Branch predictor: index mask: %#x\n", indexMask); // Setup the array of counters for the local predictor. - localCtrs = new SatCounter[localPredictorSize]; + localCtrs.resize(localPredictorSets); - for (int i = 0; i < localPredictorSize; ++i) + for (int i = 0; i < localPredictorSets; ++i) localCtrs[i].setBits(_localCtrBits); DPRINTF(Fetch, "Branch predictor: local predictor size: %i\n", @@ -68,8 +77,6 @@ DefaultBP::lookup(Addr &branch_addr) DPRINTF(Fetch, "Branch predictor: Looking up index %#x\n", local_predictor_idx); - assert(local_predictor_idx < localPredictorSize); - local_prediction = localCtrs[local_predictor_idx].read(); DPRINTF(Fetch, "Branch predictor: prediction is %i.\n", @@ -102,8 +109,6 @@ DefaultBP::update(Addr &branch_addr, bool taken) DPRINTF(Fetch, "Branch predictor: Looking up index %#x\n", local_predictor_idx); - assert(local_predictor_idx < localPredictorSize); - if (taken) { DPRINTF(Fetch, "Branch predictor: Branch updated as taken.\n"); localCtrs[local_predictor_idx].increment(); diff --git a/cpu/o3/2bit_local_pred.hh b/cpu/o3/2bit_local_pred.hh index 97433e542..38d3f4842 100644 --- a/cpu/o3/2bit_local_pred.hh +++ b/cpu/o3/2bit_local_pred.hh @@ -26,18 +26,23 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __CPU_O3_CPU_2BIT_LOCAL_PRED_HH__ -#define __CPU_O3_CPU_2BIT_LOCAL_PRED_HH__ +#ifndef __CPU_O3_2BIT_LOCAL_PRED_HH__ +#define __CPU_O3_2BIT_LOCAL_PRED_HH__ // For Addr type. #include "arch/isa_traits.hh" #include "cpu/o3/sat_counter.hh" +#include + class DefaultBP { public: /** * Default branch predictor constructor. 
+ * @param localPredictorSize Size of the local predictor. + * @param localCtrBits Number of bits per counter. + * @param instShiftAmt Offset amount for instructions to ignore alignment. */ DefaultBP(unsigned localPredictorSize, unsigned localCtrBits, unsigned instShiftAmt); @@ -59,8 +64,11 @@ class DefaultBP private: - /** Returns the taken/not taken prediction given the value of the + /** + * Returns the taken/not taken prediction given the value of the * counter. + * @param count The value of the counter. + * @return The prediction based on the counter value. */ inline bool getPrediction(uint8_t &count); @@ -68,11 +76,14 @@ class DefaultBP inline unsigned getLocalIndex(Addr &PC); /** Array of counters that make up the local predictor. */ - SatCounter *localCtrs; + std::vector localCtrs; /** Size of the local predictor. */ unsigned localPredictorSize; + /** Number of sets. */ + unsigned localPredictorSets; + /** Number of bits of the local predictor's counters. */ unsigned localCtrBits; @@ -83,4 +94,4 @@ class DefaultBP unsigned indexMask; }; -#endif // __CPU_O3_CPU_2BIT_LOCAL_PRED_HH__ +#endif // __CPU_O3_2BIT_LOCAL_PRED_HH__ diff --git a/cpu/o3/alpha_cpu.hh b/cpu/o3/alpha_cpu.hh index 0352e9972..68e149e77 100644 --- a/cpu/o3/alpha_cpu.hh +++ b/cpu/o3/alpha_cpu.hh @@ -26,14 +26,12 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -// Todo: Find all the stuff in ExecContext and ev5 that needs to be -// specifically designed for this CPU. 
+#ifndef __CPU_O3_ALPHA_FULL_CPU_HH__ +#define __CPU_O3_ALPHA_FULL_CPU_HH__ -#ifndef __CPU_O3_CPU_ALPHA_FULL_CPU_HH__ -#define __CPU_O3_CPU_ALPHA_FULL_CPU_HH__ - -#include "cpu/o3/cpu.hh" #include "arch/isa_traits.hh" +#include "cpu/exec_context.hh" +#include "cpu/o3/cpu.hh" #include "sim/byteswap.hh" template @@ -46,17 +44,175 @@ class AlphaFullCPU : public FullO3CPU typedef TheISA::MiscRegFile MiscRegFile; public: + typedef O3ThreadState ImplState; + typedef O3ThreadState Thread; typedef typename Impl::Params Params; - public: - AlphaFullCPU(Params ¶ms); + /** Constructs an AlphaFullCPU with the given parameters. */ + AlphaFullCPU(Params *params); + + class AlphaXC : public ExecContext + { + public: + AlphaFullCPU *cpu; + + O3ThreadState *thread; + + Tick lastActivate; + Tick lastSuspend; + + Event *quiesceEvent; + + virtual BaseCPU *getCpuPtr() { return cpu; } + + virtual void setCpuId(int id) { cpu->cpu_id = id; } + + virtual int readCpuId() { return cpu->cpu_id; } + + virtual FunctionalMemory *getMemPtr() { return thread->mem; } #if FULL_SYSTEM + virtual System *getSystemPtr() { return cpu->system; } + + virtual PhysicalMemory *getPhysMemPtr() { return cpu->physmem; } + + virtual AlphaITB *getITBPtr() { return cpu->itb; } + + virtual AlphaDTB * getDTBPtr() { return cpu->dtb; } +#else + virtual Process *getProcessPtr() { return thread->process; } +#endif + + virtual Status status() const { return thread->status(); } + + virtual void setStatus(Status new_status) { thread->setStatus(new_status); } + + /// Set the status to Active. Optional delay indicates number of + /// cycles to wait before beginning execution. + virtual void activate(int delay = 1); + + /// Set the status to Suspended. + virtual void suspend(); + + /// Set the status to Unallocated. + virtual void deallocate(); + + /// Set the status to Halted. 
+ virtual void halt(); + +#if FULL_SYSTEM + virtual void dumpFuncProfile(); +#endif + + virtual void takeOverFrom(ExecContext *old_context); + + virtual void regStats(const std::string &name); + + virtual void serialize(std::ostream &os); + virtual void unserialize(Checkpoint *cp, const std::string §ion); + +#if FULL_SYSTEM + virtual Event *getQuiesceEvent(); + + // Not necessarily the best location for these... + // Having an extra function just to read these is obnoxious + virtual Tick readLastActivate(); + virtual Tick readLastSuspend(); + + virtual void profileClear(); + virtual void profileSample(); +#endif + + virtual int getThreadNum() { return thread->tid; } + + // Also somewhat obnoxious. Really only used for the TLB fault. + // However, may be quite useful in SPARC. + virtual TheISA::MachInst getInst(); + + virtual void copyArchRegs(ExecContext *xc); + + virtual void clearArchRegs(); + + // + // New accessors for new decoder. + // + virtual uint64_t readIntReg(int reg_idx); + + virtual float readFloatRegSingle(int reg_idx); + + virtual double readFloatRegDouble(int reg_idx); + + virtual uint64_t readFloatRegInt(int reg_idx); + + virtual void setIntReg(int reg_idx, uint64_t val); + + virtual void setFloatRegSingle(int reg_idx, float val); + + virtual void setFloatRegDouble(int reg_idx, double val); + + virtual void setFloatRegInt(int reg_idx, uint64_t val); + + virtual uint64_t readPC() + { return cpu->readPC(thread->tid); } + + virtual void setPC(uint64_t val); + + virtual uint64_t readNextPC() + { return cpu->readNextPC(thread->tid); } + + virtual void setNextPC(uint64_t val); + + virtual MiscReg readMiscReg(int misc_reg) + { return cpu->readMiscReg(misc_reg, thread->tid); } + + virtual MiscReg readMiscRegWithEffect(int misc_reg, Fault &fault) + { return cpu->readMiscRegWithEffect(misc_reg, fault, thread->tid); } + + virtual Fault setMiscReg(int misc_reg, const MiscReg &val); + + virtual Fault setMiscRegWithEffect(int misc_reg, const MiscReg &val); + + 
// Also not necessarily the best location for these two. + // Hopefully will go away once we decide upon where st cond + // failures goes. + virtual unsigned readStCondFailures() { return thread->storeCondFailures; } + + virtual void setStCondFailures(unsigned sc_failures) { thread->storeCondFailures = sc_failures; } + +#if FULL_SYSTEM + virtual bool inPalMode() { return TheISA::PcPAL(cpu->readPC(thread->tid)); } +#endif + + // Only really makes sense for old CPU model. Still could be useful though. + virtual bool misspeculating() { return false; } + +#if !FULL_SYSTEM + virtual IntReg getSyscallArg(int i); + + // used to shift args for indirect syscall + virtual void setSyscallArg(int i, IntReg val); + + virtual void setSyscallReturn(SyscallReturn return_value); + + virtual void syscall() { return cpu->syscall(thread->tid); } + + // Same with st cond failures. + virtual Counter readFuncExeInst() { return thread->funcExeInst; } +#endif + }; + + friend class AlphaXC; + + std::vector xcProxies; + +#if FULL_SYSTEM + /** ITB pointer. */ AlphaITB *itb; + /** DTB pointer. */ AlphaDTB *dtb; #endif - public: + /** Registers statistics. */ void regStats(); #if FULL_SYSTEM @@ -67,16 +223,19 @@ class AlphaFullCPU : public FullO3CPU // void clear_interrupt(int int_num, int index); // void clear_interrupts(); + /** Translates instruction requestion. */ Fault translateInstReq(MemReqPtr &req) { return itb->translate(req); } + /** Translates data read request. */ Fault translateDataReadReq(MemReqPtr &req) { return dtb->translate(req, false); } + /** Translates data write request. */ Fault translateDataWriteReq(MemReqPtr &req) { return dtb->translate(req, true); @@ -95,16 +254,19 @@ class AlphaFullCPU : public FullO3CPU return NoFault; } + /** Translates instruction requestion in syscall emulation mode. */ Fault translateInstReq(MemReqPtr &req) { return dummyTranslation(req); } + /** Translates data read request in syscall emulation mode. 
*/ Fault translateDataReadReq(MemReqPtr &req) { return dummyTranslation(req); } + /** Translates data write request in syscall emulation mode. */ Fault translateDataWriteReq(MemReqPtr &req) { return dummyTranslation(req); @@ -113,36 +275,36 @@ class AlphaFullCPU : public FullO3CPU #endif // Later on may want to remove this misc stuff from the regfile and - // have it handled at this level. Might prove to be an issue when + // have it handled at this level. This would be similar to moving certain + // IPRs into the devices themselves. Might prove to be an issue when // trying to rename source/destination registers... - MiscReg readMiscReg(int misc_reg) - { - // Dummy function for now. - // @todo: Fix this once reg file gets fixed. - return 0; - } + MiscReg readMiscReg(int misc_reg, unsigned tid); - Fault setMiscReg(int misc_reg, const MiscReg &val) - { - // Dummy function for now. - // @todo: Fix this once reg file gets fixed. - return NoFault; - } + MiscReg readMiscRegWithEffect(int misc_reg, Fault &fault, unsigned tid); + + Fault setMiscReg(int misc_reg, const MiscReg &val, unsigned tid); + + Fault setMiscRegWithEffect(int misc_reg, const MiscReg &val, unsigned tid); + + void squashFromXC(unsigned tid); - // Most of the full system code and syscall emulation is not yet - // implemented. These functions do show what the final interface will - // look like. #if FULL_SYSTEM + void post_interrupt(int int_num, int index); + int readIntrFlag(); + /** Sets the interrupt flags. */ void setIntrFlag(int val); - Fault hwrei(); - bool inPalMode() { return AlphaISA::PcPAL(this->regFile.readPC()); } + /** HW return from error interrupt. */ + Fault hwrei(unsigned tid); + /** Returns if a specific PC is a PAL mode PC. */ bool inPalMode(uint64_t PC) { return AlphaISA::PcPAL(PC); } - void trap(Fault fault); + /** Traps to handle given fault. */ + void trap(Fault fault, unsigned tid); bool simPalCheck(int palFunc); + /** Processes any interrupts. 
*/ void processInterrupts(); #endif @@ -152,84 +314,64 @@ class AlphaFullCPU : public FullO3CPU // register. Actually, these functions should handle most of this // functionality by themselves; should look up the rename and then // set the register. - IntReg getSyscallArg(int i) - { - return this->cpuXC->readIntReg(AlphaISA::ArgumentReg0 + i); - } + /** Gets a syscall argument. */ + IntReg getSyscallArg(int i, int tid); - // used to shift args for indirect syscall - void setSyscallArg(int i, IntReg val) - { - this->cpuXC->setIntReg(AlphaISA::ArgumentReg0 + i, val); - } + /** Used to shift args for indirect syscall. */ + void setSyscallArg(int i, IntReg val, int tid); - void setSyscallReturn(int64_t return_value) - { - // check for error condition. Alpha syscall convention is to - // indicate success/failure in reg a3 (r19) and put the - // return value itself in the standard return value reg (v0). - const int RegA3 = 19; // only place this is used - if (return_value >= 0) { - // no error - this->cpuXC->setIntReg(RegA3, 0); - this->cpuXC->setIntReg(AlphaISA::ReturnValueReg, return_value); - } else { - // got an error, return details - this->cpuXC->setIntReg(RegA3, (IntReg) -1); - this->cpuXC->setIntReg(AlphaISA::ReturnValueReg, -return_value); - } - } + /** Sets the return value of a syscall. */ + void setSyscallReturn(SyscallReturn return_value, int tid); - void syscall(short thread_num); - void squashStages(); + /** Executes a syscall. + * @todo: Determine if this needs to be virtual. + */ + virtual void syscall(int thread_num); #endif - void copyToXC(); - void copyFromXC(); - public: #if FULL_SYSTEM - bool palShadowEnabled; - - // Not sure this is used anywhere. - void intr_post(RegFile *regs, Fault fault, Addr pc); - // Actually used within exec files. Implement properly. - void swapPALShadow(bool use_shadow); - // Called by CPU constructor. Can implement as I please. - void initCPU(RegFile *regs); - // Called by initCPU. Implement as I please. 
- void initIPRs(RegFile *regs); - + /** Halts the CPU. */ void halt() { panic("Halt not implemented!\n"); } #endif - + /** Old CPU read from memory function. No longer used. */ template Fault read(MemReqPtr &req, T &data) { +// panic("CPU READ NOT IMPLEMENTED W/NEW MEMORY\n"); +#if 0 #if FULL_SYSTEM && defined(TARGET_ALPHA) if (req->flags & LOCKED) { req->xc->setMiscReg(TheISA::Lock_Addr_DepTag, req->paddr); req->xc->setMiscReg(TheISA::Lock_Flag_DepTag, true); } #endif - +#endif Fault error; + if (req->flags & LOCKED) { + lockAddr = req->paddr; + lockFlag = true; + } + error = this->mem->read(req, data); data = gtoh(data); return error; } + /** CPU read function, forwards read to LSQ. */ template Fault read(MemReqPtr &req, T &data, int load_idx) { return this->iew.ldstQueue.read(req, data, load_idx); } + /** Old CPU write to memory function. No longer used. */ template Fault write(MemReqPtr &req, T &data) { +#if 0 #if FULL_SYSTEM && defined(TARGET_ALPHA) ExecContext *xc; @@ -276,16 +418,32 @@ class AlphaFullCPU : public FullO3CPU } #endif +#endif + + if (req->flags & LOCKED) { + if (req->flags & UNCACHEABLE) { + req->result = 2; + } else { + if (this->lockFlag/* && this->lockAddr == req->paddr*/) { + req->result=1; + } else { + req->result = 0; + } + } + } return this->mem->write(req, (T)htog(data)); } + /** CPU write function, forwards write to LSQ. */ template Fault write(MemReqPtr &req, T &data, int store_idx) { return this->iew.ldstQueue.write(req, data, store_idx); } + Addr lockAddr; + bool lockFlag; }; -#endif // __CPU_O3_CPU_ALPHA_FULL_CPU_HH__ +#endif // __CPU_O3_ALPHA_FULL_CPU_HH__ diff --git a/cpu/o3/alpha_cpu_builder.cc b/cpu/o3/alpha_cpu_builder.cc index 6025b8ef2..d676a69c1 100644 --- a/cpu/o3/alpha_cpu_builder.cc +++ b/cpu/o3/alpha_cpu_builder.cc @@ -26,39 +26,20 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "base/inifile.hh" -#include "base/loader/symtab.hh" -#include "base/misc.hh" +#include + #include "cpu/base.hh" -#include "cpu/exetrace.hh" #include "cpu/o3/alpha_cpu.hh" #include "cpu/o3/alpha_impl.hh" -#include "mem/base_mem.hh" +#include "cpu/o3/alpha_params.hh" +#include "cpu/o3/fu_pool.hh" #include "mem/cache/base_cache.hh" -#include "mem/mem_interface.hh" #include "sim/builder.hh" -#include "sim/debug.hh" -#include "sim/host.hh" -#include "sim/process.hh" -#include "sim/sim_events.hh" -#include "sim/sim_object.hh" -#include "sim/stats.hh" - -#if FULL_SYSTEM -#include "base/remote_gdb.hh" -#include "mem/functional/memory_control.hh" -#include "mem/functional/physical.hh" -#include "sim/system.hh" -#include "arch/tlb.hh" -#include "arch/vtophys.hh" -#else // !FULL_SYSTEM -#include "mem/functional/functional.hh" -#endif // FULL_SYSTEM class DerivAlphaFullCPU : public AlphaFullCPU { public: - DerivAlphaFullCPU(AlphaSimpleParams p) + DerivAlphaFullCPU(AlphaSimpleParams *p) : AlphaFullCPU(p) { } }; @@ -75,7 +56,9 @@ SimObjectParam itb; SimObjectParam dtb; #else SimObjectVectorParam workload; +//SimObjectParam page_table; #endif // FULL_SYSTEM + SimObjectParam mem; Param max_insts_any_thread; @@ -86,6 +69,8 @@ Param max_loads_all_threads; SimObjectParam icache; SimObjectParam dcache; +Param cachePorts; + Param decodeToFetchDelay; Param renameToFetchDelay; Param iewToFetchDelay; @@ -112,25 +97,22 @@ Param executeIntWidth; Param executeFloatWidth; Param executeBranchWidth; Param executeMemoryWidth; +SimObjectParam fuPool; Param iewToCommitDelay; Param renameToROBDelay; Param commitWidth; Param squashWidth; -#if 0 Param localPredictorSize; -Param localPredictorCtrBits; -#endif -Param local_predictor_size; -Param local_ctr_bits; -Param local_history_table_size; -Param local_history_bits; -Param global_predictor_size; -Param global_ctr_bits; -Param global_history_bits; -Param choice_predictor_size; -Param choice_ctr_bits; +Param localCtrBits; +Param 
localHistoryTableSize; +Param localHistoryBits; +Param globalPredictorSize; +Param globalCtrBits; +Param globalHistoryBits; +Param choicePredictorSize; +Param choiceCtrBits; Param BTBEntries; Param BTBTagSize; @@ -147,6 +129,16 @@ Param numPhysFloatRegs; Param numIQEntries; Param numROBEntries; +Param smtNumFetchingThreads; +Param smtFetchPolicy; +Param smtLSQPolicy; +Param smtLSQThreshold; +Param smtIQPolicy; +Param smtIQThreshold; +Param smtROBPolicy; +Param smtROBThreshold; +Param smtCommitPolicy; + Param instShiftAmt; Param defer_registration; @@ -168,6 +160,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivAlphaFullCPU) INIT_PARAM(dtb, "Data translation buffer"), #else INIT_PARAM(workload, "Processes to run"), +// INIT_PARAM(page_table, "Page table"), #endif // FULL_SYSTEM INIT_PARAM_DFLT(mem, "Memory", NULL), @@ -190,13 +183,14 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivAlphaFullCPU) INIT_PARAM_DFLT(icache, "L1 instruction cache", NULL), INIT_PARAM_DFLT(dcache, "L1 data cache", NULL), + INIT_PARAM_DFLT(cachePorts, "Cache Ports", 200), + INIT_PARAM(decodeToFetchDelay, "Decode to fetch delay"), INIT_PARAM(renameToFetchDelay, "Rename to fetch delay"), INIT_PARAM(iewToFetchDelay, "Issue/Execute/Writeback to fetch" "delay"), INIT_PARAM(commitToFetchDelay, "Commit to fetch delay"), INIT_PARAM(fetchWidth, "Fetch width"), - INIT_PARAM(renameToDecodeDelay, "Rename to decode delay"), INIT_PARAM(iewToDecodeDelay, "Issue/Execute/Writeback to decode" "delay"), @@ -222,6 +216,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivAlphaFullCPU) INIT_PARAM(executeFloatWidth, "Floating point execute width"), INIT_PARAM(executeBranchWidth, "Branch execute width"), INIT_PARAM(executeMemoryWidth, "Memory execute width"), + INIT_PARAM_DFLT(fuPool, "Functional unit pool", NULL), INIT_PARAM(iewToCommitDelay, "Issue/Execute/Writeback to commit " "delay"), @@ -229,20 +224,15 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivAlphaFullCPU) INIT_PARAM(commitWidth, "Commit width"), INIT_PARAM(squashWidth, "Squash width"), -#if 0 - 
INIT_PARAM(localPredictorSize, "Size of the local predictor in entries. " - "Must be a power of 2."), - INIT_PARAM(localPredictorCtrBits, "Number of bits per counter for bpred"), -#endif - INIT_PARAM(local_predictor_size, "Size of local predictor"), - INIT_PARAM(local_ctr_bits, "Bits per counter"), - INIT_PARAM(local_history_table_size, "Size of local history table"), - INIT_PARAM(local_history_bits, "Bits for the local history"), - INIT_PARAM(global_predictor_size, "Size of global predictor"), - INIT_PARAM(global_ctr_bits, "Bits per counter"), - INIT_PARAM(global_history_bits, "Bits of history"), - INIT_PARAM(choice_predictor_size, "Size of choice predictor"), - INIT_PARAM(choice_ctr_bits, "Bits of choice counters"), + INIT_PARAM(localPredictorSize, "Size of local predictor"), + INIT_PARAM(localCtrBits, "Bits per counter"), + INIT_PARAM(localHistoryTableSize, "Size of local history table"), + INIT_PARAM(localHistoryBits, "Bits for the local history"), + INIT_PARAM(globalPredictorSize, "Size of global predictor"), + INIT_PARAM(globalCtrBits, "Bits per counter"), + INIT_PARAM(globalHistoryBits, "Bits of history"), + INIT_PARAM(choicePredictorSize, "Size of choice predictor"), + INIT_PARAM(choiceCtrBits, "Bits of choice counters"), INIT_PARAM(BTBEntries, "Number of BTB entries"), INIT_PARAM(BTBTagSize, "Size of the BTB tags, in bits"), @@ -260,6 +250,16 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivAlphaFullCPU) INIT_PARAM(numIQEntries, "Number of instruction queue entries"), INIT_PARAM(numROBEntries, "Number of reorder buffer entries"), + INIT_PARAM_DFLT(smtNumFetchingThreads, "SMT Number of Fetching Threads", 1), + INIT_PARAM_DFLT(smtFetchPolicy, "SMT Fetch Policy", "SingleThread"), + INIT_PARAM_DFLT(smtLSQPolicy, "SMT LSQ Sharing Policy", "Partitioned"), + INIT_PARAM_DFLT(smtLSQThreshold,"SMT LSQ Threshold", 100), + INIT_PARAM_DFLT(smtIQPolicy, "SMT IQ Policy", "Partitioned"), + INIT_PARAM_DFLT(smtIQThreshold, "SMT IQ Threshold", 100), + INIT_PARAM_DFLT(smtROBPolicy, "SMT 
ROB Sharing Policy", "Partitioned"), + INIT_PARAM_DFLT(smtROBThreshold,"SMT ROB Threshold", 100), + INIT_PARAM_DFLT(smtCommitPolicy,"SMT Commit Fetch Policy", "RoundRobin"), + INIT_PARAM(instShiftAmt, "Number of bits to shift instructions by"), INIT_PARAM(defer_registration, "defer system registration (for sampling)"), @@ -287,101 +287,113 @@ CREATE_SIM_OBJECT(DerivAlphaFullCPU) #endif - AlphaSimpleParams params; + AlphaSimpleParams *params = new AlphaSimpleParams; - params.clock = clock; + params->clock = clock; - params.name = getInstanceName(); - params.numberOfThreads = actual_num_threads; + params->name = getInstanceName(); + params->numberOfThreads = actual_num_threads; #if FULL_SYSTEM - params.system = system; - params.cpu_id = cpu_id; - params.itb = itb; - params.dtb = dtb; + params->system = system; + params->cpu_id = cpu_id; + params->itb = itb; + params->dtb = dtb; #else - params.workload = workload; + params->workload = workload; + //@todo: change to pageTable +// params->pTable = page_table; #endif // FULL_SYSTEM - params.mem = mem; + params->mem = mem; - params.max_insts_any_thread = max_insts_any_thread; - params.max_insts_all_threads = max_insts_all_threads; - params.max_loads_any_thread = max_loads_any_thread; - params.max_loads_all_threads = max_loads_all_threads; + params->max_insts_any_thread = max_insts_any_thread; + params->max_insts_all_threads = max_insts_all_threads; + params->max_loads_any_thread = max_loads_any_thread; + params->max_loads_all_threads = max_loads_all_threads; // // Caches // - params.icacheInterface = icache ? icache->getInterface() : NULL; - params.dcacheInterface = dcache ? dcache->getInterface() : NULL; + params->icacheInterface = icache ? icache->getInterface() : NULL; + params->dcacheInterface = dcache ? 
dcache->getInterface() : NULL; + params->cachePorts = cachePorts; - params.decodeToFetchDelay = decodeToFetchDelay; - params.renameToFetchDelay = renameToFetchDelay; - params.iewToFetchDelay = iewToFetchDelay; - params.commitToFetchDelay = commitToFetchDelay; - params.fetchWidth = fetchWidth; + params->decodeToFetchDelay = decodeToFetchDelay; + params->renameToFetchDelay = renameToFetchDelay; + params->iewToFetchDelay = iewToFetchDelay; + params->commitToFetchDelay = commitToFetchDelay; + params->fetchWidth = fetchWidth; - params.renameToDecodeDelay = renameToDecodeDelay; - params.iewToDecodeDelay = iewToDecodeDelay; - params.commitToDecodeDelay = commitToDecodeDelay; - params.fetchToDecodeDelay = fetchToDecodeDelay; - params.decodeWidth = decodeWidth; + params->renameToDecodeDelay = renameToDecodeDelay; + params->iewToDecodeDelay = iewToDecodeDelay; + params->commitToDecodeDelay = commitToDecodeDelay; + params->fetchToDecodeDelay = fetchToDecodeDelay; + params->decodeWidth = decodeWidth; - params.iewToRenameDelay = iewToRenameDelay; - params.commitToRenameDelay = commitToRenameDelay; - params.decodeToRenameDelay = decodeToRenameDelay; - params.renameWidth = renameWidth; + params->iewToRenameDelay = iewToRenameDelay; + params->commitToRenameDelay = commitToRenameDelay; + params->decodeToRenameDelay = decodeToRenameDelay; + params->renameWidth = renameWidth; - params.commitToIEWDelay = commitToIEWDelay; - params.renameToIEWDelay = renameToIEWDelay; - params.issueToExecuteDelay = issueToExecuteDelay; - params.issueWidth = issueWidth; - params.executeWidth = executeWidth; - params.executeIntWidth = executeIntWidth; - params.executeFloatWidth = executeFloatWidth; - params.executeBranchWidth = executeBranchWidth; - params.executeMemoryWidth = executeMemoryWidth; + params->commitToIEWDelay = commitToIEWDelay; + params->renameToIEWDelay = renameToIEWDelay; + params->issueToExecuteDelay = issueToExecuteDelay; + params->issueWidth = issueWidth; + params->executeWidth = 
executeWidth; + params->executeIntWidth = executeIntWidth; + params->executeFloatWidth = executeFloatWidth; + params->executeBranchWidth = executeBranchWidth; + params->executeMemoryWidth = executeMemoryWidth; + params->fuPool = fuPool; - params.iewToCommitDelay = iewToCommitDelay; - params.renameToROBDelay = renameToROBDelay; - params.commitWidth = commitWidth; - params.squashWidth = squashWidth; -#if 0 - params.localPredictorSize = localPredictorSize; - params.localPredictorCtrBits = localPredictorCtrBits; -#endif - params.local_predictor_size = local_predictor_size; - params.local_ctr_bits = local_ctr_bits; - params.local_history_table_size = local_history_table_size; - params.local_history_bits = local_history_bits; - params.global_predictor_size = global_predictor_size; - params.global_ctr_bits = global_ctr_bits; - params.global_history_bits = global_history_bits; - params.choice_predictor_size = choice_predictor_size; - params.choice_ctr_bits = choice_ctr_bits; + params->iewToCommitDelay = iewToCommitDelay; + params->renameToROBDelay = renameToROBDelay; + params->commitWidth = commitWidth; + params->squashWidth = squashWidth; - params.BTBEntries = BTBEntries; - params.BTBTagSize = BTBTagSize; - params.RASSize = RASSize; + params->localPredictorSize = localPredictorSize; + params->localCtrBits = localCtrBits; + params->localHistoryTableSize = localHistoryTableSize; + params->localHistoryBits = localHistoryBits; + params->globalPredictorSize = globalPredictorSize; + params->globalCtrBits = globalCtrBits; + params->globalHistoryBits = globalHistoryBits; + params->choicePredictorSize = choicePredictorSize; + params->choiceCtrBits = choiceCtrBits; - params.LQEntries = LQEntries; - params.SQEntries = SQEntries; - params.SSITSize = SSITSize; - params.LFSTSize = LFSTSize; + params->BTBEntries = BTBEntries; + params->BTBTagSize = BTBTagSize; - params.numPhysIntRegs = numPhysIntRegs; - params.numPhysFloatRegs = numPhysFloatRegs; - params.numIQEntries = numIQEntries; - 
params.numROBEntries = numROBEntries; + params->RASSize = RASSize; - params.instShiftAmt = 2; + params->LQEntries = LQEntries; + params->SQEntries = SQEntries; - params.defReg = defer_registration; + params->SSITSize = SSITSize; + params->LFSTSize = LFSTSize; - params.functionTrace = function_trace; - params.functionTraceStart = function_trace_start; + params->numPhysIntRegs = numPhysIntRegs; + params->numPhysFloatRegs = numPhysFloatRegs; + params->numIQEntries = numIQEntries; + params->numROBEntries = numROBEntries; + + params->smtNumFetchingThreads = smtNumFetchingThreads; + params->smtFetchPolicy = smtFetchPolicy; + params->smtIQPolicy = smtIQPolicy; + params->smtLSQPolicy = smtLSQPolicy; + params->smtLSQThreshold = smtLSQThreshold; + params->smtROBPolicy = smtROBPolicy; + params->smtROBThreshold = smtROBThreshold; + params->smtCommitPolicy = smtCommitPolicy; + + params->instShiftAmt = 2; + + params->deferRegistration = defer_registration; + + params->functionTrace = function_trace; + params->functionTraceStart = function_trace_start; cpu = new DerivAlphaFullCPU(params); diff --git a/cpu/o3/alpha_cpu_impl.hh b/cpu/o3/alpha_cpu_impl.hh index 9f1fa24f6..86f7d9f28 100644 --- a/cpu/o3/alpha_cpu_impl.hh +++ b/cpu/o3/alpha_cpu_impl.hh @@ -30,6 +30,7 @@ #include "base/cprintf.hh" #include "base/statistics.hh" #include "base/timebuf.hh" +#include "cpu/quiesce_event.hh" #include "mem/cache/cache.hh" // for dynamic cast #include "mem/mem_interface.hh" #include "sim/builder.hh" @@ -39,18 +40,79 @@ #include "cpu/o3/alpha_cpu.hh" #include "cpu/o3/alpha_params.hh" #include "cpu/o3/comm.hh" +#include "cpu/o3/thread_state.hh" #if FULL_SYSTEM #include "arch/alpha/osfpal.hh" -#include "arch/alpha/isa_traits.hh" +#include "arch/isa_traits.hh" #endif +using namespace TheISA; + template -AlphaFullCPU::AlphaFullCPU(Params ¶ms) +AlphaFullCPU::AlphaFullCPU(Params *params) +#if FULL_SYSTEM + : FullO3CPU(params), itb(params->itb), dtb(params->dtb) +#else : FullO3CPU(params) +#endif { 
DPRINTF(FullCPU, "AlphaFullCPU: Creating AlphaFullCPU object.\n"); + this->thread.resize(this->numThreads); + + for (int i = 0; i < this->numThreads; ++i) { +#if FULL_SYSTEM + assert(i == 0); + this->thread[i] = new Thread(this, 0, params->mem); +// this->system->execContexts[i] = this->thread[i]->getXCProxy(); + this->thread[i]->setStatus(ExecContext::Suspended); + +#else + if (i < params->workload.size()) { + DPRINTF(FullCPU, "FullCPU: Workload[%i]'s starting PC is %#x, " + "process is %#x", + i, params->workload[i]->prog_entry, this->thread[i]); + this->thread[i] = new Thread(this, i, params->workload[i], i); + assert(params->workload[i]->getMemory() != NULL); + + this->thread[i]->setStatus(ExecContext::Suspended); + //usedTids[i] = true; + //threadMap[i] = i; + } else { + //Allocate Empty execution context so M5 can use later + //when scheduling threads to CPU + Process* dummy_proc = NULL; + + this->thread[i] = new Thread(this, i, dummy_proc, i); + //usedTids[i] = false; + } +#endif // !FULL_SYSTEM + + this->thread[i]->numInst = 0; + + xcProxies.push_back(new AlphaXC); + + xcProxies[i]->cpu = this; + xcProxies[i]->thread = this->thread[i]; + + xcProxies[i]->quiesceEvent = new EndQuiesceEvent(xcProxies[i]); + xcProxies[i]->lastActivate = 0; + xcProxies[i]->lastSuspend = 0; + + + this->thread[i]->xcProxy = xcProxies[i]; + + this->execContexts.push_back(this->thread[i]->getXCProxy()); + } + + + for (int i=0; i < this->numThreads; i++) { + this->thread[i]->funcExeInst = 0; + } + + // Sets CPU pointers. These must be set at this level because the CPU + // pointers are defined to be the highest level of CPU class. 
this->fetch.setCPU(this); this->decode.setCPU(this); this->rename.setCPU(this); @@ -58,6 +120,10 @@ AlphaFullCPU::AlphaFullCPU(Params ¶ms) this->commit.setCPU(this); this->rob.setCPU(this); + this->regFile.setCPU(this); + + lockAddr = 0; + lockFlag = false; } template @@ -73,182 +139,436 @@ AlphaFullCPU::regStats() this->commit.regStats(); } -#if !FULL_SYSTEM - -// Will probably need to know which thread is calling syscall -// Will need to pass that information in to the DynInst when it is constructed, -// so that this call can be made with the proper thread number. +#if FULL_SYSTEM template void -AlphaFullCPU::syscall(short thread_num) +AlphaFullCPU::AlphaXC::dumpFuncProfile() { - DPRINTF(FullCPU, "AlphaFullCPU: Syscall() called.\n\n"); +} +#endif - // Commit stage needs to run as well. - this->commit.tick(); +template +void +AlphaFullCPU::AlphaXC::takeOverFrom(ExecContext *old_context) +{ +} - squashStages(); +template +void +AlphaFullCPU::AlphaXC::activate(int delay) +{ + DPRINTF(FullCPU, "Calling activate on AlphaXC\n"); +// warn("Calling activate on AlphaXC"); + if (thread->status() == ExecContext::Active) + return; + + lastActivate = curTick; + + if (thread->status() == ExecContext::Unallocated) { + cpu->activateWhenReady(thread->tid); + return; + } + + thread->setStatus(ExecContext::Active); + + // status() == Suspended + cpu->activateContext(thread->tid, delay); +} + +template +void +AlphaFullCPU::AlphaXC::suspend() +{ + DPRINTF(FullCPU, "Calling suspend on AlphaXC\n"); +// warn("Calling suspend on AlphaXC"); + if (thread->status() == ExecContext::Suspended) + return; + + lastActivate = curTick; + lastSuspend = curTick; +/* +#if FULL_SYSTEM + // Don't change the status from active if there are pending interrupts + if (cpu->check_interrupts()) { + assert(status() == ExecContext::Active); + return; + } +#endif +*/ + thread->setStatus(ExecContext::Suspended); + cpu->suspendContext(thread->tid); +} + +template +void +AlphaFullCPU::AlphaXC::deallocate() +{ + 
DPRINTF(FullCPU, "Calling deallocate on AlphaXC\n"); +// warn("Calling deallocate on AlphaXC"); + if (thread->status() == ExecContext::Unallocated) + return; + + thread->setStatus(ExecContext::Unallocated); + cpu->deallocateContext(thread->tid); +} + +template +void +AlphaFullCPU::AlphaXC::halt() +{ + DPRINTF(FullCPU, "Calling halt on AlphaXC\n"); +// warn("Calling halt on AlphaXC"); + if (thread->status() == ExecContext::Halted) + return; + + thread->setStatus(ExecContext::Halted); + cpu->haltContext(thread->tid); +} + +template +void +AlphaFullCPU::AlphaXC::regStats(const std::string &name) +{} + +template +void +AlphaFullCPU::AlphaXC::serialize(std::ostream &os) +{} +template +void +AlphaFullCPU::AlphaXC::unserialize(Checkpoint *cp, const std::string §ion) +{} + +#if FULL_SYSTEM +template +Event * +AlphaFullCPU::AlphaXC::getQuiesceEvent() +{ + return quiesceEvent; +} + +template +Tick +AlphaFullCPU::AlphaXC::readLastActivate() +{ + return lastActivate; +} + +template +Tick +AlphaFullCPU::AlphaXC::readLastSuspend() +{ + return lastSuspend; +} + +template +void +AlphaFullCPU::AlphaXC::profileClear() +{} + +template +void +AlphaFullCPU::AlphaXC::profileSample() +{} +#endif + +template +TheISA::MachInst +AlphaFullCPU::AlphaXC:: getInst() +{ + return thread->inst; +} + +template +void +AlphaFullCPU::AlphaXC::copyArchRegs(ExecContext *xc) +{ + // This function will mess things up unless the ROB is empty and + // there are no instructions in the pipeline. + unsigned tid = thread->tid; + PhysRegIndex renamed_reg; + + // First loop through the integer registers. + for (int i = 0; i < AlphaISA::NumIntRegs; ++i) { + renamed_reg = cpu->renameMap[tid].lookup(i); + + DPRINTF(FullCPU, "FullCPU: Copying over register %i, had data %lli, " + "now has data %lli.\n", + renamed_reg, cpu->readIntReg(renamed_reg), + xc->readIntReg(i)); + + cpu->setIntReg(renamed_reg, xc->readIntReg(i)); + } + + // Then loop through the floating point registers. 
+ for (int i = 0; i < AlphaISA::NumFloatRegs; ++i) { + renamed_reg = cpu->renameMap[tid].lookup(i + AlphaISA::FP_Base_DepTag); + cpu->setFloatRegDouble(renamed_reg, + xc->readFloatRegDouble(i)); + cpu->setFloatRegInt(renamed_reg, + xc->readFloatRegInt(i)); + } + + // Copy the misc regs. + cpu->regFile.miscRegs[tid].copyMiscRegs(xc); + + // Then finally set the PC and the next PC. + cpu->setPC(xc->readPC(), tid); + cpu->setNextPC(xc->readNextPC(), tid); +#if !FULL_SYSTEM + this->thread->funcExeInst = xc->readFuncExeInst(); +#endif +} + +template +void +AlphaFullCPU::AlphaXC::clearArchRegs() +{} + +// +// New accessors for new decoder. +// +template +uint64_t +AlphaFullCPU::AlphaXC::readIntReg(int reg_idx) +{ + DPRINTF(Fault, "Reading int register through the XC!\n"); + return cpu->readArchIntReg(reg_idx, thread->tid); +} + +template +float +AlphaFullCPU::AlphaXC::readFloatRegSingle(int reg_idx) +{ + DPRINTF(Fault, "Reading float register through the XC!\n"); + return cpu->readArchFloatRegSingle(reg_idx, thread->tid); +} + +template +double +AlphaFullCPU::AlphaXC::readFloatRegDouble(int reg_idx) +{ + DPRINTF(Fault, "Reading float register through the XC!\n"); + return cpu->readArchFloatRegDouble(reg_idx, thread->tid); +} + +template +uint64_t +AlphaFullCPU::AlphaXC::readFloatRegInt(int reg_idx) +{ + DPRINTF(Fault, "Reading floatint register through the XC!\n"); + return cpu->readArchFloatRegInt(reg_idx, thread->tid); +} + +template +void +AlphaFullCPU::AlphaXC::setIntReg(int reg_idx, uint64_t val) +{ + DPRINTF(Fault, "Setting int register through the XC!\n"); + cpu->setArchIntReg(reg_idx, val, thread->tid); + + if (!thread->trapPending && !thread->inSyscall) { + cpu->squashFromXC(thread->tid); + } +} + +template +void +AlphaFullCPU::AlphaXC::setFloatRegSingle(int reg_idx, float val) +{ + DPRINTF(Fault, "Setting float register through the XC!\n"); + cpu->setArchFloatRegSingle(reg_idx, val, thread->tid); + + if (!thread->trapPending && !thread->inSyscall) { + 
cpu->squashFromXC(thread->tid); + } +} + +template +void +AlphaFullCPU::AlphaXC::setFloatRegDouble(int reg_idx, double val) +{ + DPRINTF(Fault, "Setting float register through the XC!\n"); + cpu->setArchFloatRegDouble(reg_idx, val, thread->tid); + + if (!thread->trapPending && !thread->inSyscall) { + cpu->squashFromXC(thread->tid); + } +} + +template +void +AlphaFullCPU::AlphaXC::setFloatRegInt(int reg_idx, uint64_t val) +{ + DPRINTF(Fault, "Setting floatint register through the XC!\n"); + cpu->setArchFloatRegInt(reg_idx, val, thread->tid); + + if (!thread->trapPending && !thread->inSyscall) { + cpu->squashFromXC(thread->tid); + } +} + +template +void +AlphaFullCPU::AlphaXC::setPC(uint64_t val) +{ + cpu->setPC(val, thread->tid); + + if (!thread->trapPending && !thread->inSyscall) { + cpu->squashFromXC(thread->tid); + } +} + +template +void +AlphaFullCPU::AlphaXC::setNextPC(uint64_t val) +{ + cpu->setNextPC(val, thread->tid); + + if (!thread->trapPending && !thread->inSyscall) { + cpu->squashFromXC(thread->tid); + } +} + +template +Fault +AlphaFullCPU::AlphaXC::setMiscReg(int misc_reg, const MiscReg &val) +{ + DPRINTF(Fault, "Setting misc register through the XC!\n"); + + Fault ret_fault = cpu->setMiscReg(misc_reg, val, thread->tid); + + if (!thread->trapPending && !thread->inSyscall) { + cpu->squashFromXC(thread->tid); + } + + return ret_fault; +} + +template +Fault +AlphaFullCPU::AlphaXC::setMiscRegWithEffect(int misc_reg, const MiscReg &val) +{ + DPRINTF(Fault, "Setting misc register through the XC!\n"); + + Fault ret_fault = cpu->setMiscRegWithEffect(misc_reg, val, thread->tid); + + if (!thread->trapPending && !thread->inSyscall) { + cpu->squashFromXC(thread->tid); + } + + return ret_fault; +} + +#if !FULL_SYSTEM + +template +TheISA::IntReg +AlphaFullCPU::AlphaXC::getSyscallArg(int i) +{ + return cpu->getSyscallArg(i, thread->tid); +} + +template +void +AlphaFullCPU::AlphaXC::setSyscallArg(int i, IntReg val) +{ + cpu->setSyscallArg(i, val, thread->tid); +} + 
+template +void +AlphaFullCPU::AlphaXC::setSyscallReturn(SyscallReturn return_value) +{ + cpu->setSyscallReturn(return_value, thread->tid); +} + +template +void +AlphaFullCPU::syscall(int tid) +{ + DPRINTF(FullCPU, "AlphaFullCPU: [tid:%i] Executing syscall().\n\n", tid); + + DPRINTF(Activity,"Activity: syscall() called.\n"); // Temporarily increase this by one to account for the syscall // instruction. - ++(this->funcExeInst); + ++(this->thread[tid]->funcExeInst); - // Copy over all important state to xc once all the unrolling is done. - copyToXC(); - - // This is hardcoded to thread 0 while the CPU is only single threaded. - this->thread[0]->syscall(); - - // Copy over all important state back to CPU. - copyFromXC(); + // Execute the actual syscall. + this->thread[tid]->syscall(); // Decrease funcExeInst by one as the normal commit will handle - // incrememnting it. - --(this->funcExeInst); -} - -// This is not a pretty function, and should only be used if it is necessary -// to fake having everything squash all at once (ie for non-full system -// syscalls). Maybe put this at the FullCPU level? -template -void -AlphaFullCPU::squashStages() -{ - InstSeqNum rob_head = this->rob.readHeadSeqNum(); - - // Now hack the time buffer to put this sequence number in the places - // where the stages might read it. - for (int i = 0; i < 5; ++i) - { - this->timeBuffer.access(-i)->commitInfo.doneSeqNum = rob_head; - } - - this->fetch.squash(this->rob.readHeadNextPC()); - this->fetchQueue.advance(); - - this->decode.squash(); - this->decodeQueue.advance(); - - this->rename.squash(); - this->renameQueue.advance(); - this->renameQueue.advance(); - - // Be sure to advance the IEW queues so that the commit stage doesn't - // try to set an instruction as completed at the same time that it - // might be deleting it. 
- this->iew.squash(); - this->iewQueue.advance(); - this->iewQueue.advance(); - // Needs to tell the LSQ to write back all of its data - this->iew.lsqWriteback(); - - this->rob.squash(rob_head); - this->commit.setSquashing(); - - // Now hack the time buffer to clear the sequence numbers in the places - // where the stages might read it.? - for (int i = 0; i < 5; ++i) - { - this->timeBuffer.access(-i)->commitInfo.doneSeqNum = 0; - } - + // incrementing it. + --(this->thread[tid]->funcExeInst); } #endif // FULL_SYSTEM template -void -AlphaFullCPU::copyToXC() +MiscReg +AlphaFullCPU::readMiscReg(int misc_reg, unsigned tid) { - PhysRegIndex renamed_reg; - - // First loop through the integer registers. - for (int i = 0; i < AlphaISA::NumIntRegs; ++i) - { - renamed_reg = this->renameMap.lookup(i); - this->cpuXC->setIntReg(i, this->regFile.readIntReg(renamed_reg)); - DPRINTF(FullCPU, "FullCPU: Copying register %i, has data %lli.\n", - renamed_reg, this->regFile.intRegFile[renamed_reg]); - } - - // Then loop through the floating point registers. 
- for (int i = 0; i < AlphaISA::NumFloatRegs; ++i) - { - renamed_reg = this->renameMap.lookup(i + AlphaISA::FP_Base_DepTag); - this->cpuXC->setFloatRegDouble(i, - this->regFile.readFloatRegDouble(renamed_reg)); - this->cpuXC->setFloatRegInt(i, - this->regFile.readFloatRegInt(renamed_reg)); - } - - this->cpuXC->setMiscReg(AlphaISA::Fpcr_DepTag, - this->regFile.readMiscReg(AlphaISA::Fpcr_DepTag)); - this->cpuXC->setMiscReg(AlphaISA::Uniq_DepTag, - this->regFile.readMiscReg(AlphaISA::Uniq_DepTag)); - this->cpuXC->setMiscReg(AlphaISA::Lock_Flag_DepTag, - this->regFile.readMiscReg(AlphaISA::Lock_Flag_DepTag)); - this->cpuXC->setMiscReg(AlphaISA::Lock_Addr_DepTag, - this->regFile.readMiscReg(AlphaISA::Lock_Addr_DepTag)); - - this->cpuXC->setPC(this->rob.readHeadPC()); - this->cpuXC->setNextPC(this->cpuXC->readPC()+4); - -#if !FULL_SYSTEM - this->cpuXC->setFuncExeInst(this->funcExeInst); -#endif + return this->regFile.readMiscReg(misc_reg, tid); +} + +template +MiscReg +AlphaFullCPU::readMiscRegWithEffect(int misc_reg, Fault &fault, + unsigned tid) +{ + return this->regFile.readMiscRegWithEffect(misc_reg, fault, tid); +} + +template +Fault +AlphaFullCPU::setMiscReg(int misc_reg, const MiscReg &val, unsigned tid) +{ + // I think that these registers should always be set, regardless of what + // mode the thread is in. The main difference is if the thread needs to + // squash as a result of the write, which is controlled by the AlphaXC. +// if (!this->thread[tid]->trapPending) { + return this->regFile.setMiscReg(misc_reg, val, tid); +// } else { +// return NoFault; +// } +} + +template +Fault +AlphaFullCPU::setMiscRegWithEffect(int misc_reg, const MiscReg &val, + unsigned tid) +{ +// if (!this->thread[tid]->trapPending) { + return this->regFile.setMiscRegWithEffect(misc_reg, val, tid); +// } else { +// return NoFault; +// } } -// This function will probably mess things up unless the ROB is empty and -// there are no instructions in the pipeline. 
template void -AlphaFullCPU::copyFromXC() +AlphaFullCPU::squashFromXC(unsigned tid) { - PhysRegIndex renamed_reg; - - // First loop through the integer registers. - for (int i = 0; i < AlphaISA::NumIntRegs; ++i) - { - renamed_reg = this->renameMap.lookup(i); - - DPRINTF(FullCPU, "FullCPU: Copying over register %i, had data %lli, " - "now has data %lli.\n", - renamed_reg, this->regFile.intRegFile[renamed_reg], - this->cpuXC->readIntReg(i)); - - this->regFile.setIntReg(renamed_reg, this->cpuXC->readIntReg(i)); - } - - // Then loop through the floating point registers. - for (int i = 0; i < AlphaISA::NumFloatRegs; ++i) - { - renamed_reg = this->renameMap.lookup(i + AlphaISA::FP_Base_DepTag); - this->regFile.setFloatRegDouble(renamed_reg, - this->cpuXC->readFloatRegDouble(i)); - this->regFile.setFloatRegInt(renamed_reg, - this->cpuXC->readFloatRegInt(i)); - } - - // Then loop through the misc registers. - this->regFile.setMiscReg(AlphaISA::Fpcr_DepTag, - this->cpuXC->readMiscReg(AlphaISA::Fpcr_DepTag)); - this->regFile.setMiscReg(AlphaISA::Uniq_DepTag, - this->cpuXC->readMiscReg(AlphaISA::Uniq_DepTag)); - this->regFile.setMiscReg(AlphaISA::Lock_Flag_DepTag, - this->cpuXC->readMiscReg(AlphaISA::Lock_Flag_DepTag)); - this->regFile.setMiscReg(AlphaISA::Lock_Addr_DepTag, - this->cpuXC->readMiscReg(AlphaISA::Lock_Addr_DepTag)); - - // Then finally set the PC and the next PC. 
-// regFile.pc = cpuXC->regs.pc; -// regFile.npc = cpuXC->regs.npc; -#if !FULL_SYSTEM - this->funcExeInst = this->cpuXC->readFuncExeInst(); -#endif +// this->thread[tid]->trapPending = true; + this->thread[tid]->inSyscall = true; + this->commit.generateXCEvent(tid); } #if FULL_SYSTEM +template +void +AlphaFullCPU::post_interrupt(int int_num, int index) +{ + BaseCPU::post_interrupt(int_num, index); + + if (this->thread[0]->status() == ExecContext::Suspended) { + DPRINTF(IPI,"Suspended Processor awoke\n"); + xcProxies[0]->activate(); + } +} + template int AlphaFullCPU::readIntrFlag() @@ -263,23 +583,26 @@ AlphaFullCPU::setIntrFlag(int val) this->regFile.setIntrFlag(val); } -// Can force commit stage to squash and stuff. template Fault -AlphaFullCPU::hwrei() +AlphaFullCPU::hwrei(unsigned tid) { - if (!inPalMode()) +#if 0 + if (!inPalMode(this->readPC(tid))) return new AlphaISA::UnimplementedOpcodeFault; - this->setNextPC(this->regFile.miscRegs.readReg(AlphaISA::IPR_EXC_ADDR)); + setNextPC(cpu->readMiscReg(AlphaISA::IPR_EXC_ADDR, tid), tid); -// kernelStats.hwrei(); + cpu->kernelStats->hwrei(); - if ((this->regFile.miscRegs.readReg(AlphaISA::IPR_EXC_ADDR) & 1) == 0) +// if ((this->regFile.miscRegs[tid].readReg(AlphaISA::IPR_EXC_ADDR) & 1) == 0) // AlphaISA::swap_palshadow(®s, false); - this->checkInterrupts = true; - + cpu->checkInterrupts = true; +#endif +// panic("Do not call this function!"); + // Need to clear the lock flag upon returning from an interrupt. + this->lockFlag = false; // FIXME: XXX check for interrupts? XXX return NoFault; } @@ -312,8 +635,10 @@ AlphaFullCPU::simPalCheck(int palFunc) // stage. 
template void -AlphaFullCPU::trap(Fault fault) +AlphaFullCPU::trap(Fault fault, unsigned tid) { + + fault->invoke(this->xcProxies[tid]); /* // Keep in mind that a trap may be initiated by fetch if there's a TLB // miss uint64_t PC = this->commit.readCommitPC(); @@ -344,32 +669,93 @@ AlphaFullCPU::trap(Fault fault) swapPALShadow(true); this->regFile.setPC(this->regFile.miscRegs.readReg(AlphaISA::IPR_PAL_BASE) + - (dynamic_cast(fault.get()))->vect()); - this->regFile.setNextPC(PC + sizeof(MachInst));*/ + (dynamic_cast(fault.get()))->vect(), 0); + this->regFile.setNextPC(PC + sizeof(MachInst), 0);*/ } template void AlphaFullCPU::processInterrupts() { - // Check for interrupts here. For now can copy the code that exists - // within isa_fullsys_traits.hh. -} + // Check for interrupts here. For now can copy the code that + // exists within isa_fullsys_traits.hh. Also assume that thread 0 + // is the one that handles the interrupts. -// swap_palshadow swaps in the values of the shadow registers and -// swaps them with the values of the physical registers that map to the -// same logical index. -template -void -AlphaFullCPU::swapPALShadow(bool use_shadow) -{ - if (palShadowEnabled == use_shadow) - panic("swap_palshadow: wrong PAL shadow state"); + // Check if there are any outstanding interrupts + //Handle the interrupts + int ipl = 0; + int summary = 0; - palShadowEnabled = use_shadow; + this->checkInterrupts = false; - // Will have to lookup in rename map to get physical registers, then - // swap. 
+ if (this->readMiscReg(IPR_ASTRR, 0)) + panic("asynchronous traps not implemented\n"); + + if (this->readMiscReg(IPR_SIRR, 0)) { + for (int i = INTLEVEL_SOFTWARE_MIN; + i < INTLEVEL_SOFTWARE_MAX; i++) { + if (this->readMiscReg(IPR_SIRR, 0) & (ULL(1) << i)) { + // See table 4-19 of the 21164 hardware reference + ipl = (i - INTLEVEL_SOFTWARE_MIN) + 1; + summary |= (ULL(1) << i); + } + } + } + + uint64_t interrupts = this->intr_status(); + + if (interrupts) { + for (int i = INTLEVEL_EXTERNAL_MIN; + i < INTLEVEL_EXTERNAL_MAX; i++) { + if (interrupts & (ULL(1) << i)) { + // See table 4-19 of the 21164 hardware reference + ipl = i; + summary |= (ULL(1) << i); + } + } + } + + if (ipl && ipl > this->readMiscReg(IPR_IPLR, 0)) { + this->setMiscReg(IPR_ISR, summary, 0); + this->setMiscReg(IPR_INTID, ipl, 0); + this->trap(Fault(new InterruptFault), 0); + DPRINTF(Flow, "Interrupt! IPLR=%d ipl=%d summary=%x\n", + this->readMiscReg(IPR_IPLR, 0), ipl, summary); + } } #endif // FULL_SYSTEM + +#if !FULL_SYSTEM +template +TheISA::IntReg +AlphaFullCPU::getSyscallArg(int i, int tid) +{ + return this->readArchIntReg(AlphaISA::ArgumentReg0 + i, tid); +} + +template +void +AlphaFullCPU::setSyscallArg(int i, IntReg val, int tid) +{ + this->setArchIntReg(AlphaISA::ArgumentReg0 + i, val, tid); +} + +template +void +AlphaFullCPU::setSyscallReturn(SyscallReturn return_value, int tid) +{ + // check for error condition. Alpha syscall convention is to + // indicate success/failure in reg a3 (r19) and put the + // return value itself in the standard return value reg (v0). 
+ if (return_value.successful()) { + // no error + this->setArchIntReg(SyscallSuccessReg, 0, tid); + this->setArchIntReg(ReturnValueReg, return_value.value(), tid); + } else { + // got an error, return details + this->setArchIntReg(SyscallSuccessReg, (IntReg) -1, tid); + this->setArchIntReg(ReturnValueReg, -return_value.value(), tid); + } +} +#endif diff --git a/cpu/o3/alpha_dyn_inst.hh b/cpu/o3/alpha_dyn_inst.hh index e7f7d3a57..e0b73f17e 100644 --- a/cpu/o3/alpha_dyn_inst.hh +++ b/cpu/o3/alpha_dyn_inst.hh @@ -26,21 +26,24 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __CPU_O3_CPU_ALPHA_DYN_INST_HH__ -#define __CPU_O3_CPU_ALPHA_DYN_INST_HH__ +#ifndef __CPU_O3_ALPHA_DYN_INST_HH__ +#define __CPU_O3_ALPHA_DYN_INST_HH__ #include "cpu/base_dyn_inst.hh" +#include "cpu/inst_seq.hh" #include "cpu/o3/alpha_cpu.hh" #include "cpu/o3/alpha_impl.hh" -#include "cpu/inst_seq.hh" /** - * Mostly implementation specific AlphaDynInst. It is templated in case there - * are other implementations that are similar enough to be able to use this - * class without changes. This is mainly useful if there are multiple similar - * CPU implementations of the same ISA. + * Mostly implementation & ISA specific AlphaDynInst. As with most other classes + * in the new CPU model, it is templated on the Impl to allow for passing in of + * all types, such as the CPU type and the ISA type. The AlphaDynInst serves + * as the primary interface to the CPU; it plays the role that the ExecContext + * does for the old CPU and the SimpleCPU. The goal is to abstract ExecContext + * purely into an interface, and have it forward calls to the appropriate + * CPU interface, which in the new CPU model's case would be this AlphaDynInst, + * or any other high level implementation specific DynInst. */ - template class AlphaDynInst : public BaseDynInst { @@ -50,6 +53,8 @@ class AlphaDynInst : public BaseDynInst /** Binary machine instruction type. 
*/ typedef TheISA::MachInst MachInst; + /** Extended machine instruction type. */ + typedef TheISA::ExtMachInst ExtMachInst; /** Logical register index type. */ typedef TheISA::RegIndex RegIndex; /** Integer register index type. */ @@ -64,55 +69,60 @@ class AlphaDynInst : public BaseDynInst public: /** BaseDynInst constructor given a binary instruction. */ - AlphaDynInst(MachInst inst, Addr PC, Addr Pred_PC, InstSeqNum seq_num, + AlphaDynInst(ExtMachInst inst, Addr PC, Addr Pred_PC, InstSeqNum seq_num, FullCPU *cpu); /** BaseDynInst constructor given a static inst pointer. */ AlphaDynInst(StaticInstPtr &_staticInst); /** Executes the instruction.*/ - Fault execute() - { - return this->fault = this->staticInst->execute(this, this->traceData); - } + Fault execute(); + + Fault initiateAcc(); + + Fault completeAcc(); + + private: + /** Initializes variables. */ + void initVars(); public: MiscReg readMiscReg(int misc_reg) { - // Dummy function for now. - // @todo: Fix this once reg file gets fixed. - return 0; + return this->cpu->readMiscReg(misc_reg, this->threadNumber); } MiscReg readMiscRegWithEffect(int misc_reg, Fault &fault) { - // Dummy function for now. - // @todo: Fix this once reg file gets fixed. - return 0; + return this->cpu->readMiscRegWithEffect(misc_reg, fault, + this->threadNumber); } Fault setMiscReg(int misc_reg, const MiscReg &val) { - // Dummy function for now. - // @todo: Fix this once reg file gets fixed. - return NoFault; + return this->cpu->setMiscReg(misc_reg, val, this->threadNumber); } Fault setMiscRegWithEffect(int misc_reg, const MiscReg &val) { - // Dummy function for now. - // @todo: Fix this once reg file gets fixed. - return NoFault; + return this->cpu->setMiscRegWithEffect(misc_reg, val, + this->threadNumber); } #if FULL_SYSTEM + /** Calls hardware return from error interrupt. */ Fault hwrei(); + /** Reads interrupt flag. */ int readIntrFlag(); + /** Sets interrupt flag. 
*/ void setIntrFlag(int val); + /** Checks if system is in PAL mode. */ bool inPalMode(); + /** Traps to handle specified fault. */ void trap(Fault fault); bool simPalCheck(int palFunc); #else + /** Calls a syscall. */ void syscall(); #endif @@ -237,16 +247,24 @@ class AlphaDynInst : public BaseDynInst } public: + /** Calculates EA part of a memory instruction. Currently unused, though + * it may be useful in the future when memory instructions aren't + * executed with the EA calculation and the memory access being atomic. + */ Fault calcEA() { return this->staticInst->eaCompInst()->execute(this, this->traceData); } + /** Does the memory access part of a memory instruction. Currently unused, + * though it may be useful in the future when memory instructions aren't + * executed with the EA calculation and the memory access being atomic. + */ Fault memAccess() { return this->staticInst->memAccInst()->execute(this, this->traceData); } }; -#endif // __CPU_O3_CPU_ALPHA_DYN_INST_HH__ +#endif // __CPU_O3_ALPHA_DYN_INST_HH__ diff --git a/cpu/o3/alpha_dyn_inst_impl.hh b/cpu/o3/alpha_dyn_inst_impl.hh index 96b7d3430..b5999f8d1 100644 --- a/cpu/o3/alpha_dyn_inst_impl.hh +++ b/cpu/o3/alpha_dyn_inst_impl.hh @@ -29,57 +29,117 @@ #include "cpu/o3/alpha_dyn_inst.hh" template -AlphaDynInst::AlphaDynInst(MachInst inst, Addr PC, Addr Pred_PC, +AlphaDynInst::AlphaDynInst(ExtMachInst inst, Addr PC, Addr Pred_PC, InstSeqNum seq_num, FullCPU *cpu) : BaseDynInst(inst, PC, Pred_PC, seq_num, cpu) { - // Make sure to have the renamed register entries set to the same - // as the normal register entries. It will allow the IQ to work - // without any modifications. 
- for (int i = 0; i < this->staticInst->numDestRegs(); i++) - { - _destRegIdx[i] = this->staticInst->destRegIdx(i); - } - - for (int i = 0; i < this->staticInst->numSrcRegs(); i++) - { - _srcRegIdx[i] = this->staticInst->srcRegIdx(i); - this->_readySrcRegIdx[i] = 0; - } - + initVars(); } template AlphaDynInst::AlphaDynInst(StaticInstPtr &_staticInst) : BaseDynInst(_staticInst) +{ + initVars(); +} + +template +void +AlphaDynInst::initVars() { // Make sure to have the renamed register entries set to the same // as the normal register entries. It will allow the IQ to work // without any modifications. - for (int i = 0; i < _staticInst->numDestRegs(); i++) - { - _destRegIdx[i] = _staticInst->destRegIdx(i); + for (int i = 0; i < this->staticInst->numDestRegs(); i++) { + _destRegIdx[i] = this->staticInst->destRegIdx(i); } - for (int i = 0; i < _staticInst->numSrcRegs(); i++) - { - _srcRegIdx[i] = _staticInst->srcRegIdx(i); + for (int i = 0; i < this->staticInst->numSrcRegs(); i++) { + _srcRegIdx[i] = this->staticInst->srcRegIdx(i); + this->_readySrcRegIdx[i] = 0; } } +template +Fault +AlphaDynInst::execute() +{ + // @todo: Pretty convoluted way to avoid squashing from happening when using + // the XC during an instruction's execution (specifically for instructions + // that have sideeffects that use the XC). Fix this. + bool in_syscall = this->thread->inSyscall; + this->thread->inSyscall = true; + + this->fault = this->staticInst->execute(this, this->traceData); + + this->thread->inSyscall = in_syscall; + + return this->fault; +} + +template +Fault +AlphaDynInst::initiateAcc() +{ + // @todo: Pretty convoluted way to avoid squashing from happening when using + // the XC during an instruction's execution (specifically for instructions + // that have sideeffects that use the XC). Fix this. 
+ bool in_syscall = this->thread->inSyscall; + this->thread->inSyscall = true; + + this->fault = this->staticInst->initiateAcc(this, this->traceData); + + this->thread->inSyscall = in_syscall; + + return this->fault; +} + +template +Fault +AlphaDynInst::completeAcc() +{ + if (this->isLoad()) { + this->fault = this->staticInst->completeAcc(this->req->data, + this, + this->traceData); + } else if (this->isStore()) { + this->fault = this->staticInst->completeAcc((uint8_t*)&this->req->result, + this, + this->traceData); + } else { + panic("Unknown type!"); + } + + return this->fault; +} + #if FULL_SYSTEM template Fault AlphaDynInst::hwrei() { - return this->cpu->hwrei(); + if (!this->cpu->inPalMode(this->readPC())) + return new AlphaISA::UnimplementedOpcodeFault; + + this->setNextPC(this->cpu->readMiscReg(AlphaISA::IPR_EXC_ADDR, + this->threadNumber)); + + this->cpu->kernelStats->hwrei(); + + // Tell CPU to clear any state it needs to if a hwrei is taken. + this->cpu->hwrei(this->threadNumber); + + this->cpu->checkInterrupts = true; + + // FIXME: XXX check for interrupts? XXX + return NoFault; } template int AlphaDynInst::readIntrFlag() { -return this->cpu->readIntrFlag(); + return this->cpu->readIntrFlag(); } template @@ -93,14 +153,14 @@ template bool AlphaDynInst::inPalMode() { - return this->cpu->inPalMode(); + return this->cpu->inPalMode(this->PC); } template void AlphaDynInst::trap(Fault fault) { - this->cpu->trap(fault); + this->cpu->trap(fault, this->threadNumber); } template diff --git a/cpu/o3/alpha_impl.hh b/cpu/o3/alpha_impl.hh index 5e39fcb37..f404bd3ec 100644 --- a/cpu/o3/alpha_impl.hh +++ b/cpu/o3/alpha_impl.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __CPU_O3_CPU_ALPHA_IMPL_HH__ -#define __CPU_O3_CPU_ALPHA_IMPL_HH__ +#ifndef __CPU_O3_ALPHA_IMPL_HH__ +#define __CPU_O3_ALPHA_IMPL_HH__ #include "arch/alpha/isa_traits.hh" @@ -41,7 +41,7 @@ class AlphaDynInst; template class AlphaFullCPU; -/** Implementation specific struct that defines several key things to the +/** Implementation specific struct that defines several key types to the * CPU, the stages within the CPU, the time buffers, and the DynInst. * The struct defines the ISA, the CPU policy, the specific DynInst, the * specific FullCPU, and all of the structs from the time buffers to do @@ -54,10 +54,10 @@ struct AlphaSimpleImpl /** The type of MachInst. */ typedef TheISA::MachInst MachInst; - /** The CPU policy to be used (ie fetch, decode, etc.). */ + /** The CPU policy to be used, which defines all of the CPU stages. */ typedef SimpleCPUPolicy CPUPol; - /** The DynInst to be used. */ + /** The DynInst type to be used. */ typedef AlphaDynInst DynInst; /** The refcounted DynInst pointer to be used. In most cases this is @@ -65,15 +65,16 @@ struct AlphaSimpleImpl */ typedef RefCountingPtr DynInstPtr; - /** The FullCPU to be used. */ + /** The FullCPU type to be used. */ typedef AlphaFullCPU FullCPU; /** The Params to be passed to each stage. */ typedef AlphaSimpleParams Params; enum { - MaxWidth = 8 + MaxWidth = 8, + MaxThreads = 4 }; }; -#endif // __CPU_O3_CPU_ALPHA_IMPL_HH__ +#endif // __CPU_O3_ALPHA_IMPL_HH__ diff --git a/cpu/o3/alpha_params.hh b/cpu/o3/alpha_params.hh index 79b0937e3..04b790815 100644 --- a/cpu/o3/alpha_params.hh +++ b/cpu/o3/alpha_params.hh @@ -26,18 +26,19 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __CPU_O3_CPU_ALPHA_SIMPLE_PARAMS_HH__ -#define __CPU_O3_CPU_ALPHA_SIMPLE_PARAMS_HH__ +#ifndef __CPU_O3_ALPHA_PARAMS_HH__ +#define __CPU_O3_ALPHA_PARAMS_HH__ #include "cpu/o3/cpu.hh" //Forward declarations -class System; -class AlphaITB; class AlphaDTB; +class AlphaITB; +class FUPool; class FunctionalMemory; -class Process; class MemInterface; +class Process; +class System; /** * This file defines the parameters that will be used for the AlphaFullCPU. @@ -56,6 +57,9 @@ class AlphaSimpleParams : public BaseFullCPU::Params Process *process; #endif // FULL_SYSTEM + //Page Table +// PageTable *pTable; + FunctionalMemory *mem; // @@ -64,6 +68,8 @@ class AlphaSimpleParams : public BaseFullCPU::Params MemInterface *icacheInterface; MemInterface *dcacheInterface; + unsigned cachePorts; + // // Fetch // @@ -102,6 +108,7 @@ class AlphaSimpleParams : public BaseFullCPU::Params unsigned executeFloatWidth; unsigned executeBranchWidth; unsigned executeMemoryWidth; + FUPool *fuPool; // // Commit @@ -114,20 +121,15 @@ class AlphaSimpleParams : public BaseFullCPU::Params // // Branch predictor (BP & BTB) // -/* unsigned localPredictorSize; - unsigned localPredictorCtrBits; -*/ - - unsigned local_predictor_size; - unsigned local_ctr_bits; - unsigned local_history_table_size; - unsigned local_history_bits; - unsigned global_predictor_size; - unsigned global_ctr_bits; - unsigned global_history_bits; - unsigned choice_predictor_size; - unsigned choice_ctr_bits; + unsigned localCtrBits; + unsigned localHistoryTableSize; + unsigned localHistoryBits; + unsigned globalPredictorSize; + unsigned globalCtrBits; + unsigned globalHistoryBits; + unsigned choicePredictorSize; + unsigned choiceCtrBits; unsigned BTBEntries; unsigned BTBTagSize; @@ -154,10 +156,24 @@ class AlphaSimpleParams : public BaseFullCPU::Params unsigned numIQEntries; unsigned numROBEntries; + //SMT Parameters + unsigned smtNumFetchingThreads; + + std::string smtFetchPolicy; + + std::string smtIQPolicy; + unsigned 
smtIQThreshold; + + std::string smtLSQPolicy; + unsigned smtLSQThreshold; + + std::string smtCommitPolicy; + + std::string smtROBPolicy; + unsigned smtROBThreshold; + // Probably can get this from somewhere. unsigned instShiftAmt; - - bool defReg; }; -#endif // __CPU_O3_CPU_ALPHA_PARAMS_HH__ +#endif // __CPU_O3_ALPHA_PARAMS_HH__ diff --git a/cpu/o3/bpred_unit.cc b/cpu/o3/bpred_unit.cc index 85bd6f0a6..a78dcf463 100644 --- a/cpu/o3/bpred_unit.cc +++ b/cpu/o3/bpred_unit.cc @@ -29,5 +29,9 @@ #include "cpu/o3/bpred_unit_impl.hh" #include "cpu/o3/alpha_impl.hh" #include "cpu/o3/alpha_dyn_inst.hh" +#include "cpu/ozone/ozone_impl.hh" +#include "cpu/ozone/simple_impl.hh" template class TwobitBPredUnit; +template class TwobitBPredUnit; +template class TwobitBPredUnit; diff --git a/cpu/o3/bpred_unit.hh b/cpu/o3/bpred_unit.hh index 2725684f7..67c300989 100644 --- a/cpu/o3/bpred_unit.hh +++ b/cpu/o3/bpred_unit.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __BPRED_UNIT_HH__ -#define __BPRED_UNIT_HH__ +#ifndef __CPU_O3_BPRED_UNIT_HH__ +#define __CPU_O3_BPRED_UNIT_HH__ // For Addr type. #include "arch/isa_traits.hh" @@ -35,9 +35,9 @@ #include "cpu/inst_seq.hh" #include "cpu/o3/2bit_local_pred.hh" -#include "cpu/o3/tournament_pred.hh" #include "cpu/o3/btb.hh" #include "cpu/o3/ras.hh" +#include "cpu/o3/tournament_pred.hh" #include @@ -57,77 +57,171 @@ class TwobitBPredUnit typedef typename Impl::Params Params; typedef typename Impl::DynInstPtr DynInstPtr; - TwobitBPredUnit(Params ¶ms); + /** + * @param params The params object, that has the size of the BP and BTB. + */ + TwobitBPredUnit(Params *params); + /** + * Registers statistics. + */ void regStats(); - bool predict(DynInstPtr &inst, Addr &PC); + /** + * Predicts whether or not the instruction is a taken branch, and the + * target of the branch if it is taken. + * @param inst The branch instruction. + * @param PC The predicted PC is passed back through this parameter. 
+ * @param tid The thread id. + * @return Returns if the branch is taken or not. + */ + bool predict(DynInstPtr &inst, Addr &PC, unsigned tid); - void update(const InstSeqNum &done_sn); + /** + * Tells the branch predictor to commit any updates until the given + * sequence number. + * @param done_sn The sequence number to commit any older updates up until. + * @param tid The thread id. + */ + void update(const InstSeqNum &done_sn, unsigned tid); - void squash(const InstSeqNum &squashed_sn); + /** + * Squashes all outstanding updates until a given sequence number. + * @param squashed_sn The sequence number to squash any younger updates up + * until. + * @param tid The thread id. + */ + void squash(const InstSeqNum &squashed_sn, unsigned tid); + /** + * Squashes all outstanding updates until a given sequence number, and + * corrects that sn's update with the proper address and taken/not taken. + * @param squashed_sn The sequence number to squash any younger updates up + * until. + * @param corr_target The correct branch target. + * @param actually_taken The correct branch direction. + * @param tid The thread id. + */ void squash(const InstSeqNum &squashed_sn, const Addr &corr_target, - bool actually_taken); + bool actually_taken, unsigned tid); + /** + * Looks up a given PC in the BP to see if it is taken or not taken. + * @param inst_PC The PC to look up. + * @return Whether the branch is taken or not taken. + */ bool BPLookup(Addr &inst_PC) { return BP.lookup(inst_PC); } + /** + * Looks up a given PC in the BTB to see if a matching entry exists. + * @param inst_PC The PC to look up. + * @return Whether the BTB contains the given PC. + */ bool BTBValid(Addr &inst_PC) - { return BTB.valid(inst_PC); } + { return BTB.valid(inst_PC, 0); } + /** + * Looks up a given PC in the BTB to get the predicted target. + * @param inst_PC The PC to look up. + * @return The address of the target of the branch. 
+ */ Addr BTBLookup(Addr &inst_PC) - { return BTB.lookup(inst_PC); } + { return BTB.lookup(inst_PC, 0); } - // Will want to include global history. + /** + * Updates the BP with taken/not taken information. + * @param inst_PC The branch's PC that will be updated. + * @param taken Whether the branch was taken or not taken. + * @todo Make this update flexible enough to handle a global predictor. + */ void BPUpdate(Addr &inst_PC, bool taken) { BP.update(inst_PC, taken); } + /** + * Updates the BTB with the target of a branch. + * @param inst_PC The branch's PC that will be updated. + * @param target_PC The branch's target that will be added to the BTB. + */ void BTBUpdate(Addr &inst_PC, Addr &target_PC) - { BTB.update(inst_PC, target_PC); } + { BTB.update(inst_PC, target_PC,0); } private: struct PredictorHistory { + /** + * Makes a predictor history struct that contains a sequence number, + * the PC of its instruction, and whether or not it was predicted + * taken. + */ PredictorHistory(const InstSeqNum &seq_num, const Addr &inst_PC, - const bool pred_taken) - : seqNum(seq_num), PC(inst_PC), predTaken(pred_taken), - globalHistory(0), usedRAS(0), wasCall(0), RASIndex(0), - RASTarget(0) + const bool pred_taken, const unsigned _tid) + : seqNum(seq_num), PC(inst_PC), RASTarget(0), globalHistory(0), + RASIndex(0), tid(_tid), predTaken(pred_taken), usedRAS(0), + wasCall(0) { } + /** The sequence number for the predictor history entry. */ InstSeqNum seqNum; + /** The PC associated with the sequence number. */ Addr PC; - bool predTaken; + /** The RAS target (only valid if a return). */ + Addr RASTarget; + /** The global history at the time this entry was created. */ unsigned globalHistory; - bool usedRAS; - - bool wasCall; - + /** The RAS index of the instruction (only valid if a call). */ unsigned RASIndex; - Addr RASTarget; + /** The thread id. */ + unsigned tid; + + /** Whether or not it was predicted taken. */ + bool predTaken; + + /** Whether or not the RAS was used. 
*/ + bool usedRAS; + + /** Whether or not the instruction was a call. */ + bool wasCall; }; - std::list predHist; + typedef std::list History; + /** + * The per-thread predictor history. This is used to update the predictor + * as instructions are committed, or restore it to the proper state after + * a squash. + */ + History predHist[Impl::MaxThreads]; + + /** The branch predictor. */ DefaultBP BP; + /** The BTB. */ DefaultBTB BTB; - ReturnAddrStack RAS; + /** The per-thread return address stack. */ + ReturnAddrStack RAS[Impl::MaxThreads]; + /** Stat for number of BP lookups. */ Stats::Scalar<> lookups; + /** Stat for number of conditional branches predicted. */ Stats::Scalar<> condPredicted; + /** Stat for number of conditional branches predicted incorrectly. */ Stats::Scalar<> condIncorrect; + /** Stat for number of BTB lookups. */ Stats::Scalar<> BTBLookups; + /** Stat for number of BTB hits. */ Stats::Scalar<> BTBHits; + /** Stat for number of times the BTB is correct. */ Stats::Scalar<> BTBCorrect; + /** Stat for number of times the RAS is used to get a target. */ Stats::Scalar<> usedRAS; + /** Stat for number of times the RAS is incorrect. 
*/ Stats::Scalar<> RASIncorrect; }; -#endif // __BPRED_UNIT_HH__ +#endif // __CPU_O3_BPRED_UNIT_HH__ diff --git a/cpu/o3/bpred_unit_impl.hh b/cpu/o3/bpred_unit_impl.hh index 8d16a0cdf..f79b67b6c 100644 --- a/cpu/o3/bpred_unit_impl.hh +++ b/cpu/o3/bpred_unit_impl.hh @@ -30,16 +30,22 @@ #include "base/traceflags.hh" #include "cpu/o3/bpred_unit.hh" +#include +#include + +using namespace std; + template -TwobitBPredUnit::TwobitBPredUnit(Params ¶ms) - : BP(params.local_predictor_size, - params.local_ctr_bits, - params.instShiftAmt), - BTB(params.BTBEntries, - params.BTBTagSize, - params.instShiftAmt), - RAS(params.RASSize) +TwobitBPredUnit::TwobitBPredUnit(Params *params) + : BP(params->localPredictorSize, + params->localCtrBits, + params->instShiftAmt), + BTB(params->BTBEntries, + params->BTBTagSize, + params->instShiftAmt) { + for (int i=0; i < Impl::MaxThreads; i++) + RAS[i].init(params->RASSize); } template @@ -79,7 +85,7 @@ TwobitBPredUnit::regStats() usedRAS .name(name() + ".BPredUnit.usedRAS") - .desc("Number of times the RAS was used.") + .desc("Number of times the RAS was used to get a target.") ; RASIncorrect @@ -90,7 +96,7 @@ TwobitBPredUnit::regStats() template bool -TwobitBPredUnit::predict(DynInstPtr &inst, Addr &PC) +TwobitBPredUnit::predict(DynInstPtr &inst, Addr &PC, unsigned tid) { // See if branch predictor predicts taken. // If so, get its target addr either from the BTB or the RAS. 
@@ -106,18 +112,19 @@ TwobitBPredUnit::predict(DynInstPtr &inst, Addr &PC) ++lookups; if (inst->isUncondCtrl()) { - DPRINTF(Fetch, "BranchPred: Unconditional control.\n"); + DPRINTF(Fetch, "BranchPred: [tid:%i] Unconditional control.\n", tid); pred_taken = true; } else { ++condPredicted; pred_taken = BPLookup(PC); - DPRINTF(Fetch, "BranchPred: Branch predictor predicted %i for PC %#x" - "\n", pred_taken, inst->readPC()); + DPRINTF(Fetch, "BranchPred: [tid:%i]: Branch predictor predicted %i " + "for PC %#x\n", + tid, pred_taken, inst->readPC()); } - PredictorHistory predict_record(inst->seqNum, PC, pred_taken); + PredictorHistory predict_record(inst->seqNum, PC, pred_taken, tid); // Now lookup in the BTB or RAS. if (pred_taken) { @@ -126,45 +133,48 @@ TwobitBPredUnit::predict(DynInstPtr &inst, Addr &PC) // If it's a function return call, then look up the address // in the RAS. - target = RAS.top(); + target = RAS[tid].top(); // Record the top entry of the RAS, and its index. predict_record.usedRAS = true; - predict_record.RASIndex = RAS.topIdx(); + predict_record.RASIndex = RAS[tid].topIdx(); predict_record.RASTarget = target; - RAS.pop(); + assert(predict_record.RASIndex < 16); - DPRINTF(Fetch, "BranchPred: Instruction %#x is a return, RAS " - "predicted target: %#x, RAS index: %i.\n", - inst->readPC(), target, predict_record.RASIndex); + RAS[tid].pop(); + + DPRINTF(Fetch, "BranchPred: [tid:%i]: Instruction %#x is a return, " + "RAS predicted target: %#x, RAS index: %i.\n", + tid, inst->readPC(), target, predict_record.RASIndex); } else { ++BTBLookups; if (inst->isCall()) { - RAS.push(PC+sizeof(MachInst)); + RAS[tid].push(PC + sizeof(MachInst)); // Record that it was a call so that the top RAS entry can // be popped off if the speculation is incorrect. 
predict_record.wasCall = true; - DPRINTF(Fetch, "BranchPred: Instruction %#x was a call, " - "adding %#x to the RAS.\n", - inst->readPC(), PC+sizeof(MachInst)); + DPRINTF(Fetch, "BranchPred: [tid:%i] Instruction %#x was a call" + ", adding %#x to the RAS.\n", + tid, inst->readPC(), PC + sizeof(MachInst)); } - if (BTB.valid(PC)) { + if (BTB.valid(PC, tid)) { ++BTBHits; //If it's anything else, use the BTB to get the target addr. - target = BTB.lookup(PC); + target = BTB.lookup(PC, tid); - DPRINTF(Fetch, "BranchPred: Instruction %#x predicted target " - "is %#x.\n", inst->readPC(), target); + DPRINTF(Fetch, "BranchPred: [tid:%i]: Instruction %#x predicted" + " target is %#x.\n", + tid, inst->readPC(), target); } else { - DPRINTF(Fetch, "BranchPred: BTB doesn't have a valid entry." - "\n"); + DPRINTF(Fetch, "BranchPred: [tid:%i]: BTB doesn't have a " + "valid entry.\n",tid); pred_taken = false; } @@ -180,97 +190,112 @@ TwobitBPredUnit::predict(DynInstPtr &inst, Addr &PC) inst->setPredTarg(PC); } - predHist.push_front(predict_record); + predHist[tid].push_front(predict_record); - assert(!predHist.empty()); + DPRINTF(Fetch, "[tid:%i] predHist.size(): %i\n", tid, predHist[tid].size()); return pred_taken; } template void -TwobitBPredUnit::update(const InstSeqNum &done_sn) +TwobitBPredUnit::update(const InstSeqNum &done_sn, unsigned tid) { - DPRINTF(Fetch, "BranchPred: Commiting branches until sequence number " - "%i.\n", done_sn); + DPRINTF(Fetch, "BranchPred: [tid:%i]: Commiting branches until sequence" + "number %lli.\n", tid, done_sn); - while (!predHist.empty() && predHist.back().seqNum <= done_sn) { - assert(!predHist.empty()); + while (!predHist[tid].empty() && + predHist[tid].back().seqNum <= done_sn) { + // Update the branch predictor with the correct results. + BP.update(predHist[tid].back().PC, + predHist[tid].back().predTaken); - // Update the branch predictor with the correct results of branches. 
- BP.update(predHist.back().PC, predHist.back().predTaken); - - predHist.pop_back(); + predHist[tid].pop_back(); } } template void -TwobitBPredUnit::squash(const InstSeqNum &squashed_sn) +TwobitBPredUnit::squash(const InstSeqNum &squashed_sn, unsigned tid) { - while (!predHist.empty() && predHist.front().seqNum > squashed_sn) { - if (predHist.front().usedRAS) { - DPRINTF(Fetch, "BranchPred: Restoring top of RAS to: %i, " - "target: %#x.\n", - predHist.front().RASIndex, - predHist.front().RASTarget); + History &pred_hist = predHist[tid]; - RAS.restore(predHist.front().RASIndex, - predHist.front().RASTarget); - } else if (predHist.front().wasCall) { - DPRINTF(Fetch, "BranchPred: Removing speculative entry added " - "to the RAS.\n"); + while (!pred_hist.empty() && + pred_hist.front().seqNum > squashed_sn) { + if (pred_hist.front().usedRAS) { + DPRINTF(Fetch, "BranchPred: [tid:%i]: Restoring top of RAS to: %i," + " target: %#x.\n", + tid, + pred_hist.front().RASIndex, + pred_hist.front().RASTarget); - RAS.pop(); + RAS[tid].restore(pred_hist.front().RASIndex, + pred_hist.front().RASTarget); + + } else if (pred_hist.front().wasCall) { + DPRINTF(Fetch, "BranchPred: [tid:%i]: Removing speculative entry added " + "to the RAS.\n",tid); + + RAS[tid].pop(); } - predHist.pop_front(); + pred_hist.pop_front(); } + } template void TwobitBPredUnit::squash(const InstSeqNum &squashed_sn, const Addr &corr_target, - const bool actually_taken) + const bool actually_taken, + unsigned tid) { // Now that we know that a branch was mispredicted, we need to undo // all the branches that have been seen up until this branch and // fix up everything. 
+ History &pred_hist = predHist[tid]; + ++condIncorrect; - DPRINTF(Fetch, "BranchPred: Squashing from sequence number %i, " + DPRINTF(Fetch, "BranchPred: [tid:%i]: Squashing from sequence number %i, " "setting target to %#x.\n", - squashed_sn, corr_target); + tid, squashed_sn, corr_target); - while (!predHist.empty() && predHist.front().seqNum > squashed_sn) { - - if (predHist.front().usedRAS) { - DPRINTF(Fetch, "BranchPred: Restoring top of RAS to: %i, " + while (!pred_hist.empty() && + pred_hist.front().seqNum > squashed_sn) { + if (pred_hist.front().usedRAS) { + DPRINTF(Fetch, "BranchPred: [tid:%i]: Restoring top of RAS to: %i, " "target: %#x.\n", - predHist.front().RASIndex, - predHist.front().RASTarget); + tid, + pred_hist.front().RASIndex, + pred_hist.front().RASTarget); - RAS.restore(predHist.front().RASIndex, - predHist.front().RASTarget); - } else if (predHist.front().wasCall) { - DPRINTF(Fetch, "BranchPred: Removing speculative entry added " - "to the RAS.\n"); + RAS[tid].restore(pred_hist.front().RASIndex, + pred_hist.front().RASTarget); + } else if (pred_hist.front().wasCall) { + DPRINTF(Fetch, "BranchPred: [tid:%i]: Removing speculative entry" + " added to the RAS.\n", tid); - RAS.pop(); + RAS[tid].pop(); } - predHist.pop_front(); + pred_hist.pop_front(); } - predHist.front().predTaken = actually_taken; + // If there's a squash due to a syscall, there may not be an entry + // corresponding to the squash. In that case, don't bother trying to + // fix up the entry. 
+ if (!pred_hist.empty()) { + pred_hist.front().predTaken = actually_taken; - if (predHist.front().usedRAS) { - ++RASIncorrect; + if (pred_hist.front().usedRAS) { + ++RASIncorrect; + } + + BP.update(pred_hist.front().PC, actually_taken); + + BTB.update(pred_hist.front().PC, corr_target, tid); } - - BP.update(predHist.front().PC, actually_taken); - - BTB.update(predHist.front().PC, corr_target); } diff --git a/cpu/o3/btb.cc b/cpu/o3/btb.cc index 2d39c3856..e084142d7 100644 --- a/cpu/o3/btb.cc +++ b/cpu/o3/btb.cc @@ -39,14 +39,15 @@ DefaultBTB::DefaultBTB(unsigned _numEntries, tagBits(_tagBits), instShiftAmt(_instShiftAmt) { - // @todo Check to make sure num_entries is valid (a power of 2) - DPRINTF(Fetch, "BTB: Creating BTB object.\n"); - btb = new BTBEntry[numEntries]; + if (!isPowerOf2(numEntries)) { + fatal("BTB entries is not a power of 2!"); + } - for (int i = 0; i < numEntries; ++i) - { + btb.resize(numEntries); + + for (int i = 0; i < numEntries; ++i) { btb[i].valid = false; } @@ -73,7 +74,7 @@ DefaultBTB::getTag(const Addr &inst_PC) } bool -DefaultBTB::valid(const Addr &inst_PC) +DefaultBTB::valid(const Addr &inst_PC, unsigned tid) { unsigned btb_idx = getIndex(inst_PC); @@ -81,7 +82,9 @@ DefaultBTB::valid(const Addr &inst_PC) assert(btb_idx < numEntries); - if (btb[btb_idx].valid && inst_tag == btb[btb_idx].tag) { + if (btb[btb_idx].valid + && inst_tag == btb[btb_idx].tag + && btb[btb_idx].tid == tid) { return true; } else { return false; @@ -92,7 +95,7 @@ DefaultBTB::valid(const Addr &inst_PC) // address is valid, and also the address. For now will just use addr = 0 to // represent invalid entry. 
Addr -DefaultBTB::lookup(const Addr &inst_PC) +DefaultBTB::lookup(const Addr &inst_PC, unsigned tid) { unsigned btb_idx = getIndex(inst_PC); @@ -100,7 +103,9 @@ DefaultBTB::lookup(const Addr &inst_PC) assert(btb_idx < numEntries); - if (btb[btb_idx].valid && inst_tag == btb[btb_idx].tag) { + if (btb[btb_idx].valid + && inst_tag == btb[btb_idx].tag + && btb[btb_idx].tid == tid) { return btb[btb_idx].target; } else { return 0; @@ -108,12 +113,13 @@ DefaultBTB::lookup(const Addr &inst_PC) } void -DefaultBTB::update(const Addr &inst_PC, const Addr &target) +DefaultBTB::update(const Addr &inst_PC, const Addr &target, unsigned tid) { unsigned btb_idx = getIndex(inst_PC); assert(btb_idx < numEntries); + btb[btb_idx].tid = tid; btb[btb_idx].valid = true; btb[btb_idx].target = target; btb[btb_idx].tag = getTag(inst_PC); diff --git a/cpu/o3/btb.hh b/cpu/o3/btb.hh index 77bdc32ea..aaa9945f7 100644 --- a/cpu/o3/btb.hh +++ b/cpu/o3/btb.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __CPU_O3_CPU_BTB_HH__ -#define __CPU_O3_CPU_BTB_HH__ +#ifndef __CPU_O3_BTB_HH__ +#define __CPU_O3_BTB_HH__ // For Addr type. #include "arch/isa_traits.hh" @@ -42,39 +42,84 @@ class DefaultBTB { } + /** The entry's tag. */ Addr tag; + + /** The entry's target. */ Addr target; + + /** The entry's thread id. */ + unsigned tid; + + /** Whether or not the entry is valid. */ bool valid; }; public: + /** Creates a BTB with the given number of entries, number of bits per + * tag, and instruction offset amount. + * @param numEntries Number of entries for the BTB. + * @param tagBits Number of bits for each tag in the BTB. + * @param instShiftAmt Offset amount for instructions to ignore alignment. + */ DefaultBTB(unsigned numEntries, unsigned tagBits, unsigned instShiftAmt); - Addr lookup(const Addr &inst_PC); + /** Looks up an address in the BTB. Must call valid() first on the address. + * @param inst_PC The address of the branch to look up. 
+ * @param tid The thread id. + * @return Returns the target of the branch. + */ + Addr lookup(const Addr &inst_PC, unsigned tid); - bool valid(const Addr &inst_PC); + /** Checks if a branch is in the BTB. + * @param inst_PC The address of the branch to look up. + * @param tid The thread id. + * @return Whether or not the branch exists in the BTB. + */ + bool valid(const Addr &inst_PC, unsigned tid); - void update(const Addr &inst_PC, const Addr &target_PC); + /** Updates the BTB with the target of a branch. + * @param inst_PC The address of the branch being updated. + * @param target_PC The target address of the branch. + * @param tid The thread id. + */ + void update(const Addr &inst_PC, const Addr &target_PC, + unsigned tid); private: + /** Returns the index into the BTB, based on the branch's PC. + * @param inst_PC The branch to look up. + * @return Returns the index into the BTB. + */ inline unsigned getIndex(const Addr &inst_PC); + /** Returns the tag bits of a given address. + * @param inst_PC The branch's address. + * @return Returns the tag bits. + */ inline Addr getTag(const Addr &inst_PC); - BTBEntry *btb; + /** The actual BTB. */ + std::vector btb; + /** The number of entries in the BTB. */ unsigned numEntries; + /** The index mask. */ unsigned idxMask; + /** The number of tag bits per entry. */ unsigned tagBits; + /** The tag mask. */ unsigned tagMask; + /** Number of bits to shift PC when calculating index. */ unsigned instShiftAmt; + /** Number of bits to shift PC when calculating tag. */ unsigned tagShiftAmt; }; -#endif // __CPU_O3_CPU_BTB_HH__ +#endif // __CPU_O3_BTB_HH__ diff --git a/cpu/o3/comm.hh b/cpu/o3/comm.hh index c74c77ddf..1a8f394ca 100644 --- a/cpu/o3/comm.hh +++ b/cpu/o3/comm.hh @@ -26,21 +26,35 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __CPU_O3_CPU_COMM_HH__ -#define __CPU_O3_CPU_COMM_HH__ +#ifndef __CPU_O3_COMM_HH__ +#define __CPU_O3_COMM_HH__ #include +#include "arch/faults.hh" #include "arch/isa_traits.hh" #include "cpu/inst_seq.hh" #include "sim/host.hh" -// Find better place to put this typedef. -// The impl might be the best place for this. +// Typedef for physical register index type. Although the Impl would be the +// most likely location for this, there are a few classes that need this +// typedef yet are not templated on the Impl. For now it will be defined here. typedef short int PhysRegIndex; template -struct SimpleFetchSimpleDecode { +struct DefaultFetchDefaultDecode { + typedef typename Impl::DynInstPtr DynInstPtr; + + int size; + + DynInstPtr insts[Impl::MaxWidth]; + Fault fetchFault; + InstSeqNum fetchFaultSN; + bool clearFetchFault; +}; + +template +struct DefaultDecodeDefaultRename { typedef typename Impl::DynInstPtr DynInstPtr; int size; @@ -49,7 +63,7 @@ struct SimpleFetchSimpleDecode { }; template -struct SimpleDecodeSimpleRename { +struct DefaultRenameDefaultIEW { typedef typename Impl::DynInstPtr DynInstPtr; int size; @@ -58,28 +72,21 @@ struct SimpleDecodeSimpleRename { }; template -struct SimpleRenameSimpleIEW { - typedef typename Impl::DynInstPtr DynInstPtr; - - int size; - - DynInstPtr insts[Impl::MaxWidth]; -}; - -template -struct SimpleIEWSimpleCommit { +struct DefaultIEWDefaultCommit { typedef typename Impl::DynInstPtr DynInstPtr; int size; DynInstPtr insts[Impl::MaxWidth]; - bool squash; - bool branchMispredict; - bool branchTaken; - uint64_t mispredPC; - uint64_t nextPC; - InstSeqNum squashedSeqNum; + bool squash[Impl::MaxThreads]; + bool branchMispredict[Impl::MaxThreads]; + bool branchTaken[Impl::MaxThreads]; + uint64_t mispredPC[Impl::MaxThreads]; + uint64_t nextPC[Impl::MaxThreads]; + InstSeqNum squashedSeqNum[Impl::MaxThreads]; + + bool includeSquashInst[Impl::MaxThreads]; }; template @@ -91,63 +98,77 @@ struct IssueStruct { DynInstPtr 
insts[Impl::MaxWidth]; }; +template struct TimeBufStruct { struct decodeComm { bool squash; - bool stall; bool predIncorrect; uint64_t branchAddr; InstSeqNum doneSeqNum; - // Might want to package this kind of branch stuff into a single + // @todo: Might want to package this kind of branch stuff into a single // struct as it is used pretty frequently. bool branchMispredict; bool branchTaken; uint64_t mispredPC; uint64_t nextPC; + + unsigned branchCount; }; - decodeComm decodeInfo; + decodeComm decodeInfo[Impl::MaxThreads]; // Rename can't actually tell anything to squash or send a new PC back // because it doesn't do anything along those lines. But maybe leave // these fields in here to keep the stages mostly orthagonal. struct renameComm { bool squash; - bool stall; uint64_t nextPC; }; - renameComm renameInfo; + renameComm renameInfo[Impl::MaxThreads]; struct iewComm { - bool stall; - // Also eventually include skid buffer space. + bool usedIQ; unsigned freeIQEntries; + bool usedLSQ; + unsigned freeLSQEntries; + + unsigned iqCount; + unsigned ldstqCount; + + unsigned dispatched; + unsigned dispatchedToLSQ; }; - iewComm iewInfo; + iewComm iewInfo[Impl::MaxThreads]; struct commitComm { - bool squash; - bool stall; + bool usedROB; unsigned freeROBEntries; + bool emptyROB; + + bool squash; + bool robSquashing; bool branchMispredict; bool branchTaken; uint64_t mispredPC; uint64_t nextPC; - bool robSquashing; - // Represents the instruction that has either been retired or // squashed. Similar to having a single bus that broadcasts the // retired or squashed sequence number. InstSeqNum doneSeqNum; + //Just in case we want to do a commit/squash on a cycle + //(necessary for multiple ROBs?) + bool commitInsts; + InstSeqNum squashSeqNum; + // Extra bit of information so that the LDSTQ only updates when it // needs to. 
bool commitIsLoad; @@ -155,9 +176,26 @@ struct TimeBufStruct { // Communication specifically to the IQ to tell the IQ that it can // schedule a non-speculative instruction. InstSeqNum nonSpecSeqNum; + + // Hack for now to send back an uncached access to the IEW stage. + typedef typename Impl::DynInstPtr DynInstPtr; + bool uncached; + DynInstPtr uncachedLoad; + + bool interruptPending; + bool clearInterrupt; }; - commitComm commitInfo; + commitComm commitInfo[Impl::MaxThreads]; + + bool decodeBlock[Impl::MaxThreads]; + bool decodeUnblock[Impl::MaxThreads]; + bool renameBlock[Impl::MaxThreads]; + bool renameUnblock[Impl::MaxThreads]; + bool iewBlock[Impl::MaxThreads]; + bool iewUnblock[Impl::MaxThreads]; + bool commitBlock[Impl::MaxThreads]; + bool commitUnblock[Impl::MaxThreads]; }; -#endif //__CPU_O3_CPU_COMM_HH__ +#endif //__CPU_O3_COMM_HH__ diff --git a/cpu/o3/commit.cc b/cpu/o3/commit.cc index cf33d7f8b..fe5e9c1de 100644 --- a/cpu/o3/commit.cc +++ b/cpu/o3/commit.cc @@ -30,4 +30,4 @@ #include "cpu/o3/alpha_impl.hh" #include "cpu/o3/commit_impl.hh" -template class SimpleCommit; +template class DefaultCommit; diff --git a/cpu/o3/commit.hh b/cpu/o3/commit.hh index 580c1a316..93b74ebb0 100644 --- a/cpu/o3/commit.hh +++ b/cpu/o3/commit.hh @@ -26,29 +26,42 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -// Todo: Maybe have a special method for handling interrupts/traps. -// -// Traps: Have IEW send a signal to commit saying that there's a trap to -// be handled. Have commit send the PC back to the fetch stage, along -// with the current commit PC. Fetch will directly access the IPR and save -// off all the proper stuff. Commit can send out a squash, or something -// close to it. -// Do the same for hwrei(). However, requires that commit be specifically -// built to support that kind of stuff. Probably not horrible to have -// commit support having the CPU tell it to squash the other stages and -// restart at a given address. 
The IPR register does become an issue. -// Probably not a big deal if the IPR stuff isn't cycle accurate. Can just -// have the original function handle writing to the IPR register. - -#ifndef __CPU_O3_CPU_SIMPLE_COMMIT_HH__ -#define __CPU_O3_CPU_SIMPLE_COMMIT_HH__ +#ifndef __CPU_O3_COMMIT_HH__ +#define __CPU_O3_COMMIT_HH__ +#include "arch/faults.hh" +#include "cpu/inst_seq.hh" #include "base/statistics.hh" #include "base/timebuf.hh" +#include "cpu/exetrace.hh" #include "mem/memory_interface.hh" +template +class O3ThreadState; + +/** + * DefaultCommit handles single threaded and SMT commit. Its width is specified + * by the parameters; each cycle it tries to commit that many instructions. The + * SMT policy decides which thread it tries to commit instructions from. Non- + * speculative instructions must reach the head of the ROB before they are + * ready to execute; once they reach the head, commit will broadcast the + * instruction's sequence number to the previous stages so that they can issue/ + * execute the instruction. Only one non-speculative instruction is handled per + * cycle. Commit is responsible for handling all back-end initiated redirects. + * It receives the redirect, and then broadcasts it to all stages, indicating + * the sequence number they should squash until, and any necessary branch mis- + * prediction information as well. It priortizes redirects by instruction's age, + * only broadcasting a redirect if it corresponds to an instruction that should + * currently be in the ROB. This is done by tracking the sequence number of the + * youngest instruction in the ROB, which gets updated to any squashing + * instruction's sequence number, and only broadcasting a redirect if it + * corresponds to an older instruction. Commit also supports multiple cycle + * squashing, to model a ROB that can only remove a certain number of + * instructions per cycle. Eventually traps and interrupts will most likely + * be handled here as well. 
+ */ template -class SimpleCommit +class DefaultCommit { public: // Typedefs from the Impl. @@ -57,62 +70,191 @@ class SimpleCommit typedef typename Impl::Params Params; typedef typename Impl::CPUPol CPUPol; + typedef typename CPUPol::RenameMap RenameMap; typedef typename CPUPol::ROB ROB; typedef typename CPUPol::TimeStruct TimeStruct; + typedef typename CPUPol::FetchStruct FetchStruct; typedef typename CPUPol::IEWStruct IEWStruct; typedef typename CPUPol::RenameStruct RenameStruct; - public: - // I don't believe commit can block, so it will only have two - // statuses for now. - // Actually if there's a cache access that needs to block (ie - // uncachable load or just a mem access in commit) then the stage - // may have to wait. - enum Status { + typedef typename CPUPol::IEW IEW; + + typedef O3ThreadState Thread; + + class TrapEvent : public Event { + private: + DefaultCommit *commit; + unsigned tid; + + public: + TrapEvent(DefaultCommit *_commit, unsigned _tid); + + void process(); + const char *description(); + }; + + /** Overall commit status. Used to determine if the CPU can deschedule + * itself due to a lack of activity. + */ + enum CommitStatus{ + Active, + Inactive + }; + + /** Individual thread status. */ + enum ThreadStatus { Running, Idle, ROBSquashing, - DcacheMissStall, - DcacheMissComplete + TrapPending, + FetchTrapPending + }; + + /** Commit policy for SMT mode. */ + enum CommitPolicy { + Aggressive, + RoundRobin, + OldestReady }; private: - Status _status; + /** Overall commit status. */ + CommitStatus _status; + /** Next commit status, to be set at the end of the cycle. */ + CommitStatus _nextStatus; + /** Per-thread status. */ + ThreadStatus commitStatus[Impl::MaxThreads]; + /** Commit policy used in SMT mode. */ + CommitPolicy commitPolicy; public: - SimpleCommit(Params ¶ms); + /** Construct a DefaultCommit with the given parameters. */ + DefaultCommit(Params *params); + /** Returns the name of the DefaultCommit. 
*/ + std::string name() const; + + /** Registers statistics. */ void regStats(); + /** Sets the CPU pointer. */ void setCPU(FullCPU *cpu_ptr); + /** Sets the list of threads. */ + void setThreads(std::vector &threads); + + /** Sets the main time buffer pointer, used for backwards communication. */ void setTimeBuffer(TimeBuffer *tb_ptr); + void setFetchQueue(TimeBuffer *fq_ptr); + + /** Sets the pointer to the queue coming from rename. */ void setRenameQueue(TimeBuffer *rq_ptr); + /** Sets the pointer to the queue coming from IEW. */ void setIEWQueue(TimeBuffer *iq_ptr); + /** Sets the poitner to the IEW stage. */ + void setIEWStage(IEW *iew_stage); + + /** The pointer to the IEW stage. Used solely to ensure that syscalls do + * not execute until all stores have written back. + */ + IEW *iewStage; + + /** Sets pointer to list of active threads. */ + void setActiveThreads(std::list *at_ptr); + + /** Sets pointer to the commited state rename map. */ + void setRenameMap(RenameMap rm_ptr[Impl::MaxThreads]); + + /** Sets pointer to the ROB. */ void setROB(ROB *rob_ptr); + /** Initializes stage by sending back the number of free entries. */ + void initStage(); + + /** Ticks the commit stage, which tries to commit instructions. */ void tick(); + /** Handles any squashes that are sent from IEW, and adds instructions + * to the ROB and tries to commit instructions. + */ void commit(); - private: + /** Returns the number of free ROB entries for a specific thread. */ + unsigned numROBFreeEntries(unsigned tid); + void generateXCEvent(unsigned tid); + + private: + /** Updates the overall status of commit with the nextStatus, and + * tell the CPU if commit is active/inactive. */ + void updateStatus(); + + /** Sets the next status based on threads' statuses, which becomes the + * current status at the end of the cycle. + */ + void setNextStatus(); + + /** Checks if the ROB is completed with squashing. 
This is for the case + * where the ROB can take multiple cycles to complete squashing. + */ + bool robDoneSquashing(); + + /** Returns if any of the threads have the number of ROB entries changed + * on this cycle. Used to determine if the number of free ROB entries needs + * to be sent back to previous stages. + */ + bool changedROBEntries(); + + void squashFromTrap(unsigned tid); + + void squashFromXC(unsigned tid); + + void squashInFlightInsts(unsigned tid); + + private: + /** Commits as many instructions as possible. */ void commitInsts(); + /** Tries to commit the head ROB instruction passed in. + * @param head_inst The instruction to be committed. + */ bool commitHead(DynInstPtr &head_inst, unsigned inst_num); + void generateTrapEvent(unsigned tid); + + /** Gets instructions from rename and inserts them into the ROB. */ void getInsts(); + /** Marks completed instructions using information sent from IEW. */ void markCompletedInsts(); - public: - uint64_t readCommitPC(); + /** Gets the thread to commit, based on the SMT policy. */ + int getCommittingThread(); - void setSquashing() { _status = ROBSquashing; } + /** Returns the thread ID to use based on a round robin policy. */ + int roundRobin(); + + /** Returns the thread ID to use based on an oldest instruction policy. */ + int oldestReady(); + + public: + /** Returns the PC of the head instruction of the ROB. */ + uint64_t readPC(); + + uint64_t readPC(unsigned tid) { return PC[tid]; } + + void setPC(uint64_t val, unsigned tid) { PC[tid] = val; } + + uint64_t readNextPC(unsigned tid) { return nextPC[tid]; } + + void setNextPC(uint64_t val, unsigned tid) { nextPC[tid] = val; } + + /** Sets that the ROB is currently squashing. */ + void setSquashing(unsigned tid); private: /** Time buffer interface. */ @@ -124,6 +266,10 @@ class SimpleCommit /** Wire to read information from IEW (for ROB). 
*/ typename TimeBuffer::wire robInfoFromIEW; + TimeBuffer *fetchQueue; + + typename TimeBuffer::wire fromFetch; + /** IEW instruction queue interface. */ TimeBuffer *iewQueue; @@ -136,22 +282,56 @@ class SimpleCommit /** Wire to read information from rename queue. */ typename TimeBuffer::wire fromRename; + public: /** ROB interface. */ ROB *rob; + private: /** Pointer to FullCPU. */ FullCPU *cpu; /** Memory interface. Used for d-cache accesses. */ MemInterface *dcacheInterface; + std::vector thread; + private: + Fault fetchFault; + InstSeqNum fetchFaultSN; + int fetchTrapWait; + /** Records that commit has written to the time buffer this cycle. Used for + * the CPU to determine if it can deschedule itself if there is no activity. + */ + bool wroteToTimeBuffer; + + /** Records if the number of ROB entries has changed this cycle. If it has, + * then the number of free entries must be re-broadcast. + */ + bool changedROBNumEntries[Impl::MaxThreads]; + + /** A counter of how many threads are currently squashing. */ + int squashCounter; + + /** Records if a thread has to squash this cycle due to a trap. */ + bool trapSquash[Impl::MaxThreads]; + + /** Records if a thread has to squash this cycle due to an XC write. */ + bool xcSquash[Impl::MaxThreads]; + + /** Priority List used for Commit Policy */ + std::list priority_list; + /** IEW to Commit delay, in ticks. */ unsigned iewToCommitDelay; + /** Commit to IEW delay, in ticks. */ + unsigned commitToIEWDelay; + /** Rename to ROB delay, in ticks. */ unsigned renameToROBDelay; + unsigned fetchToCommitDelay; + /** Rename width, in instructions. Used so ROB knows how many * instructions to get from the rename instruction queue. */ @@ -165,16 +345,53 @@ class SimpleCommit /** Commit width, in instructions. 
*/ unsigned commitWidth; - Stats::Scalar<> commitCommittedInsts; - Stats::Scalar<> commitSquashedInsts; - Stats::Scalar<> commitSquashEvents; - Stats::Scalar<> commitNonSpecStalls; - Stats::Scalar<> commitCommittedBranches; - Stats::Scalar<> commitCommittedLoads; - Stats::Scalar<> commitCommittedMemRefs; - Stats::Scalar<> branchMispredicts; + /** Number of Reorder Buffers */ + unsigned numRobs; - Stats::Distribution<> n_committed_dist; + /** Number of Active Threads */ + unsigned numThreads; + + Tick trapLatency; + + Tick fetchTrapLatency; + Tick fetchFaultTick; + + Addr PC[Impl::MaxThreads]; + + Addr nextPC[Impl::MaxThreads]; + + /** The sequence number of the youngest valid instruction in the ROB. */ + InstSeqNum youngestSeqNum[Impl::MaxThreads]; + + /** Pointer to the list of active threads. */ + std::list *activeThreads; + + /** Rename map interface. */ + RenameMap *renameMap[Impl::MaxThreads]; + + /** Stat for the total number of committed instructions. */ + Stats::Scalar<> commitCommittedInsts; + /** Stat for the total number of squashed instructions discarded by commit. + */ + Stats::Scalar<> commitSquashedInsts; + /** Stat for the total number of times commit is told to squash. + * @todo: Actually increment this stat. + */ + Stats::Scalar<> commitSquashEvents; + /** Stat for the total number of times commit has had to stall due to a non- + * speculative instruction reaching the head of the ROB. + */ + Stats::Scalar<> commitNonSpecStalls; + /** Stat for the total number of committed branches. */ + Stats::Scalar<> commitCommittedBranches; + /** Stat for the total number of committed loads. */ + Stats::Scalar<> commitCommittedLoads; + /** Stat for the total number of committed memory references. */ + Stats::Scalar<> commitCommittedMemRefs; + /** Stat for the total number of branch mispredicts that caused a squash. */ + Stats::Scalar<> branchMispredicts; + /** Distribution of the number of committed instructions each cycle. 
*/ + Stats::Distribution<> numCommittedDist; }; -#endif // __CPU_O3_CPU_SIMPLE_COMMIT_HH__ +#endif // __CPU_O3_COMMIT_HH__ diff --git a/cpu/o3/commit_impl.hh b/cpu/o3/commit_impl.hh index e289bc0c0..ef1ba9282 100644 --- a/cpu/o3/commit_impl.hh +++ b/cpu/o3/commit_impl.hh @@ -26,25 +26,112 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include +#include +#include +#include +#include +#include +#include + +#include "base/loader/symtab.hh" #include "base/timebuf.hh" -#include "cpu/o3/commit.hh" #include "cpu/exetrace.hh" +#include "cpu/o3/commit.hh" +#include "cpu/o3/thread_state.hh" + +using namespace std; template -SimpleCommit::SimpleCommit(Params ¶ms) - : dcacheInterface(params.dcacheInterface), - iewToCommitDelay(params.iewToCommitDelay), - renameToROBDelay(params.renameToROBDelay), - renameWidth(params.renameWidth), - iewWidth(params.executeWidth), - commitWidth(params.commitWidth) +DefaultCommit::TrapEvent::TrapEvent(DefaultCommit *_commit, + unsigned _tid) + : Event(&mainEventQueue, CPU_Tick_Pri), commit(_commit), tid(_tid) { - _status = Idle; + this->setFlags(Event::AutoDelete); } template void -SimpleCommit::regStats() +DefaultCommit::TrapEvent::process() +{ + commit->trapSquash[tid] = true; +} + +template +const char * +DefaultCommit::TrapEvent::description() +{ + return "Trap event"; +} + +template +DefaultCommit::DefaultCommit(Params *params) + : dcacheInterface(params->dcacheInterface), + squashCounter(0), + iewToCommitDelay(params->iewToCommitDelay), + commitToIEWDelay(params->commitToIEWDelay), + renameToROBDelay(params->renameToROBDelay), + fetchToCommitDelay(params->commitToFetchDelay), + renameWidth(params->renameWidth), + iewWidth(params->executeWidth), + commitWidth(params->commitWidth), + numThreads(params->numberOfThreads) +{ + _status = Active; + _nextStatus = Inactive; + string policy = params->smtCommitPolicy; + + //Convert string to lowercase + std::transform(policy.begin(), policy.end(), policy.begin(), + 
(int(*)(int)) tolower); + + //Assign commit policy + if (policy == "aggressive"){ + commitPolicy = Aggressive; + + DPRINTF(Commit,"Commit Policy set to Aggressive."); + } else if (policy == "roundrobin"){ + commitPolicy = RoundRobin; + + //Set-Up Priority List + for (int tid=0; tid < numThreads; tid++) { + priority_list.push_back(tid); + } + + DPRINTF(Commit,"Commit Policy set to Round Robin."); + } else if (policy == "oldestready"){ + commitPolicy = OldestReady; + + DPRINTF(Commit,"Commit Policy set to Oldest Ready."); + } else { + assert(0 && "Invalid SMT Commit Policy. Options Are: {Aggressive," + "RoundRobin,OldestReady}"); + } + + for (int i=0; i < numThreads; i++) { + commitStatus[i] = Idle; + changedROBNumEntries[i] = false; + trapSquash[i] = false; + xcSquash[i] = false; + } + + // Hardcoded trap latency. + trapLatency = 6; + fetchTrapLatency = 12; + fetchFaultTick = 0; + fetchTrapWait = 0; +} + +template +std::string +DefaultCommit::name() const +{ + return cpu->name() + ".commit"; +} + +template +void +DefaultCommit::regStats() { commitCommittedInsts .name(name() + ".commitCommittedInsts") @@ -79,7 +166,7 @@ SimpleCommit::regStats() .name(name() + ".branchMispredicts") .desc("The number of times a branch was mispredicted") .prereq(branchMispredicts); - n_committed_dist + numCommittedDist .init(0,commitWidth,1) .name(name() + ".COM:committed_per_cycle") .desc("Number of insts commited each cycle") @@ -89,15 +176,26 @@ SimpleCommit::regStats() template void -SimpleCommit::setCPU(FullCPU *cpu_ptr) +DefaultCommit::setCPU(FullCPU *cpu_ptr) { DPRINTF(Commit, "Commit: Setting CPU pointer.\n"); cpu = cpu_ptr; + + // Commit must broadcast the number of free entries it has at the start of + // the simulation, so it starts as active. 
+ cpu->activateStage(FullCPU::CommitIdx); } template void -SimpleCommit::setTimeBuffer(TimeBuffer *tb_ptr) +DefaultCommit::setThreads(vector &threads) +{ + thread = threads; +} + +template +void +DefaultCommit::setTimeBuffer(TimeBuffer *tb_ptr) { DPRINTF(Commit, "Commit: Setting time buffer pointer.\n"); timeBuffer = tb_ptr; @@ -111,7 +209,18 @@ SimpleCommit::setTimeBuffer(TimeBuffer *tb_ptr) template void -SimpleCommit::setRenameQueue(TimeBuffer *rq_ptr) +DefaultCommit::setFetchQueue(TimeBuffer *fq_ptr) +{ + DPRINTF(Commit, "Commit: Setting fetch queue pointer.\n"); + fetchQueue = fq_ptr; + + // Setup wire to get instructions from rename (for the ROB). + fromFetch = fetchQueue->getWire(-fetchToCommitDelay); +} + +template +void +DefaultCommit::setRenameQueue(TimeBuffer *rq_ptr) { DPRINTF(Commit, "Commit: Setting rename queue pointer.\n"); renameQueue = rq_ptr; @@ -122,7 +231,7 @@ SimpleCommit::setRenameQueue(TimeBuffer *rq_ptr) template void -SimpleCommit::setIEWQueue(TimeBuffer *iq_ptr) +DefaultCommit::setIEWQueue(TimeBuffer *iq_ptr) { DPRINTF(Commit, "Commit: Setting IEW queue pointer.\n"); iewQueue = iq_ptr; @@ -133,7 +242,33 @@ SimpleCommit::setIEWQueue(TimeBuffer *iq_ptr) template void -SimpleCommit::setROB(ROB *rob_ptr) +DefaultCommit::setIEWStage(IEW *iew_stage) +{ + iewStage = iew_stage; +} + +template +void +DefaultCommit::setActiveThreads(list *at_ptr) +{ + DPRINTF(Commit, "Commit: Setting active threads list pointer.\n"); + activeThreads = at_ptr; +} + +template +void +DefaultCommit::setRenameMap(RenameMap rm_ptr[]) +{ + DPRINTF(Commit, "Setting rename map pointers.\n"); + + for (int i=0; i < numThreads; i++) { + renameMap[i] = &rm_ptr[i]; + } +} + +template +void +DefaultCommit::setROB(ROB *rob_ptr) { DPRINTF(Commit, "Commit: Setting ROB pointer.\n"); rob = rob_ptr; @@ -141,41 +276,317 @@ SimpleCommit::setROB(ROB *rob_ptr) template void -SimpleCommit::tick() +DefaultCommit::initStage() { - // If the ROB is currently in its squash sequence, then 
continue - // to squash. In this case, commit does not do anything. Otherwise - // run commit. - if (_status == ROBSquashing) { - if (rob->isDoneSquashing()) { - _status = Running; - } else { - rob->doSquash(); + rob->setActiveThreads(activeThreads); + rob->resetEntries(); - // Send back sequence number of tail of ROB, so other stages - // can squash younger instructions. Note that really the only - // stage that this is important for is the IEW stage; other - // stages can just clear all their state as long as selective - // replay isn't used. - toIEW->commitInfo.doneSeqNum = rob->readTailSeqNum(); - toIEW->commitInfo.robSquashing = true; - } - } else { - commit(); + // Broadcast the number of free entries. + for (int i=0; i < numThreads; i++) { + toIEW->commitInfo[i].usedROB = true; + toIEW->commitInfo[i].freeROBEntries = rob->numFreeEntries(i); } - markCompletedInsts(); - - // Writeback number of free ROB entries here. - DPRINTF(Commit, "Commit: ROB has %d free entries.\n", - rob->numFreeEntries()); - toIEW->commitInfo.freeROBEntries = rob->numFreeEntries(); + cpu->activityThisCycle(); } template void -SimpleCommit::commit() +DefaultCommit::updateStatus() { + if (commitStatus[0] == TrapPending || + commitStatus[0] == FetchTrapPending) { + _nextStatus = Active; + } + + if (_nextStatus == Inactive && _status == Active) { + DPRINTF(Activity, "Deactivating stage.\n"); + cpu->deactivateStage(FullCPU::CommitIdx); + } else if (_nextStatus == Active && _status == Inactive) { + DPRINTF(Activity, "Activating stage.\n"); + cpu->activateStage(FullCPU::CommitIdx); + } + + _status = _nextStatus; + + // reset ROB changed variable + list::iterator threads = (*activeThreads).begin(); + while (threads != (*activeThreads).end()) { + unsigned tid = *threads++; + changedROBNumEntries[tid] = false; + } +} + +template +void +DefaultCommit::setNextStatus() +{ + int squashes = 0; + + list::iterator threads = (*activeThreads).begin(); + + while (threads != (*activeThreads).end()) { + 
unsigned tid = *threads++; + + if (commitStatus[tid] == ROBSquashing) { + squashes++; + } + } + + assert(squashes == squashCounter); + + // If commit is currently squashing, then it will have activity for the + // next cycle. Set its next status as active. + if (squashCounter) { + _nextStatus = Active; + } +} + +template +bool +DefaultCommit::changedROBEntries() +{ + list::iterator threads = (*activeThreads).begin(); + + while (threads != (*activeThreads).end()) { + unsigned tid = *threads++; + + if (changedROBNumEntries[tid]) { + return true; + } + } + + return false; +} + +template +unsigned +DefaultCommit::numROBFreeEntries(unsigned tid) +{ + return rob->numFreeEntries(tid); +} + +template +void +DefaultCommit::generateTrapEvent(unsigned tid) +{ + DPRINTF(Commit, "Generating trap event for [tid:%i]\n", tid); + + TrapEvent *trap = new TrapEvent(this, tid); + + trap->schedule(curTick + trapLatency); + + thread[tid]->trapPending = true; +} + +template +void +DefaultCommit::generateXCEvent(unsigned tid) +{ + DPRINTF(Commit, "Generating XC squash event for [tid:%i]\n", tid); + + xcSquash[tid] = true; +} + +template +void +DefaultCommit::squashFromTrap(unsigned tid) +{ + // If we want to include the squashing instruction in the squash, + // then use one older sequence number. + // Hopefully this doesn't mess things up. Basically I want to squash + // all instructions of this thread. + InstSeqNum squashed_inst = rob->isEmpty() ? + 0 : rob->readHeadInst(tid)->seqNum - 1; + + // All younger instructions will be squashed. Set the sequence + // number as the youngest instruction in the ROB (0 in this case. + // Hopefully nothing breaks.) + youngestSeqNum[tid] = 0; + + rob->squash(squashed_inst, tid); + changedROBNumEntries[tid] = true; + + // Send back the sequence number of the squashed instruction. + toIEW->commitInfo[tid].doneSeqNum = squashed_inst; + + // Send back the squash signal to tell stages that they should + // squash. 
+ toIEW->commitInfo[tid].squash = true; + + // Send back the rob squashing signal so other stages know that + // the ROB is in the process of squashing. + toIEW->commitInfo[tid].robSquashing = true; + + toIEW->commitInfo[tid].branchMispredict = false; + +// toIEW->commitInfo[tid].branchTaken = fromIEW->branchTaken[tid]; + + toIEW->commitInfo[tid].nextPC = PC[tid]; + + DPRINTF(Commit, "Squashing from trap, restarting at PC %#x\n", PC[tid]); + // Hopefully nobody tries to use the mispredPC becuase I said there + // wasn't a branch mispredict. +// toIEW->commitInfo[tid].mispredPC = fromIEW->mispredPC[tid]; + + thread[tid]->trapPending = false; + thread[tid]->inSyscall = false; + + trapSquash[tid] = false; + + // Not sure what to set this to... + commitStatus[tid] = ROBSquashing; + cpu->activityThisCycle(); + + ++squashCounter; +} + +template +void +DefaultCommit::squashFromXC(unsigned tid) +{ + // For now these are identical. In the future, the squash from trap + // might execute the trap prior to the squash. + + // If we want to include the squashing instruction in the squash, + // then use one older sequence number. + // Hopefully this doesn't mess things up. Basically I want to squash + // all instructions of this thread. + InstSeqNum squashed_inst = rob->isEmpty() ? + 0 : rob->readHeadInst(tid)->seqNum - 1;; + + // All younger instructions will be squashed. Set the sequence + // number as the youngest instruction in the ROB (0 in this case. + // Hopefully nothing breaks.) + youngestSeqNum[tid] = 0; + + rob->squash(squashed_inst, tid); + changedROBNumEntries[tid] = true; + + // Send back the sequence number of the squashed instruction. + toIEW->commitInfo[tid].doneSeqNum = squashed_inst; + + // Send back the squash signal to tell stages that they should + // squash. + toIEW->commitInfo[tid].squash = true; + + // Send back the rob squashing signal so other stages know that + // the ROB is in the process of squashing. 
+ toIEW->commitInfo[tid].robSquashing = true; + + toIEW->commitInfo[tid].branchMispredict = false; + +// toIEW->commitInfo[tid].branchTaken = fromIEW->branchTaken[tid]; + + toIEW->commitInfo[tid].nextPC = PC[tid]; + + DPRINTF(Commit, "Squashing from XC, restarting at PC %#x\n", PC[tid]); + // Hopefully nobody tries to use the mispredPC becuase I said there + // wasn't a branch mispredict. +// toIEW->commitInfo[tid].mispredPC = fromIEW->mispredPC[tid]; + + thread[tid]->inSyscall = false; + assert(!thread[tid]->trapPending); + // Not sure what to set this to... + commitStatus[tid] = ROBSquashing; + cpu->activityThisCycle(); + + xcSquash[tid] = false; + + ++squashCounter; +} + +template +void +DefaultCommit::squashInFlightInsts(unsigned tid) +{ + // @todo: Fix this hardcoded number. + for (int i = 0; i < -5; ++i) { + for (int j = 0; j < (*iewQueue)[i].size; ++j) { + DynInstPtr inst = (*iewQueue)[i].insts[j]; + if (inst->threadNumber == tid && + !inst->isSquashed()) { + inst->setSquashed(); + } + } + } +} + +template +void +DefaultCommit::tick() +{ + wroteToTimeBuffer = false; + _nextStatus = Inactive; + + // If the ROB is currently in its squash sequence, then continue + // to squash. In this case, commit does not do anything. Otherwise + // run commit. + list::iterator threads = (*activeThreads).begin(); + + // Maybe this should be dependent upon any of the commits actually + // squashing. 
+ while (threads != (*activeThreads).end()) { + unsigned tid = *threads++; + + if (commitStatus[tid] == ROBSquashing) { + + if (rob->isDoneSquashing(tid)) { + commitStatus[tid] = Running; + --squashCounter; + } else { + DPRINTF(Commit,"[tid:%u]: Still Squashing, cannot commit any" + "insts this cycle.\n", tid); + } + } + } + + commit(); + + markCompletedInsts(); + + threads = (*activeThreads).begin(); + + while (threads != (*activeThreads).end()) { + unsigned tid = *threads++; + + if (!rob->isEmpty(tid) && rob->readHeadInst(tid)->readyToCommit()) { + // The ROB has more instructions it can commit. Its next status + // will be active. + _nextStatus = Active; + + DynInstPtr inst = rob->readHeadInst(tid); + + DPRINTF(Commit,"[tid:%i]: Instruction [sn:%lli] PC %#x is head of" + " ROB and ready to commit\n", + tid, inst->seqNum, inst->readPC()); + + } else if (!rob->isEmpty(tid)) { + DynInstPtr inst = rob->readHeadInst(tid); + + DPRINTF(Commit,"[tid:%i]: Can't commit, Instruction [sn:%lli] PC " + "%#x is head of ROB and not ready\n", + tid, inst->seqNum, inst->readPC()); + } + + DPRINTF(Commit, "[tid:%i]: ROB has %d insts & %d free entries.\n", + tid, rob->countInsts(tid), rob->numFreeEntries(tid)); + } + + + if (wroteToTimeBuffer) { + DPRINTF(Activity,"Activity This Cycle.\n"); + cpu->activityThisCycle(); + } + + updateStatus(); +} + +template +void +DefaultCommit::commit() +{ + ////////////////////////////////////// // Check for interrupts ////////////////////////////////////// @@ -187,17 +598,44 @@ SimpleCommit::commit() // hwrei() is what resets the PC to the place where instruction execution // beings again. #if FULL_SYSTEM - if (//checkInterrupts && +//#if 0 + if (cpu->checkInterrupts && cpu->check_interrupts() && - !cpu->inPalMode(readCommitPC())) { - // Will need to squash all instructions currently in flight and have - // the interrupt handler restart at the last non-committed inst. - // Most of that can be handled through the trap() function. 
The - // processInterrupts() function really just checks for interrupts - // and then calls trap() if there is an interrupt present. + !cpu->inPalMode(readPC()) && + !trapSquash[0] && + !xcSquash[0]) { +// commitStatus[0] = TrapPending; + toIEW->commitInfo[0].interruptPending = true; + if (rob->isEmpty() && !iewStage->hasStoresToWB()) { + // Will need to squash all instructions currently in flight and have + // the interrupt handler restart at the last non-committed inst. + // Most of that can be handled through the trap() function. The + // processInterrupts() function really just checks for interrupts + // and then calls trap() if there is an interrupt present. - // CPU will handle implementation of the interrupt. - cpu->processInterrupts(); + // Not sure which thread should be the one to interrupt. For now + // always do thread 0. + assert(!thread[0]->inSyscall); + thread[0]->inSyscall = true; + + // CPU will handle implementation of the interrupt. + cpu->processInterrupts(); + + // Now squash or record that I need to squash this cycle. + commitStatus[0] = TrapPending; + + // Exit state update mode to avoid accidental updating. + thread[0]->inSyscall = false; + + // Generate trap squash event. + generateTrapEvent(0); + + toIEW->commitInfo[0].clearInterrupt = true; + + DPRINTF(Commit, "Interrupt detected.\n"); + } else { + DPRINTF(Commit, "Interrupt pending, waiting for ROB to empty.\n"); + } } #endif // FULL_SYSTEM @@ -205,43 +643,113 @@ SimpleCommit::commit() // Check for squash signal, handle that first //////////////////////////////////// - // Want to mainly check if the IEW stage is telling the ROB to squash. - // Should I also check if the commit stage is telling the ROB to squah? - // This might be necessary to keep the same timing between the IQ and - // the ROB... - if (fromIEW->squash) { - DPRINTF(Commit, "Commit: Squashing instructions in the ROB.\n"); + // Check if the IEW stage is telling the ROB to squash. 
+ list::iterator threads = (*activeThreads).begin(); - _status = ROBSquashing; + while (threads != (*activeThreads).end()) { + unsigned tid = *threads++; - InstSeqNum squashed_inst = fromIEW->squashedSeqNum; - - rob->squash(squashed_inst); - - // Send back the sequence number of the squashed instruction. - toIEW->commitInfo.doneSeqNum = squashed_inst; - - // Send back the squash signal to tell stages that they should squash. - toIEW->commitInfo.squash = true; - - // Send back the rob squashing signal so other stages know that the - // ROB is in the process of squashing. - toIEW->commitInfo.robSquashing = true; - - toIEW->commitInfo.branchMispredict = fromIEW->branchMispredict; - - toIEW->commitInfo.branchTaken = fromIEW->branchTaken; - - toIEW->commitInfo.nextPC = fromIEW->nextPC; - - toIEW->commitInfo.mispredPC = fromIEW->mispredPC; - - if (toIEW->commitInfo.branchMispredict) { - ++branchMispredicts; + if (fromFetch->fetchFault) { + // Record the fault. Wait until it's empty in the ROB. Then handle the trap. + fetchFault = fromFetch->fetchFault; + fetchFaultSN = fromFetch->fetchFaultSN; + fetchFaultTick = curTick + fetchTrapLatency; + commitStatus[0] = FetchTrapPending; + DPRINTF(Commit, "Fault from fetch recorded. Will trap if the " + "ROB empties without squashing the fault.\n"); + fetchTrapWait = 0; } + if (fromFetch->clearFetchFault) { + DPRINTF(Commit, "Received clear fetch fault signal\n"); + fetchTrapWait = 0; + if (commitStatus[0] == FetchTrapPending) { + DPRINTF(Commit, "Clearing fault from fetch\n"); + commitStatus[0] = Running; + } + } + + // Not sure which one takes priority. I think if we have + // both, that's a bad sign. + if (trapSquash[tid] == true) { + assert(!xcSquash[tid]); + squashFromTrap(tid); + } else if (xcSquash[tid] == true) { + squashFromXC(tid); + } + + // Squashed sequence number must be older than youngest valid + // instruction in the ROB. 
This prevents squashes from younger + // instructions overriding squashes from older instructions. + if (fromIEW->squash[tid] && + commitStatus[tid] != TrapPending && + fromIEW->squashedSeqNum[tid] <= youngestSeqNum[tid]) { + + DPRINTF(Commit, "[tid:%u]: Squashing instructions in the " + "ROB.\n", + tid); + + DPRINTF(Commit, "[tid:%i]: Squashing due to PC %#x [sn:%i]\n", + tid, + fromIEW->mispredPC[tid], + fromIEW->squashedSeqNum[tid]); + + DPRINTF(Commit, "[tid:%i]: Redirecting to PC %#x\n", + tid, + fromIEW->nextPC[tid]); + + commitStatus[tid] = ROBSquashing; + + ++squashCounter; + + // If we want to include the squashing instruction in the squash, + // then use one older sequence number. + InstSeqNum squashed_inst = fromIEW->squashedSeqNum[tid]; + + if (fromIEW->includeSquashInst[tid] == true) + squashed_inst--; + + // All younger instructions will be squashed. Set the sequence + // number as the youngest instruction in the ROB. + youngestSeqNum[tid] = squashed_inst; + + rob->squash(squashed_inst, tid); + changedROBNumEntries[tid] = true; + + // Send back the sequence number of the squashed instruction. + toIEW->commitInfo[tid].doneSeqNum = squashed_inst; + + // Send back the squash signal to tell stages that they should + // squash. + toIEW->commitInfo[tid].squash = true; + + // Send back the rob squashing signal so other stages know that + // the ROB is in the process of squashing. 
+ toIEW->commitInfo[tid].robSquashing = true; + + toIEW->commitInfo[tid].branchMispredict = + fromIEW->branchMispredict[tid]; + + toIEW->commitInfo[tid].branchTaken = + fromIEW->branchTaken[tid]; + + toIEW->commitInfo[tid].nextPC = fromIEW->nextPC[tid]; + + DPRINTF(Commit, "Squashing from IEW, restarting at PC %#x\n", + fromIEW->nextPC[tid]); + + toIEW->commitInfo[tid].mispredPC = + fromIEW->mispredPC[tid]; + + if (toIEW->commitInfo[tid].branchMispredict) { + ++branchMispredicts; + } + } + } - if (_status != ROBSquashing) { + setNextStatus(); + + if (squashCounter != numThreads) { // If we're not currently squashing, then get instructions. getInsts(); @@ -249,24 +757,29 @@ SimpleCommit::commit() commitInsts(); } - // If the ROB is empty, we can set this stage to idle. Use this - // in the future when the Idle status will actually be utilized. -#if 0 - if (rob->isEmpty()) { - DPRINTF(Commit, "Commit: ROB is empty. Status changed to idle.\n"); - _status = Idle; - // Schedule an event so that commit will actually wake up - // once something gets put in the ROB. + //Check for any activity + threads = (*activeThreads).begin(); + + while (threads != (*activeThreads).end()) { + unsigned tid = *threads++; + + if (changedROBNumEntries[tid]) { + toIEW->commitInfo[tid].usedROB = true; + toIEW->commitInfo[tid].freeROBEntries = rob->numFreeEntries(tid); + + if (rob->isEmpty(tid)) { + toIEW->commitInfo[tid].emptyROB = true; + } + + wroteToTimeBuffer = true; + changedROBNumEntries[tid] = false; + } } -#endif } -// Loop that goes through as many instructions in the ROB as possible and -// tries to commit them. The actual work for committing is done by the -// commitHead() function. template void -SimpleCommit::commitInsts() +DefaultCommit::commitInsts() { //////////////////////////////////// // Handle commit @@ -276,94 +789,193 @@ SimpleCommit::commitInsts() // Can't commit and squash things at the same time... 
//////////////////////////////////// - if (rob->isEmpty()) - return; - - DynInstPtr head_inst = rob->readHeadInst(); + DPRINTF(Commit, "Trying to commit instructions in the ROB.\n"); unsigned num_committed = 0; + DynInstPtr head_inst; +#if FULL_SYSTEM + if (commitStatus[0] == FetchTrapPending) { + DPRINTF(Commit, "Fault from fetch is pending.\n"); + if (rob->isEmpty()) { + fetchTrapWait++; + if (fetchTrapWait > 10000000) { + panic("Fetch trap has been pending for a long time!"); + } + if (fetchFaultTick > curTick) { + DPRINTF(Commit, "Not enough cycles since fault, fault will " + "happen on %lli\n", + fetchFaultTick); + cpu->activityThisCycle(); + return; + } else if (iewStage->hasStoresToWB()) { + DPRINTF(Commit, "IEW still has stores to WB. Waiting until " + "they are completed. fetchTrapWait:%i\n", + fetchTrapWait); + cpu->activityThisCycle(); + return; + } else if (cpu->inPalMode(readPC())) { + DPRINTF(Commit, "In pal mode right now. fetchTrapWait:%i\n", + fetchTrapWait); + return; + } + fetchTrapWait = 0; + DPRINTF(Commit, "ROB is empty, handling fetch trap.\n"); + + assert(!thread[0]->inSyscall); + + thread[0]->inSyscall = true; + + // Consider holding onto the trap and waiting until the trap event + // happens for this to be executed. + cpu->trap(fetchFault, 0); + + // Exit state update mode to avoid accidental updating. + thread[0]->inSyscall = false; + + commitStatus[0] = TrapPending; + // Set it up so that we squash next cycle + trapSquash[0] = true; + return; + } + } +#endif // Commit as many instructions as possible until the commit bandwidth // limit is reached, or it becomes impossible to commit any more. 
- while (!rob->isEmpty() && - head_inst->readyToCommit() && - num_committed < commitWidth) - { - DPRINTF(Commit, "Commit: Trying to commit head instruction.\n"); + while (num_committed < commitWidth) { + int commit_thread = getCommittingThread(); + + if (commit_thread == -1 || !rob->isHeadReady(commit_thread)) + break; + + head_inst = rob->readHeadInst(commit_thread); + + int tid = head_inst->threadNumber; + + assert(tid == commit_thread); + + DPRINTF(Commit, "Trying to commit head instruction, [sn:%i] [tid:%i]\n", + head_inst->seqNum, tid); // If the head instruction is squashed, it is ready to retire at any // time. However, we need to avoid updating any other state // incorrectly if it's already been squashed. if (head_inst->isSquashed()) { - DPRINTF(Commit, "Commit: Retiring squashed instruction from " + DPRINTF(Commit, "Retiring squashed instruction from " "ROB.\n"); // Tell ROB to retire head instruction. This retires the head // inst in the ROB without affecting any other stages. - rob->retireHead(); + rob->retireHead(commit_thread); ++commitSquashedInsts; + // Record that the number of ROB entries has changed. + changedROBNumEntries[tid] = true; } else { + PC[tid] = head_inst->readPC(); + nextPC[tid] = head_inst->readNextPC(); + // Increment the total number of non-speculative instructions // executed. // Hack for now: it really shouldn't happen until after the // commit is deemed to be successful, but this count is needed // for syscalls. - cpu->funcExeInst++; + thread[tid]->funcExeInst++; // Try to commit the head instruction. bool commit_success = commitHead(head_inst, num_committed); - // Update what instruction we are looking at if the commit worked. if (commit_success) { ++num_committed; - // Send back which instruction has been committed. - // @todo: Update this later when a wider pipeline is used. - // Hmm, can't really give a pointer here...perhaps the - // sequence number instead (copy). 
- toIEW->commitInfo.doneSeqNum = head_inst->seqNum; + // Record that the number of ROB entries has changed. + changedROBNumEntries[tid] = true; + + // Set the doneSeqNum to the youngest committed instruction. + toIEW->commitInfo[tid].doneSeqNum = head_inst->seqNum; ++commitCommittedInsts; - if (!head_inst->isNop()) { - cpu->instDone(); + // To match the old model, don't count nops and instruction + // prefetches towards the total commit count. + if (!head_inst->isNop() && !head_inst->isInstPrefetch()) { + cpu->instDone(tid); } + + PC[tid] = nextPC[tid]; +#if FULL_SYSTEM + int count = 0; + Addr oldpc; + do { + if (count == 0) + assert(!thread[tid]->inSyscall && !thread[tid]->trapPending); + oldpc = PC[tid]; + cpu->system->pcEventQueue.service( + thread[tid]->getXCProxy()); + count++; + } while (oldpc != PC[tid]); + if (count > 1) { + DPRINTF(Commit, "PC skip function event, stopping commit\n"); + break; + } +#endif } else { + DPRINTF(Commit, "Unable to commit head instruction PC:%#x " + "[tid:%i] [sn:%i].\n", + head_inst->readPC(), tid ,head_inst->seqNum); break; } } - - // Update the pointer to read the next instruction in the ROB. - head_inst = rob->readHeadInst(); } DPRINTF(CommitRate, "%i\n", num_committed); - n_committed_dist.sample(num_committed); + numCommittedDist.sample(num_committed); } template bool -SimpleCommit::commitHead(DynInstPtr &head_inst, unsigned inst_num) +DefaultCommit::commitHead(DynInstPtr &head_inst, unsigned inst_num) { // Make sure instruction is valid assert(head_inst); + int tid = head_inst->threadNumber; + // If the instruction is not executed yet, then it is a non-speculative // or store inst. Signal backwards that it should be executed. if (!head_inst->isExecuted()) { // Keep this number correct. We have not yet actually executed // and committed this instruction. 
- cpu->funcExeInst--; + thread[tid]->funcExeInst--; - if (head_inst->isNonSpeculative()) { - DPRINTF(Commit, "Commit: Encountered a store or non-speculative " - "instruction at the head of the ROB, PC %#x.\n", - head_inst->readPC()); + head_inst->reachedCommit = true; - toIEW->commitInfo.nonSpecSeqNum = head_inst->seqNum; + if (head_inst->isNonSpeculative() || + head_inst->isMemBarrier() || + head_inst->isWriteBarrier()) { +#if !FULL_SYSTEM + // Hack to make sure syscalls aren't executed until all stores + // write back their data. This direct communication shouldn't + // be used for anything other than this. + if (inst_num > 0 || iewStage->hasStoresToWB()) +#else + if ((head_inst->isMemBarrier() || head_inst->isWriteBarrier() || + head_inst->isQuiesce()) && + iewStage->hasStoresToWB()) +#endif + { + DPRINTF(Commit, "Waiting for all stores to writeback.\n"); + return false; + } + + DPRINTF(Commit, "Encountered a barrier or non-speculative " + "instruction [sn:%lli] at the head of the ROB, PC %#x.\n", + head_inst->seqNum, head_inst->readPC()); + + // Send back the non-speculative instruction's sequence number. + toIEW->commitInfo[tid].nonSpecSeqNum = head_inst->seqNum; // Change the instruction so it won't try to commit again until // it is executed. @@ -371,25 +983,34 @@ SimpleCommit::commitHead(DynInstPtr &head_inst, unsigned inst_num) ++commitNonSpecStalls; + return false; + } else if (head_inst->isLoad()) { + DPRINTF(Commit, "[sn:%lli]: Uncached load, PC %#x.\n", + head_inst->seqNum, head_inst->readPC()); + + // Send back the non-speculative instruction's sequence + // number. Maybe just tell the lsq to re-execute the load. 
+ toIEW->commitInfo[tid].nonSpecSeqNum = head_inst->seqNum; + toIEW->commitInfo[tid].uncached = true; + toIEW->commitInfo[tid].uncachedLoad = head_inst; + + head_inst->clearCanCommit(); + return false; } else { - panic("Commit: Trying to commit un-executed instruction " + panic("Trying to commit un-executed instruction " "of unknown type!\n"); } } // Now check if it's one of the special trap or barrier or // serializing instructions. - if (head_inst->isThreadSync() || - head_inst->isSerializing() || - head_inst->isMemBarrier() || - head_inst->isWriteBarrier() ) + if (head_inst->isThreadSync())/* || +// head_inst->isMemBarrier() || +head_inst->isWriteBarrier())*/ { - // Not handled for now. Mem barriers and write barriers are safe - // to simply let commit as memory accesses only happen once they - // reach the head of commit. Not sure about the other two. - panic("Serializing or barrier instructions" - " are not handled yet.\n"); + // Not handled for now. + panic("Barrier instructions are not handled yet.\n"); } // Check if the instruction caused a fault. If so, trap. @@ -398,7 +1019,32 @@ SimpleCommit::commitHead(DynInstPtr &head_inst, unsigned inst_num) if (inst_fault != NoFault) { if (!head_inst->isNop()) { #if FULL_SYSTEM - cpu->trap(inst_fault); + DPRINTF(Commit, "Inst [sn:%lli] PC %#x has a fault\n", + head_inst->seqNum, head_inst->readPC()); + + assert(!thread[tid]->inSyscall); + + thread[tid]->inSyscall = true; + + // Hack for now; DTB will sometimes need the machine instruction + // for when faults happen. So we will set it here, prior to the + // DTB possibly needing it for this translation. + thread[tid]->setInst( + static_cast(head_inst->staticInst->machInst)); + + // Consider holding onto the trap and waiting until the trap event + // happens for this to be executed. + cpu->trap(inst_fault, tid); + + // Exit state update mode to avoid accidental updating. 
+ thread[tid]->inSyscall = false; + + commitStatus[tid] = TrapPending; + + // Generate trap squash event. + generateTrapEvent(tid); + + return false; #else // !FULL_SYSTEM panic("fault (%d) detected @ PC %08p", inst_fault, head_inst->PC); @@ -409,37 +1055,32 @@ SimpleCommit::commitHead(DynInstPtr &head_inst, unsigned inst_num) // Check if we're really ready to commit. If not then return false. // I'm pretty sure all instructions should be able to commit if they've // reached this far. For now leave this in as a check. - if (!rob->isHeadReady()) { - panic("Commit: Unable to commit head instruction!\n"); + if (!rob->isHeadReady(tid)) { + panic("Unable to commit head instruction!\n"); return false; } - // If it's a branch, then send back branch prediction update info - // to the fetch stage. - // This should be handled in the iew stage if a mispredict happens... - if (head_inst->isControl()) { - -#if 0 - toIEW->nextPC = head_inst->readPC(); - //Maybe switch over to BTB incorrect. - toIEW->btbMissed = head_inst->btbMiss(); - toIEW->target = head_inst->nextPC; - //Maybe also include global history information. - //This simple version will have no branch prediction however. -#endif - ++commitCommittedBranches; } // Now that the instruction is going to be committed, finalize its // trace data. if (head_inst->traceData) { + head_inst->traceData->setFetchSeq(head_inst->seqNum); + head_inst->traceData->setCPSeq(thread[tid]->numInst); head_inst->traceData->finalize(); + head_inst->traceData = NULL; } - //Finally clear the head ROB entry. - rob->retireHead(); + // Update the commit rename map + for (int i = 0; i < head_inst->numDestRegs(); i++) { + renameMap[tid]->setEntry(head_inst->destRegIdx(i), + head_inst->renamedDestRegIdx(i)); + } + + // Finally clear the head ROB entry. + rob->retireHead(tid); // Return true to indicate that we have committed an instruction. 
return true; @@ -447,37 +1088,45 @@ SimpleCommit::commitHead(DynInstPtr &head_inst, unsigned inst_num) template void -SimpleCommit::getInsts() +DefaultCommit::getInsts() { ////////////////////////////////////// // Handle ROB functions ////////////////////////////////////// - // Read any issued instructions and place them into the ROB. Do this + // Read any renamed instructions and place them into the ROB. Do this // prior to squashing to avoid having instructions in the ROB that // don't get squashed properly. int insts_to_process = min((int)renameWidth, fromRename->size); - for (int inst_num = 0; - inst_num < insts_to_process; - ++inst_num) + for (int inst_num = 0; inst_num < insts_to_process; ++inst_num) { - if (!fromRename->insts[inst_num]->isSquashed()) { - DPRINTF(Commit, "Commit: Inserting PC %#x into ROB.\n", - fromRename->insts[inst_num]->readPC()); - rob->insertInst(fromRename->insts[inst_num]); + DynInstPtr inst = fromRename->insts[inst_num]; + int tid = inst->threadNumber; + + if (!inst->isSquashed() && + commitStatus[tid] != ROBSquashing) { + changedROBNumEntries[tid] = true; + + DPRINTF(Commit, "Inserting PC %#x [sn:%i] [tid:%i] into ROB.\n", + inst->readPC(), inst->seqNum, tid); + + rob->insertInst(inst); + + assert(rob->getThreadEntries(tid) <= rob->getMaxEntries(tid)); + + youngestSeqNum[tid] = inst->seqNum; } else { - DPRINTF(Commit, "Commit: Instruction %i PC %#x was " + DPRINTF(Commit, "Instruction PC %#x [sn:%i] [tid:%i] was " "squashed, skipping.\n", - fromRename->insts[inst_num]->seqNum, - fromRename->insts[inst_num]->readPC()); + inst->readPC(), inst->seqNum, tid); } } } template void -SimpleCommit::markCompletedInsts() +DefaultCommit::markCompletedInsts() { // Grab completed insts out of the IEW instruction queue, and mark // instructions completed within the ROB. 
@@ -485,18 +1134,159 @@ SimpleCommit::markCompletedInsts() inst_num < fromIEW->size && fromIEW->insts[inst_num]; ++inst_num) { - DPRINTF(Commit, "Commit: Marking PC %#x, SN %i ready within ROB.\n", - fromIEW->insts[inst_num]->readPC(), - fromIEW->insts[inst_num]->seqNum); + if (!fromIEW->insts[inst_num]->isSquashed()) { + DPRINTF(Commit, "[tid:%i]: Marking PC %#x, SN %i ready within ROB.\n", + fromIEW->insts[inst_num]->threadNumber, + fromIEW->insts[inst_num]->readPC(), + fromIEW->insts[inst_num]->seqNum); - // Mark the instruction as ready to commit. - fromIEW->insts[inst_num]->setCanCommit(); + // Mark the instruction as ready to commit. + fromIEW->insts[inst_num]->setCanCommit(); + } } } template uint64_t -SimpleCommit::readCommitPC() +DefaultCommit::readPC() { - return rob->readHeadPC(); + // @todo: Fix this single thread hack. + return PC[0]; +} + +template +void +DefaultCommit::setSquashing(unsigned tid) +{ + if (_status == Inactive) { + DPRINTF(Activity, "Activating stage.\n"); + _status = Active; + cpu->activateStage(FullCPU::CommitIdx); + } + + if (commitStatus[tid] != ROBSquashing) { + commitStatus[tid] = ROBSquashing; + ++squashCounter; + } +} + +template +bool +DefaultCommit::robDoneSquashing() +{ + list::iterator threads = (*activeThreads).begin(); + + while (threads != (*activeThreads).end()) { + unsigned tid = *threads++; + + if (!rob->isDoneSquashing(tid)) + return false; + } + + return true; +} + +//////////////////////////////////////// +// // +// SMT COMMIT POLICY MAITAINED HERE // +// // +//////////////////////////////////////// +template +int +DefaultCommit::getCommittingThread() +{ + if (numThreads > 1) { + switch (commitPolicy) { + + case Aggressive: + //If Policy is Aggressive, commit will call + //this function multiple times per + //cycle + return oldestReady(); + + case RoundRobin: + return roundRobin(); + + case OldestReady: + return oldestReady(); + + default: + return -1; + } + } else { + int tid = (*activeThreads).front(); + + if 
(commitStatus[tid] == Running || + commitStatus[tid] == Idle || + commitStatus[tid] == FetchTrapPending) { + return tid; + } else { + return -1; + } + } +} + +template +int +DefaultCommit::roundRobin() +{ + list::iterator pri_iter = priority_list.begin(); + list::iterator end = priority_list.end(); + + while (pri_iter != end) { + unsigned tid = *pri_iter; + + if (commitStatus[tid] == Running || + commitStatus[tid] == Idle) { + + if (rob->isHeadReady(tid)) { + priority_list.erase(pri_iter); + priority_list.push_back(tid); + + return tid; + } + } + + pri_iter++; + } + + return -1; +} + +template +int +DefaultCommit::oldestReady() +{ + unsigned oldest = 0; + bool first = true; + + list::iterator threads = (*activeThreads).begin(); + + while (threads != (*activeThreads).end()) { + unsigned tid = *threads++; + + if (!rob->isEmpty(tid) && + (commitStatus[tid] == Running || + commitStatus[tid] == Idle || + commitStatus[tid] == FetchTrapPending)) { + + if (rob->isHeadReady(tid)) { + + DynInstPtr head_inst = rob->readHeadInst(tid); + + if (first) { + oldest = tid; + first = false; + } else if (head_inst->seqNum < oldest) { + oldest = tid; + } + } + } + } + + if (!first) { + return oldest; + } else { + return -1; + } } diff --git a/cpu/o3/cpu.cc b/cpu/o3/cpu.cc index 62d68bb33..d322037bc 100644 --- a/cpu/o3/cpu.cc +++ b/cpu/o3/cpu.cc @@ -41,13 +41,21 @@ #include "cpu/o3/alpha_impl.hh" #include "cpu/o3/cpu.hh" +#include "sim/stat_control.hh" + using namespace std; -BaseFullCPU::BaseFullCPU(Params ¶ms) - : BaseCPU(¶ms), cpu_id(0) +BaseFullCPU::BaseFullCPU(Params *params) + : BaseCPU(params), cpu_id(0) { } +void +BaseFullCPU::regStats() +{ + BaseCPU::regStats(); +} + template FullO3CPU::TickEvent::TickEvent(FullO3CPU *c) : Event(&mainEventQueue, CPU_Tick_Pri), cpu(c) @@ -70,96 +78,76 @@ FullO3CPU::TickEvent::description() //Call constructor to all the pipeline stages here template -FullO3CPU::FullO3CPU(Params ¶ms) -#if FULL_SYSTEM +FullO3CPU::FullO3CPU(Params *params) : 
BaseFullCPU(params), -#else - : BaseFullCPU(params), -#endif // FULL_SYSTEM tickEvent(this), + removeInstsThisCycle(false), fetch(params), decode(params), rename(params), iew(params), commit(params), - regFile(params.numPhysIntRegs, params.numPhysFloatRegs), + regFile(params->numPhysIntRegs, params->numPhysFloatRegs), - freeList(TheISA::NumIntRegs, params.numPhysIntRegs, - TheISA::NumFloatRegs, params.numPhysFloatRegs), + freeList(params->numberOfThreads,//number of activeThreads + TheISA::NumIntRegs, params->numPhysIntRegs, + TheISA::NumFloatRegs, params->numPhysFloatRegs), - renameMap(TheISA::NumIntRegs, params.numPhysIntRegs, - TheISA::NumFloatRegs, params.numPhysFloatRegs, - TheISA::NumMiscRegs, - TheISA::ZeroReg, - TheISA::ZeroReg + TheISA::NumIntRegs), + rob(params->numROBEntries, params->squashWidth, + params->smtROBPolicy, params->smtROBThreshold, + params->numberOfThreads), - rob(params.numROBEntries, params.squashWidth), + scoreboard(params->numberOfThreads,//number of activeThreads + TheISA::NumIntRegs, params->numPhysIntRegs, + TheISA::NumFloatRegs, params->numPhysFloatRegs, + TheISA::NumMiscRegs * number_of_threads, + TheISA::ZeroReg), // What to pass to these time buffers? // For now just have these time buffers be pretty big. + // @todo: Make these time buffer sizes parameters. timeBuffer(5, 5), fetchQueue(5, 5), decodeQueue(5, 5), renameQueue(5, 5), iewQueue(5, 5), - - cpuXC(NULL), + activityBuffer(5, 0), + activityCount(0), globalSeqNum(1), #if FULL_SYSTEM - system(params.system), + system(params->system), memCtrl(system->memctrl), physmem(system->physmem), - itb(params.itb), - dtb(params.dtb), - mem(params.mem), + mem(params->mem), #else - // Hardcoded for a single thread!! 
- mem(params.workload[0]->getMemory()), + pTable(params->pTable), #endif // FULL_SYSTEM - icacheInterface(params.icacheInterface), - dcacheInterface(params.dcacheInterface), - deferRegistration(params.defReg), - numInsts(0), - funcExeInst(0) + icacheInterface(params->icacheInterface), + dcacheInterface(params->dcacheInterface), + deferRegistration(params->deferRegistration) { _status = Idle; #if !FULL_SYSTEM - thread.resize(this->number_of_threads); + thread.resize(number_of_threads); + tids.resize(number_of_threads); #endif - for (int i = 0; i < this->number_of_threads; ++i) { -#if FULL_SYSTEM - assert(i == 0); - thread[i] = new CPUExecContext(this, 0, system, itb, dtb, mem); - system->execContexts[i] = thread[i]->getProxy(); - - execContexts.push_back(system->execContexts[i]); -#else - if (i < params.workload.size()) { - DPRINTF(FullCPU, "FullCPU: Workload[%i]'s starting PC is %#x, " - "process is %#x", - i, params.workload[i]->prog_entry, thread[i]); - thread[i] = new CPUExecContext(this, i, params.workload[i], i); - } - assert(params.workload[i]->getMemory() != NULL); - assert(mem != NULL); - execContexts.push_back(thread[i]->getProxy()); -#endif // !FULL_SYSTEM - } - - // Note that this is a hack so that my code which still uses xc-> will - // still work. I should remove this eventually - cpuXC = thread[0]; - // The stages also need their CPU pointer setup. However this must be // done at the upper level CPU because they have pointers to the upper // level CPU, and not this FullO3CPU. + // Set up Pointers to the activeThreads list for each stage + fetch.setActiveThreads(&activeThreads); + decode.setActiveThreads(&activeThreads); + rename.setActiveThreads(&activeThreads); + iew.setActiveThreads(&activeThreads); + commit.setActiveThreads(&activeThreads); + // Give each of the stages the time buffer they will use. 
fetch.setTimeBuffer(&timeBuffer); decode.setTimeBuffer(&timeBuffer); @@ -170,6 +158,7 @@ FullO3CPU::FullO3CPU(Params ¶ms) // Also setup each of the stages' queues. fetch.setFetchQueue(&fetchQueue); decode.setFetchQueue(&fetchQueue); + commit.setFetchQueue(&fetchQueue); decode.setDecodeQueue(&decodeQueue); rename.setDecodeQueue(&decodeQueue); rename.setRenameQueue(&renameQueue); @@ -178,16 +167,91 @@ FullO3CPU::FullO3CPU(Params ¶ms) commit.setIEWQueue(&iewQueue); commit.setRenameQueue(&renameQueue); - // Setup the rename map for whichever stages need it. - rename.setRenameMap(&renameMap); - iew.setRenameMap(&renameMap); + commit.setIEWStage(&iew); + rename.setIEWStage(&iew); + rename.setCommitStage(&commit); - // Setup the free list for whichever stages need it. + //Make Sure That this a Valid Architeture + //@todo: move this up in constructor + numThreads = number_of_threads; + +#if !FULL_SYSTEM + int activeThreads = params->workload.size(); +#else + int activeThreads = 1; +#endif + + assert(params->numPhysIntRegs >= numThreads * TheISA::NumIntRegs); + assert(params->numPhysFloatRegs >= numThreads * TheISA::NumFloatRegs); + + rename.setScoreboard(&scoreboard); + iew.setScoreboard(&scoreboard); + + // Setup the rename map for whichever stages need it. + PhysRegIndex lreg_idx = 0; + PhysRegIndex freg_idx = params->numPhysIntRegs; //Index to 1 after int regs + + for (int tid=0; tid < numThreads; tid++) { + bool bindRegs = (tid <= activeThreads - 1); + + commitRenameMap[tid].init(TheISA::NumIntRegs, + params->numPhysIntRegs, + lreg_idx, //Index for Logical. Regs + + TheISA::NumFloatRegs, + params->numPhysFloatRegs, + freg_idx, //Index for Float Regs + + TheISA::NumMiscRegs, + + TheISA::ZeroReg, + TheISA::ZeroReg, + + tid, + false); + + renameMap[tid].init(TheISA::NumIntRegs, + params->numPhysIntRegs, + lreg_idx, //Index for Logical. 
Regs + + TheISA::NumFloatRegs, + params->numPhysFloatRegs, + freg_idx, //Index for Float Regs + + TheISA::NumMiscRegs, + + TheISA::ZeroReg, + TheISA::ZeroReg, + + tid, + bindRegs); + } + + rename.setRenameMap(renameMap); + commit.setRenameMap(commitRenameMap); + + // Give renameMap & rename stage access to the freeList; + for (int i=0; i < numThreads; i++) { + renameMap[i].setFreeList(&freeList); + } rename.setFreeList(&freeList); - renameMap.setFreeList(&freeList); + + // Setup the page table for whichever stages need it. +#if !FULL_SYSTEM + fetch.setPageTable(pTable); + iew.setPageTable(pTable); +#endif // Setup the ROB for whichever stages need it. commit.setROB(&rob); + + lastRunningCycle = curTick; + + for (int i = 0; i < NumStages; ++i) { + stageActive[i] = false; + } + + contextSwitch = false; } template @@ -199,7 +263,58 @@ template void FullO3CPU::fullCPURegStats() { + BaseFullCPU::regStats(); + // Register any of the FullCPU's stats here. + timesIdled + .name(name() + ".timesIdled") + .desc("Number of times that the entire CPU went into an idle state and" + " unscheduled itself") + .prereq(timesIdled); + + idleCycles + .name(name() + ".idleCycles") + .desc("Total number of cycles that the CPU has spent unscheduled due " + "to idling") + .prereq(idleCycles); + + // Number of Instructions simulated + // -------------------------------- + // Should probably be in Base CPU but need templated + // MaxThreads so put in here instead + committedInsts + .init(numThreads) + .name(name() + ".committedInsts") + .desc("Number of Instructions Simulated"); + + totalCommittedInsts + .name(name() + ".committedInsts_total") + .desc("Number of Instructions Simulated"); + + cpi + .name(name() + ".cpi") + .desc("CPI: Cycles Per Instruction") + .precision(6); + cpi = simTicks / committedInsts; + + totalCpi + .name(name() + ".cpi_total") + .desc("CPI: Total CPI of All Threads") + .precision(6); + totalCpi = simTicks / totalCommittedInsts; + + ipc + .name(name() + ".ipc") + 
.desc("IPC: Instructions Per Cycle") + .precision(6); + ipc = committedInsts / simTicks; + + totalIpc + .name(name() + ".ipc_total") + .desc("IPC: Total IPC of All Threads") + .precision(6); + totalIpc = totalCommittedInsts / simTicks; + } template @@ -208,9 +323,11 @@ FullO3CPU::tick() { DPRINTF(FullCPU, "\n\nFullCPU: Ticking main, FullO3CPU.\n"); - //Tick each of the stages if they're actually running. - //Will want to figure out a way to unschedule itself if they're all - //going to be idle for a long time. + ++numCycles; + + activity = false; + + //Tick each of the stages fetch.tick(); decode.tick(); @@ -221,7 +338,11 @@ FullO3CPU::tick() commit.tick(); - // Now advance the time buffers, unless the stage is stalled. +#if !FULL_SYSTEM + doContextSwitch(); +#endif + + // Now advance the time buffers timeBuffer.advance(); fetchQueue.advance(); @@ -229,81 +350,310 @@ FullO3CPU::tick() renameQueue.advance(); iewQueue.advance(); - if (_status == Running && !tickEvent.scheduled()) + advanceActivityBuffer(); + + if (removeInstsThisCycle) { + cleanUpRemovedInsts(); + } + + if (activityCount && !tickEvent.scheduled()) { tickEvent.schedule(curTick + 1); + } + +#if !FULL_SYSTEM + updateThreadPriority(); +#endif + } template void FullO3CPU::init() { - if(!deferRegistration) - { - this->registerExecContexts(); + if (deferRegistration) { + return; + } + // Set inSyscall so that the CPU doesn't squash when initially + // setting up registers. + for (int i = 0; i < number_of_threads; ++i) + thread[i]->inSyscall = true; + + registerExecContexts(); + + // Need to do a copy of the xc->regs into the CPU's regfile so + // that it can start properly. + + for (int tid=0; tid < number_of_threads; tid++) { // Need to do a copy of the xc->regs into the CPU's regfile so // that it can start properly. 
#if FULL_SYSTEM - ExecContext *src_xc = system->execContexts[0]; - TheISA::initCPU(src_xc, src_xc->readCpuId()); + ExecContext *src_xc = system->execContexts[tid]; #else - ExecContext *src_xc = thread[0]->getProxy(); + ExecContext *src_xc = thread[tid]->getXCProxy(); #endif - // First loop through the integer registers. - for (int i = 0; i < TheISA::NumIntRegs; ++i) - { - regFile.intRegFile[i] = src_xc->readIntReg(i); + // Threads start in the Suspended State + if (src_xc->status() != ExecContext::Suspended) { + continue; } - // Then loop through the floating point registers. - for (int i = 0; i < TheISA::NumFloatRegs; ++i) - { - regFile.floatRegFile[i].d = src_xc->readFloatRegDouble(i); - regFile.floatRegFile[i].q = src_xc->readFloatRegInt(i); - } -/* - // Then loop through the misc registers. - regFile.miscRegs.fpcr = src_xc->regs.miscRegs.fpcr; - regFile.miscRegs.uniq = src_xc->regs.miscRegs.uniq; - regFile.miscRegs.lock_flag = src_xc->regs.miscRegs.lock_flag; - regFile.miscRegs.lock_addr = src_xc->regs.miscRegs.lock_addr; -*/ - // Then finally set the PC and the next PC. - regFile.pc = src_xc->readPC(); - regFile.npc = src_xc->readNextPC(); +#if FULL_SYSTEM + TheISA::initCPU(src_xc, src_xc->readCpuId()); +#endif + } + + // Clear inSyscall. + for (int i = 0; i < number_of_threads; ++i) + thread[i]->inSyscall = false; + + // Probably should just make a call to all the stages to init stage, + // regardless of whether or not they need it. Keeps it more independent. + fetch.initStage(); + iew.initStage(); + rename.initStage(); + commit.initStage(); + + commit.setThreads(thread); +} + +template +void +FullO3CPU::insertThread(unsigned tid) +{ + DPRINTF(FullCPU,"[tid:%i] Initializing thread data"); + // Will change now that the PC and thread state is internal to the CPU + // and not in the CPUExecContext. 
+#if 0 +#if FULL_SYSTEM + ExecContext *src_xc = system->execContexts[tid]; +#else + CPUExecContext *src_xc = thread[tid]; +#endif + + //Bind Int Regs to Rename Map + for (int ireg = 0; ireg < TheISA::NumIntRegs; ireg++) { + PhysRegIndex phys_reg = freeList.getIntReg(); + + renameMap[tid].setEntry(ireg,phys_reg); + scoreboard.setReg(phys_reg); + } + + //Bind Float Regs to Rename Map + for (int freg = 0; freg < TheISA::NumFloatRegs; freg++) { + PhysRegIndex phys_reg = freeList.getFloatReg(); + + renameMap[tid].setEntry(freg,phys_reg); + scoreboard.setReg(phys_reg); + } + + //Copy Thread Data Into RegFile + this->copyFromXC(tid); + + //Set PC/NPC + regFile.pc[tid] = src_xc->readPC(); + regFile.npc[tid] = src_xc->readNextPC(); + + src_xc->setStatus(ExecContext::Active); + + activateContext(tid,1); + + //Reset ROB/IQ/LSQ Entries + commit.rob->resetEntries(); + iew.resetEntries(); +#endif +} + +template +void +FullO3CPU::removeThread(unsigned tid) +{ + DPRINTF(FullCPU,"[tid:%i] Removing thread data"); +#if 0 + //Unbind Int Regs from Rename Map + for (int ireg = 0; ireg < TheISA::NumIntRegs; ireg++) { + PhysRegIndex phys_reg = renameMap[tid].lookup(ireg); + + scoreboard.unsetReg(phys_reg); + freeList.addReg(phys_reg); + } + + //Unbind Float Regs from Rename Map + for (int freg = 0; freg < TheISA::NumFloatRegs; freg++) { + PhysRegIndex phys_reg = renameMap[tid].lookup(freg); + + scoreboard.unsetReg(phys_reg); + freeList.addReg(phys_reg); + } + + //Copy Thread Data From RegFile + /* Fix Me: + * Do we really need to do this if we are removing a thread + * in the sense that it's finished (exiting)? If the thread is just + * being suspended we might... 
+ */ +// this->copyToXC(tid); + + //Squash Throughout Pipeline + fetch.squash(0,tid); + decode.squash(tid); + rename.squash(tid); + + assert(iew.ldstQueue.getCount(tid) == 0); + + //Reset ROB/IQ/LSQ Entries + if (activeThreads.size() >= 1) { + commit.rob->resetEntries(); + iew.resetEntries(); + } +#endif +} + + +template +void +FullO3CPU::activateWhenReady(int tid) +{ + DPRINTF(FullCPU,"[tid:%i]: Checking if resources are available for incoming" + "(e.g. PhysRegs/ROB/IQ/LSQ) \n", + tid); + + bool ready = true; + + if (freeList.numFreeIntRegs() >= TheISA::NumIntRegs) { + DPRINTF(FullCPU,"[tid:%i] Suspending thread due to not enough " + "Phys. Int. Regs.\n", + tid); + ready = false; + } else if (freeList.numFreeFloatRegs() >= TheISA::NumFloatRegs) { + DPRINTF(FullCPU,"[tid:%i] Suspending thread due to not enough " + "Phys. Float. Regs.\n", + tid); + ready = false; + } else if (commit.rob->numFreeEntries() >= + commit.rob->entryAmount(activeThreads.size() + 1)) { + DPRINTF(FullCPU,"[tid:%i] Suspending thread due to not enough " + "ROB entries.\n", + tid); + ready = false; + } else if (iew.instQueue.numFreeEntries() >= + iew.instQueue.entryAmount(activeThreads.size() + 1)) { + DPRINTF(FullCPU,"[tid:%i] Suspending thread due to not enough " + "IQ entries.\n", + tid); + ready = false; + } else if (iew.ldstQueue.numFreeEntries() >= + iew.ldstQueue.entryAmount(activeThreads.size() + 1)) { + DPRINTF(FullCPU,"[tid:%i] Suspending thread due to not enough " + "LSQ entries.\n", + tid); + ready = false; + } + + if (ready) { + insertThread(tid); + + contextSwitch = false; + + cpuWaitList.remove(tid); + } else { + suspendContext(tid); + + //blocks fetch + contextSwitch = true; + + //do waitlist + cpuWaitList.push_back(tid); } } template void -FullO3CPU::activateContext(int thread_num, int delay) +FullO3CPU::activateContext(int tid, int delay) { + // Needs to set each stage to running as well. 
+ list::iterator isActive = find( + activeThreads.begin(), activeThreads.end(), tid); + + if (isActive == activeThreads.end()) { + //May Need to Re-code this if the delay variable is the + //delay needed for thread to activate + DPRINTF(FullCPU, "Adding Thread %i to active threads list\n", + tid); + + activeThreads.push_back(tid); + } + + assert(_status == Idle); scheduleTickEvent(delay); + // Be sure to signal that there's some activity so the CPU doesn't + // deschedule itself. + activityThisCycle(); + fetch.wakeFromQuiesce(); + _status = Running; } template void -FullO3CPU::suspendContext(int thread_num) +FullO3CPU::suspendContext(int tid) { - panic("suspendContext unimplemented!"); + DPRINTF(FullCPU,"[tid: %i]: Suspended ...\n", tid); + unscheduleTickEvent(); + _status = Idle; +/* + //Remove From Active List, if Active + list::iterator isActive = find( + activeThreads.begin(), activeThreads.end(), tid); + + if (isActive != activeThreads.end()) { + DPRINTF(FullCPU,"[tid:%i]: Removing from active threads list\n", + tid); + activeThreads.erase(isActive); + } +*/ } template void -FullO3CPU::deallocateContext(int thread_num) +FullO3CPU::deallocateContext(int tid) { - panic("deallocateContext unimplemented!"); + DPRINTF(FullCPU,"[tid:%i]: Deallocating ...", tid); +/* + //Remove From Active List, if Active + list::iterator isActive = find( + activeThreads.begin(), activeThreads.end(), tid); + + if (isActive != activeThreads.end()) { + DPRINTF(FullCPU,"[tid:%i]: Removing from active threads list\n", + tid); + activeThreads.erase(isActive); + + removeThread(tid); + } +*/ } template void -FullO3CPU::haltContext(int thread_num) +FullO3CPU::haltContext(int tid) { - panic("haltContext unimplemented!"); + DPRINTF(FullCPU,"[tid:%i]: Halted ...", tid); +/* + //Remove From Active List, if Active + list::iterator isActive = find( + activeThreads.begin(), activeThreads.end(), tid); + + if (isActive != activeThreads.end()) { + DPRINTF(FullCPU,"[tid:%i]: Removing from active 
threads list\n", + tid); + activeThreads.erase(isActive); + + removeThread(tid); + } +*/ } template @@ -336,7 +686,6 @@ template InstSeqNum FullO3CPU::getAndIncrementInstSeq() { - // Hopefully this works right. return globalSeqNum++; } @@ -398,124 +747,274 @@ FullO3CPU::setFloatRegInt(int reg_idx, uint64_t val) template uint64_t -FullO3CPU::readPC() +FullO3CPU::readArchIntReg(int reg_idx, unsigned tid) { - return regFile.readPC(); + PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx); + + return regFile.readIntReg(phys_reg); +} + +template +float +FullO3CPU::readArchFloatRegSingle(int reg_idx, unsigned tid) +{ + PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx); + + return regFile.readFloatRegSingle(phys_reg); +} + +template +double +FullO3CPU::readArchFloatRegDouble(int reg_idx, unsigned tid) +{ + PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx); + + return regFile.readFloatRegDouble(phys_reg); +} + +template +uint64_t +FullO3CPU::readArchFloatRegInt(int reg_idx, unsigned tid) +{ + PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx); + + return regFile.readFloatRegInt(phys_reg); } template void -FullO3CPU::setNextPC(uint64_t val) +FullO3CPU::setArchIntReg(int reg_idx, uint64_t val, unsigned tid) { - regFile.setNextPC(val); + if (reg_idx == TheISA::ZeroReg) { + warn("Setting r31 through ArchIntReg in CPU, cycle %i\n", curTick); + } + + PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx); + + regFile.setIntReg(phys_reg, val); } template void -FullO3CPU::setPC(Addr new_PC) +FullO3CPU::setArchFloatRegSingle(int reg_idx, float val, unsigned tid) { - regFile.setPC(new_PC); + PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx); + + regFile.setFloatRegSingle(phys_reg, val); } template void +FullO3CPU::setArchFloatRegDouble(int reg_idx, double val, unsigned tid) +{ + PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx); + + regFile.setFloatRegDouble(phys_reg, val); +} + +template +void 
+FullO3CPU::setArchFloatRegInt(int reg_idx, uint64_t val, unsigned tid) +{ + PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx); + + regFile.setFloatRegInt(phys_reg, val); +} + +template +uint64_t +FullO3CPU::readPC(unsigned tid) +{ + return commit.readPC(tid); +} + +template +void +FullO3CPU::setPC(Addr new_PC,unsigned tid) +{ + commit.setPC(new_PC, tid); +} + +template +uint64_t +FullO3CPU::readNextPC(unsigned tid) +{ + return commit.readNextPC(tid); +} + +template +void +FullO3CPU::setNextPC(uint64_t val,unsigned tid) +{ + commit.setNextPC(val, tid); +} + +template +typename FullO3CPU::ListIt FullO3CPU::addInst(DynInstPtr &inst) { instList.push_back(inst); + + return --(instList.end()); } template void -FullO3CPU::instDone() +FullO3CPU::instDone(unsigned tid) { // Keep an instruction count. - numInsts++; + thread[tid]->numInst++; + thread[tid]->numInsts++; + committedInsts[tid]++; + totalCommittedInsts++; // Check for instruction-count-based events. - comInstEventQueue[0]->serviceEvents(numInsts); + comInstEventQueue[tid]->serviceEvents(thread[tid]->numInst); } template void -FullO3CPU::removeBackInst(DynInstPtr &inst) +FullO3CPU::addToRemoveList(DynInstPtr &inst) { - DynInstPtr inst_to_delete; + removeInstsThisCycle = true; - // Walk through the instruction list, removing any instructions - // that were inserted after the given instruction, inst. - while (instList.back() != inst) - { - assert(!instList.empty()); - - // Obtain the pointer to the instruction. - inst_to_delete = instList.back(); - - DPRINTF(FullCPU, "FullCPU: Removing instruction %i, PC %#x\n", - inst_to_delete->seqNum, inst_to_delete->readPC()); - - // Remove the instruction from the list. - instList.pop_back(); - - // Mark it as squashed. 
- inst_to_delete->setSquashed(); - } + removeList.push(inst->getInstListIt()); } template void FullO3CPU::removeFrontInst(DynInstPtr &inst) { - DynInstPtr inst_to_remove; + unsigned tid = inst->threadNumber; - // The front instruction should be the same one being asked to be removed. - assert(instList.front() == inst); + DPRINTF(FullCPU, "FullCPU: Removing committed instruction [tid:%i] PC %#x " + "[sn:%lli]\n", + tid, inst->readPC(), inst->seqNum); + + removeInstsThisCycle = true; // Remove the front instruction. - inst_to_remove = inst; - instList.pop_front(); - - DPRINTF(FullCPU, "FullCPU: Removing committed instruction %#x, PC %#x\n", - inst_to_remove, inst_to_remove->readPC()); + removeList.push(inst->getInstListIt()); } template void -FullO3CPU::removeInstsNotInROB() +FullO3CPU::removeInstsNotInROB(unsigned tid) { - DPRINTF(FullCPU, "FullCPU: Deleting instructions from instruction " - "list.\n"); + DPRINTF(FullCPU, "FullCPU: Thread %i: Deleting instructions from instruction" + " list.\n", tid); - DynInstPtr rob_tail = rob.readTailInst(); + ListIt end_it; - removeBackInst(rob_tail); -} + bool rob_empty = false; -template -void -FullO3CPU::removeInstsUntil(const InstSeqNum &seq_num) -{ - DPRINTF(FullCPU, "FullCPU: Deleting instructions from instruction " - "list.\n"); - - DynInstPtr inst_to_delete; - - while (instList.back()->seqNum > seq_num) { - assert(!instList.empty()); - - // Obtain the pointer to the instruction. - inst_to_delete = instList.back(); - - DPRINTF(FullCPU, "FullCPU: Removing instruction %i, PC %#x\n", - inst_to_delete->seqNum, inst_to_delete->readPC()); - - // Remove the instruction from the list. - instList.back() = NULL; - instList.pop_back(); - - // Mark it as squashed. 
- inst_to_delete->setSquashed(); + if (instList.empty()) { + return; + } else if (rob.isEmpty(/*tid*/)) { + DPRINTF(FullCPU, "FullCPU: ROB is empty, squashing all insts.\n"); + end_it = instList.begin(); + rob_empty = true; + } else { + end_it = (rob.readTailInst(tid))->getInstListIt(); + DPRINTF(FullCPU, "FullCPU: ROB is not empty, squashing insts not in ROB.\n"); } + removeInstsThisCycle = true; + + ListIt inst_it = instList.end(); + + inst_it--; + + // Walk through the instruction list, removing any instructions + // that were inserted after the given instruction iterator, end_it. + while (inst_it != end_it) { + assert(!instList.empty()); + + bool break_loop = (inst_it == instList.begin()); + + squashInstIt(inst_it, tid); + + inst_it--; + + if (break_loop) + break; + } + + // If the ROB was empty, then we actually need to remove the first + // instruction as well. + if (rob_empty) { + squashInstIt(inst_it, tid); + } +} + +template +void +FullO3CPU::removeInstsUntil(const InstSeqNum &seq_num, + unsigned tid) +{ + assert(!instList.empty()); + + removeInstsThisCycle = true; + + ListIt inst_iter = instList.end(); + + inst_iter--; + + DPRINTF(FullCPU, "FullCPU: Deleting instructions from instruction " + "list that are from [tid:%i] and above [sn:%lli] (end=%lli).\n", + tid, seq_num, (*inst_iter)->seqNum); + + while ((*inst_iter)->seqNum > seq_num) { + + bool break_loop = (inst_iter == instList.begin()); + + squashInstIt(inst_iter, tid); + + inst_iter--; + + if (break_loop) + break; + } +} + +template +inline void +FullO3CPU::squashInstIt(const ListIt &instIt, const unsigned &tid) +{ + if ((*instIt)->threadNumber == tid) { + DPRINTF(FullCPU, "FullCPU: Squashing instruction, " + "[tid:%i] [sn:%lli] PC %#x\n", + (*instIt)->threadNumber, + (*instIt)->seqNum, + (*instIt)->readPC()); + + // Mark it as squashed. 
+ (*instIt)->setSquashed(); + + //@todo: Formulate a consistent method for deleting + //instructions from the instruction list + // Remove the instruction from the list. + removeList.push(instIt); + } +} + +template +void +FullO3CPU::cleanUpRemovedInsts() +{ + while (!removeList.empty()) { + DPRINTF(FullCPU, "FullCPU: Removing instruction, " + "[tid:%i] [sn:%lli] PC %#x\n", + (*removeList.front())->threadNumber, + (*removeList.front())->seqNum, + (*removeList.front())->readPC()); + + instList.erase(removeList.front()); + + removeList.pop(); + } + + removeInstsThisCycle = false; } template @@ -530,16 +1029,22 @@ void FullO3CPU::dumpInsts() { int num = 0; - typename list::iterator inst_list_it = instList.begin(); - while (inst_list_it != instList.end()) - { - cprintf("Instruction:%i\nPC:%#x\nSN:%lli\nIssued:%i\nSquashed:%i\n\n", - num, (*inst_list_it)->readPC(), (*inst_list_it)->seqNum, - (*inst_list_it)->isIssued(), (*inst_list_it)->isSquashed()); + ListIt inst_list_it = instList.begin(); + + cprintf("Dumping Instruction List\n"); + + while (inst_list_it != instList.end()) { + cprintf("Instruction:%i\nPC:%#x\n[tid:%i]\n[sn:%lli]\nIssued:%i\n" + "Squashed:%i\n\n", + num, (*inst_list_it)->readPC(), (*inst_list_it)->threadNumber, + (*inst_list_it)->seqNum, (*inst_list_it)->isIssued(), + (*inst_list_it)->isSquashed()); inst_list_it++; ++num; } + + } template @@ -549,5 +1054,139 @@ FullO3CPU::wakeDependents(DynInstPtr &inst) iew.wakeDependents(inst); } +template +void +FullO3CPU::wakeCPU() +{ + if (activityCount || tickEvent.scheduled()) { + return; + } + + idleCycles += curTick - lastRunningCycle; + + tickEvent.schedule(curTick); +} + +template +void +FullO3CPU::activityThisCycle() +{ + if (activityBuffer[0]) { + return; + } + + activityBuffer[0] = true; + activity = true; + ++activityCount; + + DPRINTF(Activity, "Activity: %i\n", activityCount); +} + +template +void +FullO3CPU::advanceActivityBuffer() +{ + if (activityBuffer[-5]) { + --activityCount; + + 
assert(activityCount >= 0); + + DPRINTF(Activity, "Activity: %i\n", activityCount); + + if (activityCount == 0) { + DPRINTF(FullCPU, "No activity left, going to idle!\n"); + lastRunningCycle = curTick; + timesIdled++; + } + } + + activityBuffer.advance(); +} + +template +void +FullO3CPU::activateStage(const StageIdx idx) +{ + if (!stageActive[idx]) { + ++activityCount; + + stageActive[idx] = true; + + DPRINTF(Activity, "Activity: %i\n", activityCount); + } else { + DPRINTF(Activity, "Stage %i already active.\n", idx); + } + + // @todo: Number is hardcoded for now. Replace with parameter. + assert(activityCount < 15); +} + +template +void +FullO3CPU::deactivateStage(const StageIdx idx) +{ + if (stageActive[idx]) { + --activityCount; + + stageActive[idx] = false; + + DPRINTF(Activity, "Activity: %i\n", activityCount); + } else { + DPRINTF(Activity, "Stage %i already inactive.\n", idx); + } + + assert(activityCount >= 0); +} + +template +int +FullO3CPU::getFreeTid() +{ + for (int i=0; i < numThreads; i++) { + if (!tids[i]) { + tids[i] = true; + return i; + } + } + + return -1; +} + +template +void +FullO3CPU::doContextSwitch() +{ + if (contextSwitch) { + + //ADD CODE TO DEACTIVE THREAD HERE (???) + + for (int tid=0; tid < cpuWaitList.size(); tid++) { + activateWhenReady(tid); + } + + if (cpuWaitList.size() == 0) + contextSwitch = true; + } +} + +template +void +FullO3CPU::updateThreadPriority() +{ + if (activeThreads.size() > 1) + { + //DEFAULT TO ROUND ROBIN SCHEME + //e.g. Move highest priority to end of thread list + list::iterator list_begin = activeThreads.begin(); + list::iterator list_end = activeThreads.end(); + + unsigned high_thread = *list_begin; + + activeThreads.erase(list_begin); + + activeThreads.push_back(high_thread); + } +} + // Forward declaration of FullO3CPU. 
template class FullO3CPU; diff --git a/cpu/o3/cpu.hh b/cpu/o3/cpu.hh index 6577e46e4..91eaf9d6f 100644 --- a/cpu/o3/cpu.hh +++ b/cpu/o3/cpu.hh @@ -26,18 +26,13 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -//Todo: Add in a lot of the functions that are ISA specific. Also define -//the functions that currently exist within the base cpu class. Define -//everything for the simobject stuff so it can be serialized and -//instantiated, add in debugging statements everywhere. Have CPU schedule -//itself properly. Threads! -// Avoid running stages and advancing queues if idle/stalled. - -#ifndef __CPU_O3_CPU_FULL_CPU_HH__ -#define __CPU_O3_CPU_FULL_CPU_HH__ +#ifndef __CPU_O3_FULL_CPU_HH__ +#define __CPU_O3_FULL_CPU_HH__ #include #include +#include +#include #include #include "base/statistics.hh" @@ -47,10 +42,12 @@ #include "cpu/cpu_exec_context.hh" #include "cpu/o3/comm.hh" #include "cpu/o3/cpu_policy.hh" +#include "cpu/o3/scoreboard.hh" +#include "cpu/o3/thread_state.hh" #include "sim/process.hh" class ExecContext; -class FunctionalMemory; +class MemInterface; class Process; class BaseFullCPU : public BaseCPU @@ -59,11 +56,9 @@ class BaseFullCPU : public BaseCPU public: typedef BaseCPU::Params Params; -#if FULL_SYSTEM - BaseFullCPU(Params ¶ms); -#else - BaseFullCPU(Params ¶ms); -#endif // FULL_SYSTEM + BaseFullCPU(Params *params); + + void regStats(); protected: int cpu_id; @@ -78,31 +73,42 @@ class FullO3CPU : public BaseFullCPU typedef typename Impl::Params Params; typedef typename Impl::DynInstPtr DynInstPtr; + typedef O3ThreadState Thread; + + typedef typename std::list::iterator ListIt; + public: enum Status { Running, Idle, Halted, - Blocked // ? + Blocked }; + /** Overall CPU status. */ Status _status; private: class TickEvent : public Event { private: + /** Pointer to the CPU. */ FullO3CPU *cpu; public: + /** Constructs a tick event. */ TickEvent(FullO3CPU *c); + + /** Processes a tick event, calling tick() on the CPU. 
*/ void process(); + /** Returns the description of the tick event. */ const char *description(); }; + /** The tick event used for scheduling CPU ticks. */ TickEvent tickEvent; - /// Schedule tick event, regardless of its current state. + /** Schedule tick event, regardless of its current state. */ void scheduleTickEvent(int delay) { if (tickEvent.squashed()) @@ -111,7 +117,7 @@ class FullO3CPU : public BaseFullCPU tickEvent.schedule(curTick + delay); } - /// Unschedule tick event, regardless of its current state. + /** Unschedule tick event, regardless of its current state. */ void unscheduleTickEvent() { if (tickEvent.scheduled()) @@ -119,21 +125,82 @@ class FullO3CPU : public BaseFullCPU } public: - FullO3CPU(Params ¶ms); + /** Constructs a CPU with the given parameters. */ + FullO3CPU(Params *params); + /** Destructor. */ ~FullO3CPU(); + /** Registers statistics. */ void fullCPURegStats(); + /** Ticks CPU, calling tick() on each stage, and checking the overall + * activity to see if the CPU should deschedule itself. + */ void tick(); + /** Initialize the CPU */ void init(); - void activateContext(int thread_num, int delay); - void suspendContext(int thread_num); - void deallocateContext(int thread_num); - void haltContext(int thread_num); + /** Setup CPU to insert a thread's context */ + void insertThread(unsigned tid); + /** Remove all of a thread's context from CPU */ + void removeThread(unsigned tid); + + /** Count the Total Instructions Committed in the CPU. */ + virtual Counter totalInstructions() const + { + Counter total(0); + + for (int i=0; i < thread.size(); i++) + total += thread[i]->numInst; + + return total; + } + + /** Add Thread to Active Threads List. */ + void activateContext(int tid, int delay); + + /** Remove Thread from Active Threads List */ + void suspendContext(int tid); + + /** Remove Thread from Active Threads List && + * Remove Thread Context from CPU. 
+ */ + void deallocateContext(int tid); + + /** Remove Thread from Active Threads List && + * Remove Thread Context from CPU. + */ + void haltContext(int tid); + + /** Activate a Thread When CPU Resources are Available. */ + void activateWhenReady(int tid); + + /** Add or Remove a Thread Context in the CPU. */ + void doContextSwitch(); + + /** Update The Order In Which We Process Threads. */ + void updateThreadPriority(); + + /** Executes a syscall on this cycle. + * --------------------------------------- + * Note: this is a virtual function. CPU-Specific + * functionality defined in derived classes + */ + virtual void syscall(int tid) {} + + /** Check if there are any system calls pending. */ + void checkSyscalls(); + + /** Switches out this CPU. + * @todo: Implement this. + */ void switchOut(); + + /** Takes over from another CPU. + * @todo: Implement this. + */ void takeOverFrom(BaseCPU *oldCPU); /** Get the current instruction sequence number, and increment it. */ @@ -147,21 +214,28 @@ class FullO3CPU : public BaseFullCPU bool validDataAddr(Addr addr) { return true; } /** Get instruction asid. */ - int getInstAsid() - { return regFile.miscRegs.getInstAsid(); } + int getInstAsid(unsigned tid) + { return regFile.miscRegs[tid].getInstAsid(); } /** Get data asid. */ - int getDataAsid() - { return regFile.miscRegs.getDataAsid(); } + int getDataAsid(unsigned tid) + { return regFile.miscRegs[tid].getDataAsid(); } #else - bool validInstAddr(Addr addr) - { return thread[0]->validInstAddr(addr); } + /** Check if this address is a valid instruction address. */ + bool validInstAddr(Addr addr,unsigned tid) + { return thread[tid]->validInstAddr(addr); } - bool validDataAddr(Addr addr) - { return thread[0]->validDataAddr(addr); } + /** Check if this address is a valid data address. 
*/ + bool validDataAddr(Addr addr,unsigned tid) + { return thread[tid]->validDataAddr(addr); } - int getInstAsid() { return thread[0]->getInstAsid(); } - int getDataAsid() { return thread[0]->getDataAsid(); } + /** Get instruction asid. */ + int getInstAsid(unsigned tid) + { return thread[tid]->asid; } + + /** Get data asid. */ + int getDataAsid(unsigned tid) + { return thread[tid]->asid; } #endif @@ -184,29 +258,40 @@ class FullO3CPU : public BaseFullCPU void setFloatRegInt(int reg_idx, uint64_t val); - uint64_t readPC(); + uint64_t readArchIntReg(int reg_idx, unsigned tid); - void setNextPC(uint64_t val); + float readArchFloatRegSingle(int reg_idx, unsigned tid); - void setPC(Addr new_PC); + double readArchFloatRegDouble(int reg_idx, unsigned tid); + + uint64_t readArchFloatRegInt(int reg_idx, unsigned tid); + + void setArchIntReg(int reg_idx, uint64_t val, unsigned tid); + + void setArchFloatRegSingle(int reg_idx, float val, unsigned tid); + + void setArchFloatRegDouble(int reg_idx, double val, unsigned tid); + + void setArchFloatRegInt(int reg_idx, uint64_t val, unsigned tid); + + uint64_t readPC(unsigned tid); + + void setPC(Addr new_PC,unsigned tid); + + uint64_t readNextPC(unsigned tid); + + void setNextPC(uint64_t val,unsigned tid); /** Function to add instruction onto the head of the list of the * instructions. Used when new instructions are fetched. */ - void addInst(DynInstPtr &inst); + ListIt addInst(DynInstPtr &inst); /** Function to tell the CPU that an instruction has completed. */ - void instDone(); + void instDone(unsigned tid); - /** Remove all instructions in back of the given instruction, but leave - * that instruction in the list. This is useful in a squash, when there - * are instructions in this list that don't exist in structures such as - * the ROB. The instruction doesn't have to be the last instruction in - * the list, but will be once this function completes. - * @todo: Remove only up until that inst? 
Squashed inst is most likely - * valid. - */ - void removeBackInst(DynInstPtr &inst); + /** Add Instructions to the CPU Remove List*/ + void addToRemoveList(DynInstPtr &inst); /** Remove an instruction from the front of the list. It is expected * that there are no instructions in front of it (that is, none are older @@ -218,10 +303,14 @@ class FullO3CPU : public BaseFullCPU void removeFrontInst(DynInstPtr &inst); /** Remove all instructions that are not currently in the ROB. */ - void removeInstsNotInROB(); + void removeInstsNotInROB(unsigned tid); /** Remove all instructions younger than the given sequence number. */ - void removeInstsUntil(const InstSeqNum &seq_num); + void removeInstsUntil(const InstSeqNum &seq_num,unsigned tid); + + inline void squashInstIt(const ListIt &instIt, const unsigned &tid); + + void cleanUpRemovedInsts(); /** Remove all instructions from the list. */ void removeAllInsts(); @@ -236,43 +325,38 @@ class FullO3CPU : public BaseFullCPU public: /** List of all the instructions in flight. */ - list instList; + std::list instList; + + /** List of all the instructions that will be removed at the end of this + * cycle. + */ + std::queue removeList; + +#ifdef DEBUG + std::set snList; +#endif + + /** Records if instructions need to be removed this cycle due to being + * retired or squashed. + */ + bool removeInstsThisCycle; - //not sure these should be private. protected: /** The fetch stage. */ typename CPUPolicy::Fetch fetch; - /** The fetch stage's status. */ - typename CPUPolicy::Fetch::Status fetchStatus; - /** The decode stage. */ typename CPUPolicy::Decode decode; - /** The decode stage's status. */ - typename CPUPolicy::Decode::Status decodeStatus; - /** The dispatch stage. */ typename CPUPolicy::Rename rename; - /** The dispatch stage's status. */ - typename CPUPolicy::Rename::Status renameStatus; - /** The issue/execute/writeback stages. */ typename CPUPolicy::IEW iew; - /** The issue/execute/writeback stage's status. 
*/ - typename CPUPolicy::IEW::Status iewStatus; - /** The commit stage. */ typename CPUPolicy::Commit commit; - /** The fetch stage's status. */ - typename CPUPolicy::Commit::Status commitStatus; - - //Might want to just pass these objects in to the constructors of the - //appropriate stage. regFile is in iew, freeList in dispatch, renameMap - //in dispatch, and the rob in commit. /** The register file. */ typename CPUPolicy::RegFile regFile; @@ -280,12 +364,33 @@ class FullO3CPU : public BaseFullCPU typename CPUPolicy::FreeList freeList; /** The rename map. */ - typename CPUPolicy::RenameMap renameMap; + typename CPUPolicy::RenameMap renameMap[Impl::MaxThreads]; + + /** The commit rename map. */ + typename CPUPolicy::RenameMap commitRenameMap[Impl::MaxThreads]; /** The re-order buffer. */ typename CPUPolicy::ROB rob; + /** Active Threads List */ + std::list activeThreads; + + /** Integer Register Scoreboard */ + Scoreboard scoreboard; + public: + /** Enum to give each stage a specific index, so when calling + * activateStage() or deactivateStage(), they can specify which stage + * is being activated/deactivated. + */ + enum StageIdx { + FetchIdx, + DecodeIdx, + RenameIdx, + IEWIdx, + CommitIdx, + NumStages }; + /** Typedefs from the Impl to get the structs that each of the * time buffers should use. */ @@ -314,46 +419,123 @@ class FullO3CPU : public BaseFullCPU /** The IEW stage's instruction queue. */ TimeBuffer iewQueue; + private: + /** Time buffer that tracks if any cycles has active communication in them. + * It should be as long as the longest communication latency in the system. + * Each time any time buffer is written, the activity buffer should also + * be written to. The activityBuffer is advanced along with all the other + * time buffers, so it should always have a 1 somewhere in it only if there + * is active communication in a time buffer. + */ + TimeBuffer activityBuffer; + + /** Tracks how many stages and cycles of time buffer have activity. 
Stages + * increment this count when they switch to active, and decrement it when + * they switch to inactive. Whenever a cycle that previously had no + * information is written in the time buffer, this is incremented. When + * a cycle that had information exits the time buffer due to age, this + * count is decremented. When the count is 0, there is no activity in the + * CPU, and it can be descheduled. + */ + int activityCount; + + /** Records if there has been activity this cycle. */ + bool activity; + + /** Records which stages are active/inactive. */ + bool stageActive[NumStages]; + public: - /** The temporary exec context to support older accessors. */ - CPUExecContext *cpuXC; + /** Wakes the CPU, rescheduling the CPU if it's not already active. */ + void wakeCPU(); + /** Records that there is activity this cycle. */ + void activityThisCycle(); + /** Advances the activity buffer, decrementing the activityCount if active + * communication just left the time buffer, and descheduling the CPU if + * there is no activity. + */ + void advanceActivityBuffer(); + /** Marks a stage as active. */ + void activateStage(const StageIdx idx); + /** Deactivates a stage. */ + void deactivateStage(const StageIdx idx); + /** Gets a free thread id. Use if thread ids change across system. */ + int getFreeTid(); + + public: /** Temporary function to get pointer to exec context. */ - ExecContext *xcBase() + ExecContext *xcBase(unsigned tid) { - return thread[0]->getProxy(); - } - - CPUExecContext *cpuXCBase() - { - return thread[0]; + return thread[tid]->getXCProxy(); } + /** The global sequence number counter. */ InstSeqNum globalSeqNum; #if FULL_SYSTEM + /** Pointer to the system. */ System *system; + /** Pointer to the memory controller. */ MemoryController *memCtrl; + /** Pointer to physical memory. */ PhysicalMemory *physmem; - - AlphaITB *itb; - AlphaDTB *dtb; - -// SWContext *swCtx; #endif - std::vector thread; + // List of all ExecContexts. 
+ std::vector thread; + + /** Pointer to memory. */ FunctionalMemory *mem; +#if 0 + /** Page table pointer. */ + PageTable *pTable; +#endif + + /** Pointer to the icache interface. */ MemInterface *icacheInterface; + /** Pointer to the dcache interface. */ MemInterface *dcacheInterface; + /** Whether or not the CPU should defer its registration. */ bool deferRegistration; - Counter numInsts; + /** Is there a context switch pending? */ + bool contextSwitch; - Counter funcExeInst; + /** Threads Scheduled to Enter CPU */ + std::list cpuWaitList; + + /** The cycle that the CPU was last running, used for statistics. */ + Tick lastRunningCycle; + + /** Number of Threads CPU can process */ + unsigned numThreads; + + /** Mapping for system thread id to cpu id */ + std::map threadMap; + + /** Available thread ids in the cpu*/ + std::vector tids; + + /** Stat for total number of times the CPU is descheduled. */ + Stats::Scalar<> timesIdled; + /** Stat for total number of cycles the CPU spends descheduled. */ + Stats::Scalar<> idleCycles; + /** Stat for the number of committed instructions per thread. */ + Stats::Vector<> committedInsts; + /** Stat for the total number of committed instructions. */ + Stats::Scalar<> totalCommittedInsts; + /** Stat for the CPI per thread. */ + Stats::Formula cpi; + /** Stat for the total CPI. */ + Stats::Formula totalCpi; + /** Stat for the IPC per thread. */ + Stats::Formula ipc; + /** Stat for the total IPC. */ + Stats::Formula totalIpc; }; #endif diff --git a/cpu/o3/cpu_policy.hh b/cpu/o3/cpu_policy.hh index 41f06f81b..52227013e 100644 --- a/cpu/o3/cpu_policy.hh +++ b/cpu/o3/cpu_policy.hh @@ -26,13 +26,14 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __CPU_O3_CPU_CPU_POLICY_HH__ -#define __CPU_O3_CPU_CPU_POLICY_HH__ +#ifndef __CPU_O3_CPU_POLICY_HH__ +#define __CPU_O3_CPU_POLICY_HH__ #include "cpu/o3/bpred_unit.hh" #include "cpu/o3/free_list.hh" #include "cpu/o3/inst_queue.hh" -#include "cpu/o3/ldstq.hh" +#include "cpu/o3/lsq.hh" +#include "cpu/o3/lsq_unit.hh" #include "cpu/o3/mem_dep_unit.hh" #include "cpu/o3/regfile.hh" #include "cpu/o3/rename_map.hh" @@ -57,32 +58,34 @@ struct SimpleCPUPolicy typedef ROB ROB; typedef InstructionQueue IQ; typedef MemDepUnit MemDepUnit; - typedef LDSTQ LDSTQ; + typedef LSQ LSQ; + typedef LSQUnit LSQUnit; - typedef SimpleFetch Fetch; - typedef SimpleDecode Decode; - typedef SimpleRename Rename; - typedef SimpleIEW IEW; - typedef SimpleCommit Commit; + + typedef DefaultFetch Fetch; + typedef DefaultDecode Decode; + typedef DefaultRename Rename; + typedef DefaultIEW IEW; + typedef DefaultCommit Commit; /** The struct for communication between fetch and decode. */ - typedef SimpleFetchSimpleDecode FetchStruct; + typedef DefaultFetchDefaultDecode FetchStruct; /** The struct for communication between decode and rename. */ - typedef SimpleDecodeSimpleRename DecodeStruct; + typedef DefaultDecodeDefaultRename DecodeStruct; /** The struct for communication between rename and IEW. */ - typedef SimpleRenameSimpleIEW RenameStruct; + typedef DefaultRenameDefaultIEW RenameStruct; /** The struct for communication between IEW and commit. */ - typedef SimpleIEWSimpleCommit IEWStruct; + typedef DefaultIEWDefaultCommit IEWStruct; /** The struct for communication within the IEW stage. */ typedef IssueStruct IssueStruct; /** The struct for all backwards communication. 
*/ - typedef TimeBufStruct TimeStruct; + typedef TimeBufStruct TimeStruct; }; -#endif //__CPU_O3_CPU_CPU_POLICY_HH__ +#endif //__CPU_O3_CPU_POLICY_HH__ diff --git a/cpu/o3/decode.cc b/cpu/o3/decode.cc index 290648318..b14fbb7a3 100644 --- a/cpu/o3/decode.cc +++ b/cpu/o3/decode.cc @@ -30,4 +30,4 @@ #include "cpu/o3/alpha_impl.hh" #include "cpu/o3/decode_impl.hh" -template class SimpleDecode; +template class DefaultDecode; diff --git a/cpu/o3/decode.hh b/cpu/o3/decode.hh index 5b9a0f822..279ff556e 100644 --- a/cpu/o3/decode.hh +++ b/cpu/o3/decode.hh @@ -26,16 +26,23 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __CPU_O3_CPU_SIMPLE_DECODE_HH__ -#define __CPU_O3_CPU_SIMPLE_DECODE_HH__ +#ifndef __CPU_O3_DECODE_HH__ +#define __CPU_O3_DECODE_HH__ #include #include "base/statistics.hh" #include "base/timebuf.hh" +/** + * DefaultDecode class handles both single threaded and SMT decode. Its width is + * specified by the parameters; each cycles it tries to decode that many + * instructions. Because instructions are actually decoded when the StaticInst + * is created, this stage does not do much other than check any PC-relative + * branches. + */ template -class SimpleDecode +class DefaultDecode { private: // Typedefs from the Impl. @@ -50,49 +57,126 @@ class SimpleDecode typedef typename CPUPol::TimeStruct TimeStruct; public: - // The only time decode will become blocked is if dispatch becomes - // blocked, which means IQ or ROB is probably full. - enum Status { + /** Overall decode stage status. Used to determine if the CPU can + * deschedule itself due to a lack of activity. + */ + enum DecodeStatus { + Active, + Inactive + }; + + /** Individual thread status. */ + enum ThreadStatus { Running, Idle, + StartSquash, Squashing, Blocked, Unblocking }; private: - // May eventually need statuses on a per thread basis. - Status _status; + /** Decode status. */ + DecodeStatus _status; + + /** Per-thread status. 
*/ + ThreadStatus decodeStatus[Impl::MaxThreads]; public: - SimpleDecode(Params ¶ms); + /** DefaultDecode constructor. */ + DefaultDecode(Params *params); + /** Returns the name of decode. */ + std::string name() const; + + /** Registers statistics. */ void regStats(); + /** Sets CPU pointer. */ void setCPU(FullCPU *cpu_ptr); + /** Sets the main backwards communication time buffer pointer. */ void setTimeBuffer(TimeBuffer *tb_ptr); + /** Sets pointer to time buffer used to communicate to the next stage. */ void setDecodeQueue(TimeBuffer *dq_ptr); + /** Sets pointer to time buffer coming from fetch. */ void setFetchQueue(TimeBuffer *fq_ptr); + /** Sets pointer to list of active threads. */ + void setActiveThreads(std::list *at_ptr); + + /** Ticks decode, processing all input signals and decoding as many + * instructions as possible. + */ void tick(); - void decode(); + /** Determines what to do based on decode's current status. + * @param status_change decode() sets this variable if there was a status + * change (ie switching from from blocking to unblocking). + * @param tid Thread id to decode instructions from. + */ + void decode(bool &status_change, unsigned tid); + + /** Processes instructions from fetch and passes them on to rename. + * Decoding of instructions actually happens when they are created in + * fetch, so this function mostly checks if PC-relative branches are + * correct. + */ + void decodeInsts(unsigned tid); private: + /** Inserts a thread's instructions into the skid buffer, to be decoded + * once decode unblocks. + */ + void skidInsert(unsigned tid); + + /** Returns if all of the skid buffers are empty. */ + bool skidsEmpty(); + + /** Updates overall decode status based on all of the threads' statuses. */ + void updateStatus(); + + /** Separates instructions from fetch into individual lists of instructions + * sorted by thread. + */ + void sortInsts(); + + /** Reads all stall signals from the backwards communication timebuffer. 
*/ + void readStallSignals(unsigned tid); + + /** Checks all input signals and updates decode's status appropriately. */ + bool checkSignalsAndUpdate(unsigned tid); + + /** Checks all stall signals, and returns if any are true. */ + bool checkStall(unsigned tid) const; + + /** Returns if there any instructions from fetch on this cycle. */ inline bool fetchInstsValid(); - void block(); + /** Switches decode to blocking, and signals back that decode has + * become blocked. + * @return Returns true if there is a status change. + */ + bool block(unsigned tid); - inline void unblock(); + /** Switches decode to unblocking if the skid buffer is empty, and + * signals back that decode has unblocked. + * @return Returns true if there is a status change. + */ + bool unblock(unsigned tid); - void squash(DynInstPtr &inst); + /** Squashes if there is a PC-relative branch that was predicted + * incorrectly. Sends squash information back to fetch. + */ + void squash(DynInstPtr &inst, unsigned tid); public: - // Might want to make squash a friend function. - void squash(); + /** Squashes due to commit signalling a squash. Changes status to + * squashing and clears block/unblock signals as needed. + */ + unsigned squash(unsigned tid); private: // Interfaces to objects outside of decode. @@ -127,10 +211,27 @@ class SimpleDecode /** Wire to get fetch's output from fetch queue. */ typename TimeBuffer::wire fromFetch; - /** Skid buffer between fetch and decode. */ - std::queue skidBuffer; + /** Queue of all instructions coming from fetch this cycle. */ + std::queue insts[Impl::MaxThreads]; + + /** Skid buffer between fetch and decode. */ + std::queue skidBuffer[Impl::MaxThreads]; + + /** Variable that tracks if decode has written to the time buffer this + * cycle. Used to tell CPU if there is activity this cycle. + */ + bool wroteToTimeBuffer; + + /** Source of possible stalls. 
*/ + struct Stalls { + bool rename; + bool iew; + bool commit; + }; + + /** Tracks which stages are telling decode to stall. */ + Stalls stalls[Impl::MaxThreads]; - //Consider making these unsigned to avoid any confusion. /** Rename to decode delay, in ticks. */ unsigned renameToDecodeDelay; @@ -146,20 +247,41 @@ class SimpleDecode /** The width of decode, in instructions. */ unsigned decodeWidth; - /** The instruction that decode is currently on. It needs to have - * persistent state so that when a stall occurs in the middle of a - * group of instructions, it can restart at the proper instruction. - */ - unsigned numInst; + /** Index of instructions being sent to rename. */ + unsigned toRenameIndex; + /** number of Active Threads*/ + unsigned numThreads; + + /** List of active thread ids */ + std::list *activeThreads; + + /** Number of branches in flight. */ + unsigned branchCount[Impl::MaxThreads]; + + /** Maximum size of the skid buffer. */ + unsigned skidBufferMax; + + /** Stat for total number of idle cycles. */ Stats::Scalar<> decodeIdleCycles; + /** Stat for total number of blocked cycles. */ Stats::Scalar<> decodeBlockedCycles; + /** Stat for total number of normal running cycles. */ + Stats::Scalar<> decodeRunCycles; + /** Stat for total number of unblocking cycles. */ Stats::Scalar<> decodeUnblockCycles; + /** Stat for total number of squashing cycles. */ Stats::Scalar<> decodeSquashCycles; + /** Stat for number of times a branch mispredict is detected. */ Stats::Scalar<> decodeBranchMispred; + /** Stat for number of times decode detected a non-control instruction + * incorrectly predicted as a branch. + */ Stats::Scalar<> decodeControlMispred; + /** Stat for total number of decoded instructions. */ Stats::Scalar<> decodeDecodedInsts; + /** Stat for total number of squashed instructions. 
*/ Stats::Scalar<> decodeSquashedInsts; }; -#endif // __CPU_O3_CPU_SIMPLE_DECODE_HH__ +#endif // __CPU_O3_DECODE_HH__ diff --git a/cpu/o3/decode_impl.hh b/cpu/o3/decode_impl.hh index 463f0ddac..f1aea27b4 100644 --- a/cpu/o3/decode_impl.hh +++ b/cpu/o3/decode_impl.hh @@ -28,22 +28,42 @@ #include "cpu/o3/decode.hh" +using namespace std; + template -SimpleDecode::SimpleDecode(Params ¶ms) - : renameToDecodeDelay(params.renameToDecodeDelay), - iewToDecodeDelay(params.iewToDecodeDelay), - commitToDecodeDelay(params.commitToDecodeDelay), - fetchToDecodeDelay(params.fetchToDecodeDelay), - decodeWidth(params.decodeWidth), - numInst(0) +DefaultDecode::DefaultDecode(Params *params) + : renameToDecodeDelay(params->renameToDecodeDelay), + iewToDecodeDelay(params->iewToDecodeDelay), + commitToDecodeDelay(params->commitToDecodeDelay), + fetchToDecodeDelay(params->fetchToDecodeDelay), + decodeWidth(params->decodeWidth), + numThreads(params->numberOfThreads) { - DPRINTF(Decode, "Decode: decodeWidth=%i.\n", decodeWidth); - _status = Idle; + DPRINTF(Decode, "decodeWidth=%i.\n", decodeWidth); + _status = Inactive; + + for (int i = 0; i < numThreads; ++i) { + decodeStatus[i] = Idle; + + stalls[i].rename = false; + stalls[i].iew = false; + stalls[i].commit = false; + } + + // @todo: Make into a parameter + skidBufferMax = (fetchToDecodeDelay * params->fetchWidth) + decodeWidth; +} + +template +std::string +DefaultDecode::name() const +{ + return cpu->name() + ".decode"; } template void -SimpleDecode::regStats() +DefaultDecode::regStats() { decodeIdleCycles .name(name() + ".decodeIdleCycles") @@ -53,6 +73,10 @@ SimpleDecode::regStats() .name(name() + ".decodeBlockedCycles") .desc("Number of cycles decode is blocked") .prereq(decodeBlockedCycles); + decodeRunCycles + .name(name() + ".decodeRunCycles") + .desc("Number of cycles decode is running") + .prereq(decodeRunCycles); decodeUnblockCycles .name(name() + ".decodeUnblockCycles") .desc("Number of cycles decode is unblocking") @@ -82,17 
+106,17 @@ SimpleDecode::regStats() template void -SimpleDecode::setCPU(FullCPU *cpu_ptr) +DefaultDecode::setCPU(FullCPU *cpu_ptr) { - DPRINTF(Decode, "Decode: Setting CPU pointer.\n"); + DPRINTF(Decode, "Setting CPU pointer.\n"); cpu = cpu_ptr; } template void -SimpleDecode::setTimeBuffer(TimeBuffer *tb_ptr) +DefaultDecode::setTimeBuffer(TimeBuffer *tb_ptr) { - DPRINTF(Decode, "Decode: Setting time buffer pointer.\n"); + DPRINTF(Decode, "Setting time buffer pointer.\n"); timeBuffer = tb_ptr; // Setup wire to write information back to fetch. @@ -106,9 +130,9 @@ SimpleDecode::setTimeBuffer(TimeBuffer *tb_ptr) template void -SimpleDecode::setDecodeQueue(TimeBuffer *dq_ptr) +DefaultDecode::setDecodeQueue(TimeBuffer *dq_ptr) { - DPRINTF(Decode, "Decode: Setting decode queue pointer.\n"); + DPRINTF(Decode, "Setting decode queue pointer.\n"); decodeQueue = dq_ptr; // Setup wire to write information to proper place in decode queue. @@ -117,260 +141,515 @@ SimpleDecode::setDecodeQueue(TimeBuffer *dq_ptr) template void -SimpleDecode::setFetchQueue(TimeBuffer *fq_ptr) +DefaultDecode::setFetchQueue(TimeBuffer *fq_ptr) { - DPRINTF(Decode, "Decode: Setting fetch queue pointer.\n"); + DPRINTF(Decode, "Setting fetch queue pointer.\n"); fetchQueue = fq_ptr; // Setup wire to read information from fetch queue. 
fromFetch = fetchQueue->getWire(-fetchToDecodeDelay); } +template +void +DefaultDecode::setActiveThreads(list *at_ptr) +{ + DPRINTF(Decode, "Setting active threads list pointer.\n"); + activeThreads = at_ptr; +} + +template +bool +DefaultDecode::checkStall(unsigned tid) const +{ + bool ret_val = false; + + if (stalls[tid].rename) { + DPRINTF(Decode,"[tid:%i]: Stall fom Rename stage detected.\n", tid); + ret_val = true; + } else if (stalls[tid].iew) { + DPRINTF(Decode,"[tid:%i]: Stall fom IEW stage detected.\n", tid); + ret_val = true; + } else if (stalls[tid].commit) { + DPRINTF(Decode,"[tid:%i]: Stall fom Commit stage detected.\n", tid); + ret_val = true; + } + + return ret_val; +} + template inline bool -SimpleDecode::fetchInstsValid() +DefaultDecode::fetchInstsValid() { return fromFetch->size > 0; } template -void -SimpleDecode::block() +bool +DefaultDecode::block(unsigned tid) { - DPRINTF(Decode, "Decode: Blocking.\n"); + DPRINTF(Decode, "[tid:%u]: Blocking.\n", tid); - // Set the status to Blocked. - _status = Blocked; + // If the decode status is blocked or unblocking then decode has not yet + // signalled fetch to unblock. In that case, there is no need to tell + // fetch to block. + if (decodeStatus[tid] != Blocked && + decodeStatus[tid] != Unblocking) { + toFetch->decodeBlock[tid] = true; + wroteToTimeBuffer = true; + } // Add the current inputs to the skid buffer so they can be // reprocessed when this stage unblocks. - skidBuffer.push(*fromFetch); + skidInsert(tid); - // Note that this stage only signals previous stages to stall when - // it is the cause of the stall originates at this stage. Otherwise - // the previous stages are expected to check all possible stall signals. + if (decodeStatus[tid] != Blocked) { + // Set the status to Blocked. 
+ decodeStatus[tid] = Blocked; + return true; + } + + return false; } template -inline void -SimpleDecode::unblock() +bool +DefaultDecode::unblock(unsigned tid) { - DPRINTF(Decode, "Decode: Unblocking, going to remove " - "instructions from skid buffer.\n"); - // Remove the now processed instructions from the skid buffer. - skidBuffer.pop(); + DPRINTF(Decode, "[tid:%u]: Trying to unblock.\n", tid); - // If there's still information in the skid buffer, then - // continue to tell previous stages to stall. They will be - // able to restart once the skid buffer is empty. - if (!skidBuffer.empty()) { - toFetch->decodeInfo.stall = true; - } else { - DPRINTF(Decode, "Decode: Finished unblocking.\n"); - _status = Running; + // Decode is done unblocking only if the skid buffer is empty. + if (skidBuffer[tid].empty()) { + DPRINTF(Decode, "[tid:%u]: Done unblocking.\n", tid); + toFetch->decodeUnblock[tid] = true; + wroteToTimeBuffer = true; + + decodeStatus[tid] = Running; + return true; } + + return false; } -// This squash is specifically for when Decode detects a PC-relative branch -// was predicted incorrectly. 
template void -SimpleDecode::squash(DynInstPtr &inst) +DefaultDecode::squash(DynInstPtr &inst, unsigned tid) { - DPRINTF(Decode, "Decode: Squashing due to incorrect branch prediction " - "detected at decode.\n"); - Addr new_PC = inst->readNextPC(); + DPRINTF(Decode, "[tid:%i]: Squashing due to incorrect branch prediction " + "detected at decode.\n", tid); - toFetch->decodeInfo.branchMispredict = true; - toFetch->decodeInfo.doneSeqNum = inst->seqNum; - toFetch->decodeInfo.predIncorrect = true; - toFetch->decodeInfo.squash = true; - toFetch->decodeInfo.nextPC = new_PC; - toFetch->decodeInfo.branchTaken = true; + toFetch->decodeInfo[tid].branchMispredict = true; + toFetch->decodeInfo[tid].doneSeqNum = inst->seqNum; + toFetch->decodeInfo[tid].predIncorrect = true; + toFetch->decodeInfo[tid].squash = true; + toFetch->decodeInfo[tid].nextPC = inst->readNextPC(); + toFetch->decodeInfo[tid].branchTaken = true; + + if (decodeStatus[tid] == Blocked || + decodeStatus[tid] == Unblocking) { + toFetch->decodeUnblock[tid] = 1; + } // Set status to squashing. - _status = Squashing; + decodeStatus[tid] = Squashing; + + for (int i=0; isize; i++) { + if (fromFetch->insts[i]->threadNumber == tid && + fromFetch->insts[i]->seqNum > inst->seqNum) { + fromFetch->insts[i]->squashed = true; + } + } + + while (!insts[tid].empty()) { + insts[tid].pop(); + } // Clear the skid buffer in case it has any data in it. - while (!skidBuffer.empty()) { - skidBuffer.pop(); + while (!skidBuffer[tid].empty()) { + skidBuffer[tid].pop(); } // Squash instructions up until this one - // Slightly unrealistic! - cpu->removeInstsUntil(inst->seqNum); + cpu->removeInstsUntil(inst->seqNum, tid); +} + +template +unsigned +DefaultDecode::squash(unsigned tid) +{ + DPRINTF(Decode, "[tid:%i]: Squashing.\n",tid); + + if (decodeStatus[tid] == Blocked || + decodeStatus[tid] == Unblocking) { +#if !FULL_SYSTEM + // In syscall emulation, we can have both a block and a squash due + // to a syscall in the same cycle. 
This would cause both signals to + // be high. This shouldn't happen in full system. + if (toFetch->decodeBlock[tid]) { + toFetch->decodeBlock[tid] = 0; + } else { + toFetch->decodeUnblock[tid] = 1; + } +#else + toFetch->decodeUnblock[tid] = 1; +#endif + } + + // Set status to squashing. + decodeStatus[tid] = Squashing; + + // Go through incoming instructions from fetch and squash them. + unsigned squash_count = 0; + + for (int i=0; isize; i++) { + if (fromFetch->insts[i]->threadNumber == tid) { + fromFetch->insts[i]->squashed = true; + squash_count++; + } + } + + while (!insts[tid].empty()) { + insts[tid].pop(); + } + + // Clear the skid buffer in case it has any data in it. + while (!skidBuffer[tid].empty()) { + skidBuffer[tid].pop(); + } + + return squash_count; } template void -SimpleDecode::squash() +DefaultDecode::skidInsert(unsigned tid) { - DPRINTF(Decode, "Decode: Squashing.\n"); - // Set status to squashing. - _status = Squashing; + DynInstPtr inst = NULL; - // Maybe advance the time buffer? Not sure what to do in the normal - // case. + while (!insts[tid].empty()) { + inst = insts[tid].front(); - // Clear the skid buffer in case it has any data in it. 
- while (!skidBuffer.empty()) - { - skidBuffer.pop(); + insts[tid].pop(); + + assert(tid == inst->threadNumber); + + DPRINTF(Decode,"Inserting [sn:%lli] PC:%#x into decode skidBuffer %i\n", + inst->seqNum, inst->readPC(), inst->threadNumber); + + skidBuffer[tid].push(inst); + } + + // Eventually need to enforce this by not letting a thread + // fetch past its skidbuffer + assert(skidBuffer[tid].size() <= skidBufferMax); +} + +template +bool +DefaultDecode::skidsEmpty() +{ + list::iterator threads = (*activeThreads).begin(); + + while (threads != (*activeThreads).end()) { + if (!skidBuffer[*threads++].empty()) + return false; + } + + return true; +} + +template +void +DefaultDecode::updateStatus() +{ + bool any_unblocking = false; + + list::iterator threads = (*activeThreads).begin(); + + threads = (*activeThreads).begin(); + + while (threads != (*activeThreads).end()) { + unsigned tid = *threads++; + + if (decodeStatus[tid] == Unblocking) { + any_unblocking = true; + break; + } + } + + // Decode will have activity if it's unblocking. + if (any_unblocking) { + if (_status == Inactive) { + _status = Active; + + DPRINTF(Activity, "Activating stage.\n"); + + cpu->activateStage(FullCPU::DecodeIdx); + } + } else { + // If it's not unblocking, then decode will not have any internal + // activity. Switch it to inactive. 
+ if (_status == Active) { + _status = Inactive; + DPRINTF(Activity, "Deactivating stage.\n"); + + cpu->deactivateStage(FullCPU::DecodeIdx); + } + } +} + +template +void +DefaultDecode::sortInsts() +{ + int insts_from_fetch = fromFetch->size; + + for (int i=0; i < numThreads; i++) + assert(insts[i].empty()); + + for (int i = 0; i < insts_from_fetch; ++i) { + insts[fromFetch->insts[i]->threadNumber].push(fromFetch->insts[i]); } } template void -SimpleDecode::tick() +DefaultDecode::readStallSignals(unsigned tid) { - // Decode should try to execute as many instructions as its bandwidth + if (fromRename->renameBlock[tid]) { + stalls[tid].rename = true; + } + + if (fromRename->renameUnblock[tid]) { + assert(stalls[tid].rename); + stalls[tid].rename = false; + } + + if (fromIEW->iewBlock[tid]) { + stalls[tid].iew = true; + } + + if (fromIEW->iewUnblock[tid]) { + assert(stalls[tid].iew); + stalls[tid].iew = false; + } + + if (fromCommit->commitBlock[tid]) { + stalls[tid].commit = true; + } + + if (fromCommit->commitUnblock[tid]) { + assert(stalls[tid].commit); + stalls[tid].commit = false; + } +} + +template +bool +DefaultDecode::checkSignalsAndUpdate(unsigned tid) +{ + // Check if there's a squash signal, squash if there is. + // Check stall signals, block if necessary. + // If status was blocked + // Check if stall conditions have passed + // if so then go to unblocking + // If status was Squashing + // check if squashing is not high. Switch to running this cycle. + + // Update the per thread stall statuses. + readStallSignals(tid); + + // Check squash signals from commit. + if (fromCommit->commitInfo[tid].squash) { + + DPRINTF(Decode, "[tid:%u]: Squashing instructions due to squash " + "from commit.\n", tid); + + squash(tid); + + return true; + } + + // Check ROB squash signals from commit. + if (fromCommit->commitInfo[tid].robSquashing) { + DPRINTF(Decode, "[tid:%]: ROB is still squashing.\n",tid); + + // Continue to squash. 
+ decodeStatus[tid] = Squashing; + + return true; + } + + if (checkStall(tid)) { + return block(tid); + } + + if (decodeStatus[tid] == Blocked) { + DPRINTF(Decode, "[tid:%u]: Done blocking, switching to unblocking.\n", + tid); + + decodeStatus[tid] = Unblocking; + + unblock(tid); + + return true; + } + + if (decodeStatus[tid] == Squashing) { + // Switch status to running if decode isn't being told to block or + // squash this cycle. + DPRINTF(Decode, "[tid:%u]: Done squashing, switching to running.\n", + tid); + + decodeStatus[tid] = Running; + + return false; + } + + // If we've reached this point, we have not gotten any signals that + // cause decode to change its status. Decode remains the same as before. + return false; +} + +template +void +DefaultDecode::tick() +{ + wroteToTimeBuffer = false; + + bool status_change = false; + + toRenameIndex = 0; + + list::iterator threads = (*activeThreads).begin(); + + sortInsts(); + + //Check stall and squash signals. + while (threads != (*activeThreads).end()) { + unsigned tid = *threads++; + + DPRINTF(Decode,"Processing [tid:%i]\n",tid); + status_change = checkSignalsAndUpdate(tid) || status_change; + + decode(status_change, tid); + } + + if (status_change) { + updateStatus(); + } + + if (wroteToTimeBuffer) { + DPRINTF(Activity, "Activity this cycle.\n"); + + cpu->activityThisCycle(); + } +} + +template +void +DefaultDecode::decode(bool &status_change, unsigned tid) +{ + // If status is Running or idle, + // call decodeInsts() + // If status is Unblocking, + // buffer any instructions coming from fetch + // continue trying to empty skid buffer + // check if stall conditions have passed + + if (decodeStatus[tid] == Blocked) { + ++decodeBlockedCycles; + } else if (decodeStatus[tid] == Squashing) { + ++decodeSquashCycles; + } + + // Decode should try to decode as many instructions as its bandwidth // will allow, as long as it is not currently blocked. 
- if (_status != Blocked && _status != Squashing) { - DPRINTF(Decode, "Decode: Not blocked, so attempting to run " - "stage.\n"); + if (decodeStatus[tid] == Running || + decodeStatus[tid] == Idle) { + DPRINTF(Decode, "[tid:%u] Not blocked, so attempting to run " + "stage.\n",tid); + + decodeInsts(tid); + } else if (decodeStatus[tid] == Unblocking) { // Make sure that the skid buffer has something in it if the // status is unblocking. - assert(_status == Unblocking ? !skidBuffer.empty() : 1); - - decode(); + assert(!skidsEmpty()); // If the status was unblocking, then instructions from the skid // buffer were used. Remove those instructions and handle // the rest of unblocking. - if (_status == Unblocking) { - ++decodeUnblockCycles; - - if (fetchInstsValid()) { - // Add the current inputs to the skid buffer so they can be - // reprocessed when this stage unblocks. - skidBuffer.push(*fromFetch); - } - - unblock(); - } - } else if (_status == Blocked) { - ++decodeBlockedCycles; + decodeInsts(tid); if (fetchInstsValid()) { - block(); + // Add the current inputs to the skid buffer so they can be + // reprocessed when this stage unblocks. + skidInsert(tid); } - if (!fromRename->renameInfo.stall && - !fromIEW->iewInfo.stall && - !fromCommit->commitInfo.stall) { - DPRINTF(Decode, "Decode: Stall signals cleared, going to " - "unblock.\n"); - _status = Unblocking; - - // Continue to tell previous stage to block until this - // stage is done unblocking. 
- toFetch->decodeInfo.stall = true; - } else { - DPRINTF(Decode, "Decode: Still blocked.\n"); - toFetch->decodeInfo.stall = true; - } - - if (fromCommit->commitInfo.squash || - fromCommit->commitInfo.robSquashing) { - squash(); - } - } else if (_status == Squashing) { - if (!fromCommit->commitInfo.squash && - !fromCommit->commitInfo.robSquashing) { - _status = Running; - } else if (fromCommit->commitInfo.squash) { - ++decodeSquashCycles; - - squash(); - } + status_change = unblock(tid) || status_change; } } -template +template void -SimpleDecode::decode() +DefaultDecode::decodeInsts(unsigned tid) { - // Check time buffer if being told to squash. - if (fromCommit->commitInfo.squash) { - squash(); - return; - } + // Instructions can come either from the skid buffer or the list of + // instructions coming from fetch, depending on decode's status. + int insts_available = decodeStatus[tid] == Unblocking ? + skidBuffer[tid].size() : insts[tid].size(); - // Check time buffer if being told to stall. - if (fromRename->renameInfo.stall || - fromIEW->iewInfo.stall || - fromCommit->commitInfo.stall) { - block(); - return; - } - - // Check fetch queue to see if instructions are available. - // If no available instructions, do nothing, unless this stage is - // currently unblocking. - if (!fetchInstsValid() && _status != Unblocking) { - DPRINTF(Decode, "Decode: Nothing to do, breaking out early.\n"); + if (insts_available == 0) { + DPRINTF(Decode, "[tid:%u] Nothing to do, breaking out" + " early.\n",tid); // Should I change the status to idle? ++decodeIdleCycles; return; + } else if (decodeStatus[tid] == Unblocking) { + DPRINTF(Decode, "[tid:%u] Unblocking, removing insts from skid " + "buffer.\n",tid); + ++decodeUnblockCycles; + } else if (decodeStatus[tid] == Running) { + ++decodeRunCycles; } - // Might be better to use a base DynInst * instead? DynInstPtr inst; - unsigned to_rename_index = 0; + std::queue + &insts_to_decode = decodeStatus[tid] == Unblocking ? 
+ skidBuffer[tid] : insts[tid]; - int insts_available = _status == Unblocking ? - skidBuffer.front().size - numInst : - fromFetch->size; + DPRINTF(Decode, "[tid:%u]: Sending instruction to rename.\n",tid); - // Debug block... -#if 0 - if (insts_available) { - DPRINTF(Decode, "Decode: Instructions available.\n"); - } else { - if (_status == Unblocking && skidBuffer.empty()) { - DPRINTF(Decode, "Decode: No instructions available, skid buffer " - "empty.\n"); - } else if (_status != Unblocking && - !fromFetch->insts[0]) { - DPRINTF(Decode, "Decode: No instructions available, fetch queue " - "empty.\n"); - } else { - panic("Decode: No instructions available, unexpected condition!" - "\n"); - } - } -#endif + while (insts_available > 0 && toRenameIndex < decodeWidth) { + assert(!insts_to_decode.empty()); - while (insts_available > 0) - { - DPRINTF(Decode, "Decode: Sending instruction to rename.\n"); + inst = insts_to_decode.front(); - inst = _status == Unblocking ? skidBuffer.front().insts[numInst] : - fromFetch->insts[numInst]; + insts_to_decode.pop(); - DPRINTF(Decode, "Decode: Processing instruction %i with PC %#x\n", - inst->seqNum, inst->readPC()); + DPRINTF(Decode, "[tid:%u]: Processing instruction [sn:%lli] with " + "PC %#x\n", + tid, inst->seqNum, inst->readPC()); if (inst->isSquashed()) { - DPRINTF(Decode, "Decode: Instruction %i with PC %#x is " + DPRINTF(Decode, "[tid:%u]: Instruction %i with PC %#x is " "squashed, skipping.\n", - inst->seqNum, inst->readPC()); + tid, inst->seqNum, inst->readPC()); ++decodeSquashedInsts; - ++numInst; --insts_available; continue; } - // Also check if instructions have no source registers. Mark // them as ready to issue at any time. Not sure if this check // should exist here or at a later stage; however it doesn't matter // too much for function correctness. - // Isn't this handled by the inst queue? 
if (inst->numSrcRegs() == 0) { inst->setCanIssue(); } @@ -378,9 +657,12 @@ SimpleDecode::decode() // This current instruction is valid, so add it into the decode // queue. The next instruction may not be valid, so check to // see if branches were predicted correctly. - toRename->insts[to_rename_index] = inst; + toRename->insts[toRenameIndex] = inst; ++(toRename->size); + ++toRenameIndex; + ++decodeDecodedInsts; + --insts_available; // Ensure that if it was predicted as a branch, it really is a // branch. @@ -388,38 +670,39 @@ SimpleDecode::decode() panic("Instruction predicted as a branch!"); ++decodeControlMispred; + // Might want to set some sort of boolean and just do // a check at the end - squash(inst); + squash(inst, inst->threadNumber); + break; } // Go ahead and compute any PC-relative branches. - if (inst->isDirectCtrl() && inst->isUncondCtrl()) { - inst->setNextPC(inst->branchTarget()); if (inst->mispredicted()) { ++decodeBranchMispred; + // Might want to set some sort of boolean and just do // a check at the end - squash(inst); + squash(inst, inst->threadNumber); + break; } } - - // Normally can check if a direct branch has the right target - // addr (either the immediate, or the branch PC + 4) and redirect - // fetch if it's incorrect. - - // Increment which instruction we're looking at. - ++numInst; - ++to_rename_index; - ++decodeDecodedInsts; - - --insts_available; } - numInst = 0; + // If we didn't process all instructions, then we will need to block + // and put all those instructions into the skid buffer. + if (!insts_to_decode.empty()) { + block(tid); + } + + // Record that decode has written to the time buffer for activity + // tracking. 
+ if (toRenameIndex) { + wroteToTimeBuffer = true; + } } diff --git a/cpu/o3/fetch.cc b/cpu/o3/fetch.cc index 8ad5e6565..7959416be 100644 --- a/cpu/o3/fetch.cc +++ b/cpu/o3/fetch.cc @@ -30,4 +30,4 @@ #include "cpu/o3/alpha_impl.hh" #include "cpu/o3/fetch_impl.hh" -template class SimpleFetch; +template class DefaultFetch; diff --git a/cpu/o3/fetch.hh b/cpu/o3/fetch.hh index cc64800d9..f0f3f2745 100644 --- a/cpu/o3/fetch.hh +++ b/cpu/o3/fetch.hh @@ -26,11 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -// Todo: SMT fetch, -// Add a way to get a stage's current status. - -#ifndef __CPU_O3_CPU_SIMPLE_FETCH_HH__ -#define __CPU_O3_CPU_SIMPLE_FETCH_HH__ +#ifndef __CPU_O3_FETCH_HH__ +#define __CPU_O3_FETCH_HH__ #include "base/statistics.hh" #include "base/timebuf.hh" @@ -39,13 +36,15 @@ #include "sim/eventq.hh" /** - * SimpleFetch class to fetch a single instruction each cycle. SimpleFetch - * will stall if there's an Icache miss, but otherwise assumes a one cycle - * Icache hit. + * DefaultFetch class handles both single threaded and SMT fetch. Its width is + * specified by the parameters; each cycle it tries to fetch that many + * instructions. It supports using a branch predictor to predict direction and + * targets. + * It supports the idling functionalitiy of the CPU by indicating to the CPU + * when it is active and inactive. */ - template -class SimpleFetch +class DefaultFetch { public: /** Typedefs from Impl. */ @@ -55,56 +54,125 @@ class SimpleFetch typedef typename Impl::FullCPU FullCPU; typedef typename Impl::Params Params; + /** Typedefs from the CPU policy. */ typedef typename CPUPol::BPredUnit BPredUnit; typedef typename CPUPol::FetchStruct FetchStruct; typedef typename CPUPol::TimeStruct TimeStruct; /** Typedefs from ISA. */ typedef TheISA::MachInst MachInst; + typedef TheISA::ExtMachInst ExtMachInst; public: - enum Status { + /** Overall fetch status. 
Used to determine if the CPU can deschedule itsef + * due to a lack of activity. + */ + enum FetchStatus { + Active, + Inactive + }; + + /** Individual thread status. */ + enum ThreadStatus { Running, Idle, Squashing, Blocked, + Fetching, + TrapPending, + QuiescePending, IcacheMissStall, IcacheMissComplete }; - // May eventually need statuses on a per thread basis. - Status _status; + /** Fetching Policy, Add new policies here.*/ + enum FetchPriority { + SingleThread, + RoundRobin, + Branch, + IQ, + LSQ + }; - bool stalled; + private: + /** Fetch status. */ + FetchStatus _status; + + /** Per-thread status. */ + ThreadStatus fetchStatus[Impl::MaxThreads]; + + /** Fetch policy. */ + FetchPriority fetchPolicy; + + /** List that has the threads organized by priority. */ + std::list priorityList; public: class CacheCompletionEvent : public Event { private: - SimpleFetch *fetch; + MemReqPtr req; + /** Pointer to fetch. */ + DefaultFetch *fetch; + /** Thread id. */ +// unsigned threadId; public: - CacheCompletionEvent(SimpleFetch *_fetch); + /** Constructs a cache completion event, which tells fetch when the + * cache miss is complete. + */ + CacheCompletionEvent(MemReqPtr &_req, DefaultFetch *_fetch); + /** Processes cache completion event. */ virtual void process(); + /** Returns the description of the cache completion event. */ virtual const char *description(); }; public: - /** SimpleFetch constructor. */ - SimpleFetch(Params ¶ms); + /** DefaultFetch constructor. */ + DefaultFetch(Params *params); + /** Returns the name of fetch. */ + std::string name() const; + + /** Registers statistics. */ void regStats(); + /** Sets CPU pointer. */ void setCPU(FullCPU *cpu_ptr); + /** Sets the main backwards communication time buffer pointer. */ void setTimeBuffer(TimeBuffer *time_buffer); + /** Sets pointer to list of active threads. */ + void setActiveThreads(std::list *at_ptr); + + /** Sets pointer to time buffer used to communicate to the next stage. 
*/ void setFetchQueue(TimeBuffer *fq_ptr); - void processCacheCompletion(); + /** Sets pointer to page table. */ +// void setPageTable(PageTable *pt_ptr); + + /** Initialize stage. */ + void initStage(); + + /** Processes cache completion event. */ + void processCacheCompletion(MemReqPtr &req); + + void wakeFromQuiesce(); private: + /** Changes the status of this stage to active, and indicates this to the + * CPU. + */ + inline void switchToActive(); + + /** Changes the status of this stage to inactive, and indicates this to the + * CPU. + */ + inline void switchToInactive(); + /** * Looks up in the branch predictor to see if the next PC should be * either next PC+=MachInst or a branch target. @@ -120,30 +188,76 @@ class SimpleFetch * fault that happened. Puts the data into the class variable * cacheData. * @param fetch_PC The PC address that is being fetched from. + * @param ret_fault The fault reference that will be set to the result of + * the icache access. + * @param tid Thread id. * @return Any fault that occured. */ - Fault fetchCacheLine(Addr fetch_PC); + bool fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid); - inline void doSquash(const Addr &new_PC); + /** Squashes a specific thread and resets the PC. */ + inline void doSquash(const Addr &new_PC, unsigned tid); - void squashFromDecode(const Addr &new_PC, const InstSeqNum &seq_num); + /** Squashes a specific thread and resets the PC. Also tells the CPU to + * remove any instructions between fetch and decode that should be sqaushed. + */ + void squashFromDecode(const Addr &new_PC, const InstSeqNum &seq_num, + unsigned tid); + + /** Checks if a thread is stalled. */ + bool checkStall(unsigned tid) const; + + /** Updates overall fetch stage status; to be called at the end of each + * cycle. */ + FetchStatus updateFetchStatus(); public: - // Figure out PC vs next PC and how it should be updated - void squash(const Addr &new_PC); + /** Squashes a specific thread and resets the PC. 
Also tells the CPU to + * remove any instructions that are not in the ROB. The source of this + * squash should be the commit stage. + */ + void squash(const Addr &new_PC, unsigned tid); + /** Ticks the fetch stage, processing all inputs signals and fetching + * as many instructions as possible. + */ void tick(); - void fetch(); + /** Checks all input signals and updates the status as necessary. + * @return: Returns if the status has changed due to input signals. + */ + bool checkSignalsAndUpdate(unsigned tid); - // Align an address (typically a PC) to the start of an I-cache block. - // We fold in the PISA 64- to 32-bit conversion here as well. + /** Does the actual fetching of instructions and passing them on to the + * next stage. + * @param status_change fetch() sets this variable if there was a status + * change (ie switching to IcacheMissStall). + */ + void fetch(bool &status_change); + + /** Align a PC to the start of an I-cache block. */ Addr icacheBlockAlignPC(Addr addr) { addr = TheISA::realPCToFetchPC(addr); return (addr & ~(cacheBlkMask)); } + private: + /** Returns the appropriate thread to fetch, given the fetch policy. */ + int getFetchingThread(FetchPriority &fetch_priority); + + /** Returns the appropriate thread to fetch using a round robin policy. */ + int roundRobin(); + + /** Returns the appropriate thread to fetch using the IQ count policy. */ + int iqCount(); + + /** Returns the appropriate thread to fetch using the LSQ count policy. */ + int lsqCount(); + + /** Returns the appropriate thread to fetch using the branch count policy. */ + int branchCount(); + private: /** Pointer to the FullCPU. */ FullCPU *cpu; @@ -176,8 +290,31 @@ class SimpleFetch /** BPredUnit. */ BPredUnit branchPred; + Addr PC[Impl::MaxThreads]; + + Addr nextPC[Impl::MaxThreads]; + /** Memory request used to access cache. */ - MemReqPtr memReq; + MemReqPtr memReq[Impl::MaxThreads]; + + /** Variable that tracks if fetch has written to the time buffer this + * cycle. 
Used to tell CPU if there is activity this cycle. + */ + bool wroteToTimeBuffer; + + /** Tracks how many instructions has been fetched this cycle. */ + int numInst; + + /** Source of possible stalls. */ + struct Stalls { + bool decode; + bool rename; + bool iew; + bool commit; + }; + + /** Tracks which stages are telling fetch to stall. */ + Stalls stalls[Impl::MaxThreads]; /** Decode to fetch delay, in ticks. */ unsigned decodeToFetchDelay; @@ -201,23 +338,56 @@ class SimpleFetch Addr cacheBlkMask; /** The cache line being fetched. */ - uint8_t *cacheData; + uint8_t *cacheData[Impl::MaxThreads]; /** Size of instructions. */ int instSize; /** Icache stall statistics. */ - Counter lastIcacheStall; + Counter lastIcacheStall[Impl::MaxThreads]; + /** List of Active Threads */ + std::list *activeThreads; + + /** Number of threads. */ + unsigned numThreads; + + /** Number of threads that are actively fetching. */ + unsigned numFetchingThreads; + + /** Thread ID being fetched. */ + int threadFetched; + + bool interruptPending; + +#if !FULL_SYSTEM + /** Page table pointer. */ +// PageTable *pTable; +#endif + + // @todo: Consider making these vectors and tracking on a per thread basis. + /** Stat for total number of cycles stalled due to an icache miss. */ Stats::Scalar<> icacheStallCycles; + /** Stat for total number of fetched instructions. */ Stats::Scalar<> fetchedInsts; + /** Stat for total number of predicted branches. */ Stats::Scalar<> predictedBranches; + /** Stat for total number of cycles spent fetching. */ Stats::Scalar<> fetchCycles; + /** Stat for total number of cycles spent squashing. */ Stats::Scalar<> fetchSquashCycles; + /** Stat for total number of cycles spent blocked due to other stages in + * the pipeline. + */ + Stats::Scalar<> fetchIdleCycles; Stats::Scalar<> fetchBlockedCycles; + /** Stat for total number of fetched cache lines. 
*/ Stats::Scalar<> fetchedCacheLines; - - Stats::Distribution<> fetch_nisn_dist; + /** Distribution of number of instructions fetched each cycle. */ + Stats::Distribution<> fetchNisnDist; + Stats::Formula idleRate; + Stats::Formula branchRate; + Stats::Formula fetchRate; }; -#endif //__CPU_O3_CPU_SIMPLE_FETCH_HH__ +#endif //__CPU_O3_FETCH_HH__ diff --git a/cpu/o3/fetch_impl.hh b/cpu/o3/fetch_impl.hh index 8029fc732..7abc5733f 100644 --- a/cpu/o3/fetch_impl.hh +++ b/cpu/o3/fetch_impl.hh @@ -26,66 +26,101 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -// Remove this later; used only for debugging. -#define OPCODE(X) (X >> 26) & 0x3f - #include "arch/isa_traits.hh" #include "sim/byteswap.hh" #include "cpu/exetrace.hh" +#include "cpu/o3/fetch.hh" #include "mem/base_mem.hh" #include "mem/mem_interface.hh" #include "mem/mem_req.hh" -#include "cpu/o3/fetch.hh" #include "sim/root.hh" +#if FULL_SYSTEM +#include "base/remote_gdb.hh" +#include "mem/functional/memory_control.hh" +#include "mem/functional/physical.hh" +#include "sim/system.hh" +#include "arch/tlb.hh" +#include "arch/vtophys.hh" +#else // !FULL_SYSTEM +#include "mem/functional/functional.hh" +#endif // FULL_SYSTEM + +#include + +using namespace std; + template -SimpleFetch::CacheCompletionEvent -::CacheCompletionEvent(SimpleFetch *_fetch) - : Event(&mainEventQueue), +DefaultFetch::CacheCompletionEvent::CacheCompletionEvent(MemReqPtr &_req, + DefaultFetch *_fetch) + : Event(&mainEventQueue, Delayed_Writeback_Pri), + req(_req), fetch(_fetch) { + this->setFlags(Event::AutoDelete); } template void -SimpleFetch::CacheCompletionEvent::process() +DefaultFetch::CacheCompletionEvent::process() { - fetch->processCacheCompletion(); + fetch->processCacheCompletion(req); } template const char * -SimpleFetch::CacheCompletionEvent::description() +DefaultFetch::CacheCompletionEvent::description() { - return "SimpleFetch cache completion event"; + return "DefaultFetch cache completion event"; } 
template -SimpleFetch::SimpleFetch(Params ¶ms) - : icacheInterface(params.icacheInterface), +DefaultFetch::DefaultFetch(Params *params) + : icacheInterface(params->icacheInterface), branchPred(params), - decodeToFetchDelay(params.decodeToFetchDelay), - renameToFetchDelay(params.renameToFetchDelay), - iewToFetchDelay(params.iewToFetchDelay), - commitToFetchDelay(params.commitToFetchDelay), - fetchWidth(params.fetchWidth) + decodeToFetchDelay(params->decodeToFetchDelay), + renameToFetchDelay(params->renameToFetchDelay), + iewToFetchDelay(params->iewToFetchDelay), + commitToFetchDelay(params->commitToFetchDelay), + fetchWidth(params->fetchWidth), + numThreads(params->numberOfThreads), + numFetchingThreads(params->smtNumFetchingThreads), + interruptPending(false) { - DPRINTF(Fetch, "Fetch: Fetch constructor called\n"); + if (numThreads > Impl::MaxThreads) + fatal("numThreads is not a valid value\n"); - // Set status to idle. - _status = Idle; + DPRINTF(Fetch, "Fetch constructor called\n"); - // Create a new memory request. - memReq = new MemReq(); - // Not sure of this parameter. I think it should be based on the - // thread number. -#if !FULL_SYSTEM - memReq->asid = 0; -#else - memReq->asid = 0; -#endif // FULL_SYSTEM - memReq->data = new uint8_t[64]; + // Set fetch stage's status to inactive. 
+ _status = Inactive; + + string policy = params->smtFetchPolicy; + + // Convert string to lowercase + std::transform(policy.begin(), policy.end(), policy.begin(), + (int(*)(int)) tolower); + + // Figure out fetch policy + if (policy == "singlethread") { + fetchPolicy = SingleThread; + } else if (policy == "roundrobin") { + fetchPolicy = RoundRobin; + DPRINTF(Fetch, "Fetch policy set to Round Robin\n"); + } else if (policy == "branch") { + fetchPolicy = Branch; + DPRINTF(Fetch, "Fetch policy set to Branch Count\n"); + } else if (policy == "iqcount") { + fetchPolicy = IQ; + DPRINTF(Fetch, "Fetch policy set to IQ count\n"); + } else if (policy == "lsqcount") { + fetchPolicy = LSQ; + DPRINTF(Fetch, "Fetch policy set to LSQ count\n"); + } else { + fatal("Invalid Fetch Policy. Options Are: {SingleThread," + " RoundRobin,LSQcount,IQcount}\n"); + } // Size of cache block. cacheBlkSize = icacheInterface ? icacheInterface->getBlockSize() : 64; @@ -93,16 +128,45 @@ SimpleFetch::SimpleFetch(Params ¶ms) // Create mask to get rid of offset bits. cacheBlkMask = (cacheBlkSize - 1); + for (int tid=0; tid < numThreads; tid++) { + + fetchStatus[tid] = Running; + + priorityList.push_back(tid); + + // Create a new memory request. + memReq[tid] = NULL; +// memReq[tid] = new MemReq(); +/* + // Need a way of setting this correctly for parallel programs + // @todo: Figure out how to properly set asid vs thread_num. + memReq[tid]->asid = tid; + memReq[tid]->thread_num = tid; + memReq[tid]->data = new uint8_t[64]; +*/ + // Create space to store a cache line. + cacheData[tid] = new uint8_t[cacheBlkSize]; + + stalls[tid].decode = 0; + stalls[tid].rename = 0; + stalls[tid].iew = 0; + stalls[tid].commit = 0; + } + // Get the size of an instruction. instSize = sizeof(MachInst); +} - // Create space to store a cache line. 
- cacheData = new uint8_t[cacheBlkSize]; +template +std::string +DefaultFetch::name() const +{ + return cpu->name() + ".fetch"; } template void -SimpleFetch::regStats() +DefaultFetch::regStats() { icacheStallCycles .name(name() + ".icacheStallCycles") @@ -113,55 +177,88 @@ SimpleFetch::regStats() .name(name() + ".fetchedInsts") .desc("Number of instructions fetch has processed") .prereq(fetchedInsts); + predictedBranches .name(name() + ".predictedBranches") .desc("Number of branches that fetch has predicted taken") .prereq(predictedBranches); + fetchCycles .name(name() + ".fetchCycles") .desc("Number of cycles fetch has run and was not squashing or" " blocked") .prereq(fetchCycles); + fetchSquashCycles .name(name() + ".fetchSquashCycles") .desc("Number of cycles fetch has spent squashing") .prereq(fetchSquashCycles); + + fetchIdleCycles + .name(name() + ".fetchIdleCycles") + .desc("Number of cycles fetch was idle") + .prereq(fetchIdleCycles); + fetchBlockedCycles .name(name() + ".fetchBlockedCycles") .desc("Number of cycles fetch has spent blocked") .prereq(fetchBlockedCycles); + fetchedCacheLines .name(name() + ".fetchedCacheLines") .desc("Number of cache lines fetched") .prereq(fetchedCacheLines); - fetch_nisn_dist + fetchNisnDist .init(/* base value */ 0, /* last value */ fetchWidth, /* bucket size */ 1) - .name(name() + ".FETCH:rate_dist") + .name(name() + ".rateDist") .desc("Number of instructions fetched each cycle (Total)") - .flags(Stats::pdf) - ; + .flags(Stats::pdf); + + idleRate + .name(name() + ".idleRate") + .desc("Percent of cycles fetch was idle") + .prereq(idleRate); + idleRate = fetchIdleCycles * 100 / cpu->numCycles; + + branchRate + .name(name() + ".branchRate") + .desc("Number of branch fetches per cycle") + .flags(Stats::total); + branchRate = predictedBranches / cpu->numCycles; + + fetchRate + .name(name() + ".rate") + .desc("Number of inst fetches per cycle") + .flags(Stats::total); + fetchRate = fetchedInsts / cpu->numCycles; 
branchPred.regStats(); } template void -SimpleFetch::setCPU(FullCPU *cpu_ptr) +DefaultFetch::setCPU(FullCPU *cpu_ptr) { - DPRINTF(Fetch, "Fetch: Setting the CPU pointer.\n"); + DPRINTF(Fetch, "Setting the CPU pointer.\n"); cpu = cpu_ptr; - // This line will be removed eventually. - memReq->xc = cpu->xcBase(); + + // Set ExecContexts for Memory Requests +// for (int tid=0; tid < numThreads; tid++) +// memReq[tid]->xc = cpu->xcBase(tid); + + // Fetch needs to start fetching instructions at the very beginning, + // so it must start up in active state. + switchToActive(); } template void -SimpleFetch::setTimeBuffer(TimeBuffer *time_buffer) +DefaultFetch::setTimeBuffer(TimeBuffer *time_buffer) { - DPRINTF(Fetch, "Fetch: Setting the time buffer pointer.\n"); + DPRINTF(Fetch, "Setting the time buffer pointer.\n"); timeBuffer = time_buffer; // Create wires to get information from proper places in time buffer. @@ -173,32 +270,122 @@ SimpleFetch::setTimeBuffer(TimeBuffer *time_buffer) template void -SimpleFetch::setFetchQueue(TimeBuffer *fq_ptr) +DefaultFetch::setActiveThreads(list *at_ptr) { - DPRINTF(Fetch, "Fetch: Setting the fetch queue pointer.\n"); + DPRINTF(Fetch, "Setting active threads list pointer.\n"); + activeThreads = at_ptr; +} + +template +void +DefaultFetch::setFetchQueue(TimeBuffer *fq_ptr) +{ + DPRINTF(Fetch, "Setting the fetch queue pointer.\n"); fetchQueue = fq_ptr; // Create wire to write information to proper place in fetch queue. 
toDecode = fetchQueue->getWire(0); } +#if 0 template void -SimpleFetch::processCacheCompletion() +DefaultFetch::setPageTable(PageTable *pt_ptr) { - DPRINTF(Fetch, "Fetch: Waking up from cache miss.\n"); + DPRINTF(Fetch, "Setting the page table pointer.\n"); +#if !FULL_SYSTEM + pTable = pt_ptr; +#endif +} +#endif + +template +void +DefaultFetch::initStage() +{ + for (int tid = 0; tid < numThreads; tid++) { + PC[tid] = cpu->readPC(tid); + nextPC[tid] = cpu->readNextPC(tid); + } +} + +template +void +DefaultFetch::processCacheCompletion(MemReqPtr &req) +{ + unsigned tid = req->thread_num; + + DPRINTF(Fetch, "[tid:%u] Waking up from cache miss.\n",tid); // Only change the status if it's still waiting on the icache access // to return. // Can keep track of how many cache accesses go unused due to // misspeculation here. - if (_status == IcacheMissStall) - _status = IcacheMissComplete; + if (fetchStatus[tid] != IcacheMissStall || + req != memReq[tid]) + return; + + // Wake up the CPU (if it went to sleep and was waiting on this completion + // event). + cpu->wakeCPU(); + + DPRINTF(Activity, "[tid:%u] Activating fetch due to cache completion\n", + tid); + + switchToActive(); + + // Only switch to IcacheMissComplete if we're not stalled as well. + if (checkStall(tid)) { + fetchStatus[tid] = Blocked; + } else { + fetchStatus[tid] = IcacheMissComplete; + } + +// memcpy(cacheData[tid], memReq[tid]->data, memReq[tid]->size); + + // Reset the completion event to NULL. 
+ memReq[tid] = NULL; +// memReq[tid]->completionEvent = NULL; +} + +template +void +DefaultFetch::wakeFromQuiesce() +{ + DPRINTF(Fetch, "Waking up from quiesce\n"); + // Hopefully this is safe + fetchStatus[0] = Running; +} + +template +inline void +DefaultFetch::switchToActive() +{ + if (_status == Inactive) { + DPRINTF(Activity, "Activating stage.\n"); + + cpu->activateStage(FullCPU::FetchIdx); + + _status = Active; + } +} + +template +inline void +DefaultFetch::switchToInactive() +{ + if (_status == Active) { + DPRINTF(Activity, "Deactivating stage.\n"); + + cpu->deactivateStage(FullCPU::FetchIdx); + + _status = Inactive; + } } template bool -SimpleFetch::lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC) +DefaultFetch::lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC) { // Do branch prediction check here. // A bit of a misnomer...next_PC is actually the current PC until @@ -211,7 +398,7 @@ SimpleFetch::lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC) return false; } - predict_taken = branchPred.predict(inst, next_PC); + predict_taken = branchPred.predict(inst, next_PC, inst->threadNumber); if (predict_taken) { ++predictedBranches; @@ -221,37 +408,48 @@ SimpleFetch::lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC) } template -Fault -SimpleFetch::fetchCacheLine(Addr fetch_PC) +bool +DefaultFetch::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid) { // Check if the instruction exists within the cache. // If it does, then proceed on to read the instruction and the rest // of the instructions in the cache line until either the end of the // cache line or a predicted taken branch is encountered. + Fault fault = NoFault; #if FULL_SYSTEM // Flag to say whether or not address is physical addr. - unsigned flags = cpu->inPalMode() ? PHYSICAL : 0; + unsigned flags = cpu->inPalMode(fetch_PC) ? 
PHYSICAL : 0; #else unsigned flags = 0; #endif // FULL_SYSTEM - Fault fault = NoFault; + if (interruptPending && flags == 0) { + // Hold off fetch from getting new instructions while an interrupt + // is pending. + return false; + } // Align the fetch PC so it's at the start of a cache block. fetch_PC = icacheBlockAlignPC(fetch_PC); - // Setup the memReq to do a read of the first isntruction's address. + // Setup the memReq to do a read of the first instruction's address. // Set the appropriate read size and flags as well. - memReq->cmd = Read; - memReq->reset(fetch_PC, cacheBlkSize, flags); + memReq[tid] = new MemReq(); + + memReq[tid]->asid = tid; + memReq[tid]->thread_num = tid; + memReq[tid]->data = new uint8_t[64]; + memReq[tid]->xc = cpu->xcBase(tid); + memReq[tid]->cmd = Read; + memReq[tid]->reset(fetch_PC, cacheBlkSize, flags); // Translate the instruction request. - // Should this function be - // in the CPU class ? Probably...ITB/DTB should exist within the - // CPU. - - fault = cpu->translateInstReq(memReq); +//#if FULL_SYSTEM + fault = cpu->translateInstReq(memReq[tid]); +//#else +// fault = pTable->translate(memReq[tid]); +//#endif // In the case of faults, the fetch stage may need to stall and wait // on what caused the fetch (ITB or Icache miss). @@ -259,213 +457,416 @@ SimpleFetch::fetchCacheLine(Addr fetch_PC) // If translation was successful, attempt to read the first // instruction. if (fault == NoFault) { + if (cpu->system->memctrl->badaddr(memReq[tid]->paddr)) { + DPRINTF(Fetch, "Fetch: Bad address %#x (hopefully on a " + "misspeculating path!", + memReq[tid]->paddr); + ret_fault = TheISA::genMachineCheckFault(); + return false; + } + DPRINTF(Fetch, "Fetch: Doing instruction read.\n"); - fault = cpu->mem->read(memReq, cacheData); + fault = cpu->mem->read(memReq[tid], cacheData[tid]); // This read may change when the mem interface changes. 
- fetchedCacheLines++; - } + // Now do the timing access to see whether or not the instruction + // exists within the cache. + if (icacheInterface && !icacheInterface->isBlocked()) { + DPRINTF(Fetch, "Doing cache access.\n"); - // Now do the timing access to see whether or not the instruction - // exists within the cache. - if (icacheInterface && fault == NoFault) { - DPRINTF(Fetch, "Fetch: Doing timing memory access.\n"); - memReq->completionEvent = NULL; + memReq[tid]->completionEvent = NULL; - memReq->time = curTick; + memReq[tid]->time = curTick; - MemAccessResult result = icacheInterface->access(memReq); + MemAccessResult result = icacheInterface->access(memReq[tid]); - // If the cache missed (in this model functional and timing - // memories are different), then schedule an event to wake - // up this stage once the cache miss completes. - if (result != MA_HIT && icacheInterface->doEvents()) { - memReq->completionEvent = new CacheCompletionEvent(this); + // If the cache missed, then schedule an event to wake + // up this stage once the cache miss completes. + // @todo: Possibly allow for longer than 1 cycle cache hits. + if (result != MA_HIT && icacheInterface->doEvents()) { - // How does current model work as far as individual - // stages scheduling/unscheduling? - // Perhaps have only the main CPU scheduled/unscheduled, - // and have it choose what stages to run appropriately. + memReq[tid]->completionEvent = + new CacheCompletionEvent(memReq[tid], this); - DPRINTF(Fetch, "Fetch: Stalling due to icache miss.\n"); - _status = IcacheMissStall; + lastIcacheStall[tid] = curTick; + + DPRINTF(Activity, "[tid:%i]: Activity: Stalling due to I-cache " + "miss.\n", tid); + + fetchStatus[tid] = IcacheMissStall; + } else { + DPRINTF(Fetch, "[tid:%i]: I-Cache hit. 
Doing Instruction " + "read.\n", tid); + +// memcpy(cacheData[tid], memReq[tid]->data, memReq[tid]->size); + + fetchedCacheLines++; + } + } else { + DPRINTF(Fetch, "[tid:%i] Out of MSHRs!\n", tid); + ret_fault = NoFault; + return false; } } - return fault; + ret_fault = fault; + return true; } template inline void -SimpleFetch::doSquash(const Addr &new_PC) +DefaultFetch::doSquash(const Addr &new_PC, unsigned tid) { - DPRINTF(Fetch, "Fetch: Squashing, setting PC to: %#x.\n", new_PC); + DPRINTF(Fetch, "[tid:%i]: Squashing, setting PC to: %#x.\n", + tid, new_PC); - cpu->setNextPC(new_PC + instSize); - cpu->setPC(new_PC); + PC[tid] = new_PC; + nextPC[tid] = new_PC + instSize; // Clear the icache miss if it's outstanding. - if (_status == IcacheMissStall && icacheInterface) { - DPRINTF(Fetch, "Fetch: Squashing outstanding Icache miss.\n"); - // @todo: Use an actual thread number here. - icacheInterface->squash(0); + if (fetchStatus[tid] == IcacheMissStall && icacheInterface) { + DPRINTF(Fetch, "[tid:%i]: Squashing outstanding Icache miss.\n", + tid); +// icacheInterface->squash(tid); +/* + if (memReq[tid]->completionEvent) { + if (memReq[tid]->completionEvent->scheduled()) { + memReq[tid]->completionEvent->squash(); + } else { + delete memReq[tid]->completionEvent; + memReq[tid]->completionEvent = NULL; + } + } +*/ + memReq[tid] = NULL; } - _status = Squashing; + if (fetchStatus[tid] == TrapPending) { + // @todo: Hardcoded number here + + // This is only effective if communication to and from commit + // is identical. If it's faster to commit than it is from + // commit to here, then it causes problems. 
+ + bool found_fault = false; + for (int i = 0; i > -5; --i) { + if (fetchQueue->access(i)->fetchFault) { + DPRINTF(Fetch, "[tid:%i]: Fetch used to be in a trap, " + "clearing it.\n", + tid); + fetchQueue->access(i)->fetchFault = NoFault; + found_fault = true; + } + } + if (!found_fault) { + warn("%lli Fault from fetch not found in time buffer!", + curTick); + } + toDecode->clearFetchFault = true; + } + + fetchStatus[tid] = Squashing; ++fetchSquashCycles; } template void -SimpleFetch::squashFromDecode(const Addr &new_PC, - const InstSeqNum &seq_num) +DefaultFetch::squashFromDecode(const Addr &new_PC, + const InstSeqNum &seq_num, + unsigned tid) { - DPRINTF(Fetch, "Fetch: Squashing from decode.\n"); + DPRINTF(Fetch, "[tid:%i]: Squashing from decode.\n",tid); - doSquash(new_PC); + doSquash(new_PC, tid); // Tell the CPU to remove any instructions that are in flight between // fetch and decode. - cpu->removeInstsUntil(seq_num); + cpu->removeInstsUntil(seq_num, tid); +} + +template +bool +DefaultFetch::checkStall(unsigned tid) const +{ + bool ret_val = false; + + if (cpu->contextSwitch) { + DPRINTF(Fetch,"[tid:%i]: Stalling for a context switch.\n",tid); + ret_val = true; + } else if (stalls[tid].decode) { + DPRINTF(Fetch,"[tid:%i]: Stall from Decode stage detected.\n",tid); + ret_val = true; + } else if (stalls[tid].rename) { + DPRINTF(Fetch,"[tid:%i]: Stall from Rename stage detected.\n",tid); + ret_val = true; + } else if (stalls[tid].iew) { + DPRINTF(Fetch,"[tid:%i]: Stall from IEW stage detected.\n",tid); + ret_val = true; + } else if (stalls[tid].commit) { + DPRINTF(Fetch,"[tid:%i]: Stall from Commit stage detected.\n",tid); + ret_val = true; + } + + return ret_val; +} + +template +typename DefaultFetch::FetchStatus +DefaultFetch::updateFetchStatus() +{ + //Check Running + list::iterator threads = (*activeThreads).begin(); + + while (threads != (*activeThreads).end()) { + + unsigned tid = *threads++; + + if (fetchStatus[tid] == Running || + fetchStatus[tid] == 
Squashing || + fetchStatus[tid] == IcacheMissComplete) { + + if (_status == Inactive) { + DPRINTF(Activity, "[tid:%i]: Activating stage.\n",tid); + + if (fetchStatus[tid] == IcacheMissComplete) { + DPRINTF(Activity, "[tid:%i]: Activating fetch due to cache" + "completion\n",tid); + } + + cpu->activateStage(FullCPU::FetchIdx); + } + + return Active; + } + } + + // Stage is switching from active to inactive, notify CPU of it. + if (_status == Active) { + DPRINTF(Activity, "Deactivating stage.\n"); + + cpu->deactivateStage(FullCPU::FetchIdx); + } + + return Inactive; } template void -SimpleFetch::squash(const Addr &new_PC) +DefaultFetch::squash(const Addr &new_PC, unsigned tid) { - DPRINTF(Fetch, "Fetch: Squash from commit.\n"); + DPRINTF(Fetch, "[tid:%u]: Squash from commit.\n",tid); - doSquash(new_PC); + doSquash(new_PC, tid); // Tell the CPU to remove any instructions that are not in the ROB. - cpu->removeInstsNotInROB(); + cpu->removeInstsNotInROB(tid); } -template +template void -SimpleFetch::tick() +DefaultFetch::tick() { + list::iterator threads = (*activeThreads).begin(); + bool status_change = false; + + wroteToTimeBuffer = false; + + while (threads != (*activeThreads).end()) { + unsigned tid = *threads++; + + // Check the signals for each thread to determine the proper status + // for each thread. + bool updated_status = checkSignalsAndUpdate(tid); + status_change = status_change || updated_status; + } + + DPRINTF(Fetch, "Running stage.\n"); + + // Reset the number of the instruction we're fetching. + numInst = 0; + + if (fromCommit->commitInfo[0].interruptPending) { + interruptPending = true; + } + if (fromCommit->commitInfo[0].clearInterrupt) { + interruptPending = false; + } + + for (threadFetched = 0; threadFetched < numFetchingThreads; + threadFetched++) { + // Fetch each of the actively fetching threads. + fetch(status_change); + } + + // Record number of instructions fetched this cycle for distribution. 
+ fetchNisnDist.sample(numInst); + + if (status_change) { + // Change the fetch stage status if there was a status change. + _status = updateFetchStatus(); + } + + // If there was activity this cycle, inform the CPU of it. + if (wroteToTimeBuffer || cpu->contextSwitch) { + DPRINTF(Activity, "Activity this cycle.\n"); + + cpu->activityThisCycle(); + } +} + +template +bool +DefaultFetch::checkSignalsAndUpdate(unsigned tid) +{ + // Update the per thread stall statuses. + if (fromDecode->decodeBlock[tid]) { + stalls[tid].decode = true; + } + + if (fromDecode->decodeUnblock[tid]) { + assert(stalls[tid].decode); + assert(!fromDecode->decodeBlock[tid]); + stalls[tid].decode = false; + } + + if (fromRename->renameBlock[tid]) { + stalls[tid].rename = true; + } + + if (fromRename->renameUnblock[tid]) { + assert(stalls[tid].rename); + assert(!fromRename->renameBlock[tid]); + stalls[tid].rename = false; + } + + if (fromIEW->iewBlock[tid]) { + stalls[tid].iew = true; + } + + if (fromIEW->iewUnblock[tid]) { + assert(stalls[tid].iew); + assert(!fromIEW->iewBlock[tid]); + stalls[tid].iew = false; + } + + if (fromCommit->commitBlock[tid]) { + stalls[tid].commit = true; + } + + if (fromCommit->commitUnblock[tid]) { + assert(stalls[tid].commit); + assert(!fromCommit->commitBlock[tid]); + stalls[tid].commit = false; + } + // Check squash signals from commit. - if (fromCommit->commitInfo.squash) { - DPRINTF(Fetch, "Fetch: Squashing instructions due to squash " - "from commit.\n"); + if (fromCommit->commitInfo[tid].squash) { + + DPRINTF(Fetch, "[tid:%u]: Squashing instructions due to squash " + "from commit.\n",tid); // In any case, squash. - squash(fromCommit->commitInfo.nextPC); + squash(fromCommit->commitInfo[tid].nextPC,tid); // Also check if there's a mispredict that happened. 
- if (fromCommit->commitInfo.branchMispredict) { - branchPred.squash(fromCommit->commitInfo.doneSeqNum, - fromCommit->commitInfo.nextPC, - fromCommit->commitInfo.branchTaken); + if (fromCommit->commitInfo[tid].branchMispredict) { + branchPred.squash(fromCommit->commitInfo[tid].doneSeqNum, + fromCommit->commitInfo[tid].nextPC, + fromCommit->commitInfo[tid].branchTaken, + tid); } else { - branchPred.squash(fromCommit->commitInfo.doneSeqNum); + branchPred.squash(fromCommit->commitInfo[tid].doneSeqNum, + tid); } - return; - } else if (fromCommit->commitInfo.doneSeqNum) { + return true; + } else if (fromCommit->commitInfo[tid].doneSeqNum) { // Update the branch predictor if it wasn't a squashed instruction - // that was braodcasted. - branchPred.update(fromCommit->commitInfo.doneSeqNum); + // that was broadcasted. + branchPred.update(fromCommit->commitInfo[tid].doneSeqNum, tid); } // Check ROB squash signals from commit. - if (fromCommit->commitInfo.robSquashing) { - DPRINTF(Fetch, "Fetch: ROB is still squashing.\n"); + if (fromCommit->commitInfo[tid].robSquashing) { + DPRINTF(Fetch, "[tid:%u]: ROB is still squashing Thread %u.\n", tid); // Continue to squash. - _status = Squashing; + fetchStatus[tid] = Squashing; - ++fetchSquashCycles; - return; + return true; } // Check squash signals from decode. - if (fromDecode->decodeInfo.squash) { - DPRINTF(Fetch, "Fetch: Squashing instructions due to squash " - "from decode.\n"); + if (fromDecode->decodeInfo[tid].squash) { + DPRINTF(Fetch, "[tid:%u]: Squashing instructions due to squash " + "from decode.\n",tid); // Update the branch predictor. 
- if (fromDecode->decodeInfo.branchMispredict) { - branchPred.squash(fromDecode->decodeInfo.doneSeqNum, - fromDecode->decodeInfo.nextPC, - fromDecode->decodeInfo.branchTaken); + if (fromDecode->decodeInfo[tid].branchMispredict) { + branchPred.squash(fromDecode->decodeInfo[tid].doneSeqNum, + fromDecode->decodeInfo[tid].nextPC, + fromDecode->decodeInfo[tid].branchTaken, + tid); } else { - branchPred.squash(fromDecode->decodeInfo.doneSeqNum); + branchPred.squash(fromDecode->decodeInfo[tid].doneSeqNum, + tid); } - if (_status != Squashing) { - // Squash unless we're already squashing? - squashFromDecode(fromDecode->decodeInfo.nextPC, - fromDecode->decodeInfo.doneSeqNum); - return; + if (fetchStatus[tid] != Squashing) { + // Squash unless we're already squashing + squashFromDecode(fromDecode->decodeInfo[tid].nextPC, + fromDecode->decodeInfo[tid].doneSeqNum, + tid); + + return true; } } - // Check if any of the stall signals are high. - if (fromDecode->decodeInfo.stall || - fromRename->renameInfo.stall || - fromIEW->iewInfo.stall || - fromCommit->commitInfo.stall) - { - // Block stage, regardless of current status. + if (checkStall(tid) && fetchStatus[tid] != IcacheMissStall) { + DPRINTF(Fetch, "[tid:%i]: Setting to blocked\n",tid); - DPRINTF(Fetch, "Fetch: Stalling stage.\n"); - DPRINTF(Fetch, "Fetch: Statuses: Decode: %i Rename: %i IEW: %i " - "Commit: %i\n", - fromDecode->decodeInfo.stall, - fromRename->renameInfo.stall, - fromIEW->iewInfo.stall, - fromCommit->commitInfo.stall); + fetchStatus[tid] = Blocked; - _status = Blocked; - - ++fetchBlockedCycles; - return; - } else if (_status == Blocked) { - // Unblock stage if status is currently blocked and none of the - // stall signals are being held high. - _status = Running; - - ++fetchBlockedCycles; - return; + return true; } - // If fetch has reached this point, then there are no squash signals - // still being held high. Check if fetch is in the squashing state; - // if so, fetch can switch to running. 
- // Similarly, there are no blocked signals still being held high. - // Check if fetch is in the blocked state; if so, fetch can switch to - // running. - if (_status == Squashing) { - DPRINTF(Fetch, "Fetch: Done squashing, switching to running.\n"); + if (fetchStatus[tid] == Blocked || + fetchStatus[tid] == Squashing) { + // Switch status to running if fetch isn't being told to block or + // squash this cycle. + DPRINTF(Fetch, "[tid:%i]: Done squashing, switching to running.\n", + tid); - // Switch status to running - _status = Running; + fetchStatus[tid] = Running; - ++fetchCycles; - - fetch(); - } else if (_status != IcacheMissStall) { - DPRINTF(Fetch, "Fetch: Running stage.\n"); - - ++fetchCycles; - - fetch(); + return true; } + + // If we've reached this point, we have not gotten any signals that + // cause fetch to change its status. Fetch remains the same as before. + return false; } template void -SimpleFetch::fetch() +DefaultFetch::fetch(bool &status_change) { ////////////////////////////////////////// // Start actual fetch ////////////////////////////////////////// + int tid = getFetchingThread(fetchPolicy); + + if (tid == -1) { + DPRINTF(Fetch,"There are no more threads available to fetch from.\n"); + + // Breaks looping condition in tick() + threadFetched = numFetchingThreads; + return; + } // The current PC. - Addr fetch_PC = cpu->readPC(); + Addr &fetch_PC = PC[tid]; // Fault code for memory access. Fault fault = NoFault; @@ -473,45 +874,54 @@ SimpleFetch::fetch() // If returning from the delay of a cache miss, then update the status // to running, otherwise do the cache access. Possibly move this up // to tick() function. - if (_status == IcacheMissComplete) { - DPRINTF(Fetch, "Fetch: Icache miss is complete.\n"); + if (fetchStatus[tid] == IcacheMissComplete) { + DPRINTF(Fetch, "[tid:%i]: Icache miss is complete.\n", + tid); - // Reset the completion event to NULL. 
- memReq->completionEvent = NULL; + fetchStatus[tid] = Running; + status_change = true; + } else if (fetchStatus[tid] == Running) { + DPRINTF(Fetch, "[tid:%i]: Attempting to translate and read " + "instruction, starting at PC %08p.\n", + tid, fetch_PC); - _status = Running; + bool fetch_success = fetchCacheLine(fetch_PC, fault, tid); + if (!fetch_success) + return; } else { - DPRINTF(Fetch, "Fetch: Attempting to translate and read " - "instruction, starting at PC %08p.\n", - fetch_PC); + if (fetchStatus[tid] == Blocked) { + ++fetchBlockedCycles; + } else if (fetchStatus[tid] == Squashing) { + ++fetchSquashCycles; + } - fault = fetchCacheLine(fetch_PC); - } - - // If we had a stall due to an icache miss, then return. It'd - // be nicer if this were handled through the kind of fault that - // is returned by the function. - if (_status == IcacheMissStall) { + // Status is Idle, Squashing, Blocked, or IcacheMissStall, so + // fetch should do nothing. return; } - // As far as timing goes, the CPU will need to send an event through - // the MemReq in order to be woken up once the memory access completes. - // Probably have a status on a per thread basis so each thread can - // block independently and be woken up independently. + ++fetchCycles; + + // If we had a stall due to an icache miss, then return. + if (fetchStatus[tid] == IcacheMissStall) { + status_change = true; + return; + } Addr next_PC = fetch_PC; InstSeqNum inst_seq; MachInst inst; - unsigned offset = fetch_PC & cacheBlkMask; - unsigned fetched; + ExtMachInst ext_inst; + // @todo: Fix this hack. + unsigned offset = (fetch_PC & cacheBlkMask) & ~3; if (fault == NoFault) { // If the read of the first instruction was successful, then grab the // instructions from the rest of the cache line and put them into the // queue heading to decode. 
- DPRINTF(Fetch, "Fetch: Adding instructions to queue to decode.\n"); + DPRINTF(Fetch, "[tid:%i]: Adding instructions to queue to " + "decode.\n",tid); ////////////////////////// // Fetch first instruction @@ -521,12 +931,11 @@ SimpleFetch::fetch() // ended this fetch block. bool predicted_branch = false; - for (fetched = 0; + for (; offset < cacheBlkSize && - fetched < fetchWidth && + numInst < fetchWidth && !predicted_branch; - ++fetched) - { + ++numInst) { // Get a sequence number. inst_seq = cpu->getAndIncrementInstSeq(); @@ -536,31 +945,40 @@ SimpleFetch::fetch() // Get the instruction from the array of the cache line. inst = gtoh(*reinterpret_cast - (&cacheData[offset])); + (&cacheData[tid][offset])); + + ext_inst = TheISA::makeExtMI(inst, fetch_PC); // Create a new DynInst from the instruction fetched. - DynInstPtr instruction = new DynInst(inst, fetch_PC, next_PC, + DynInstPtr instruction = new DynInst(ext_inst, fetch_PC, + next_PC, inst_seq, cpu); + instruction->setThread(tid); - DPRINTF(Fetch, "Fetch: Instruction %i created, with PC %#x\n", - inst_seq, instruction->readPC()); + instruction->setASID(tid); - DPRINTF(Fetch, "Fetch: Instruction opcode is: %03p\n", - OPCODE(inst)); + instruction->setState(cpu->thread[tid]); + + DPRINTF(Fetch, "[tid:%i]: Instruction PC %#x created " + "[sn:%lli]\n", + tid, instruction->readPC(), inst_seq); + + DPRINTF(Fetch, "[tid:%i]: Instruction is: %s\n", + tid, instruction->staticInst->disassemble(fetch_PC)); instruction->traceData = - Trace::getInstRecord(curTick, cpu->xcBase(), cpu, + Trace::getInstRecord(curTick, cpu->xcBase(tid), cpu, instruction->staticInst, - instruction->readPC(), 0); + instruction->readPC(),tid); predicted_branch = lookupAndUpdateNextPC(instruction, next_PC); // Add instruction to the CPU's list of instructions. - cpu->addInst(instruction); + instruction->setInstListIt(cpu->addInst(instruction)); // Write the instruction to the first slot in the queue // that heads to decode. 
- toDecode->insts[fetched] = instruction; + toDecode->insts[numInst] = instruction; toDecode->size++; @@ -570,27 +988,36 @@ SimpleFetch::fetch() // Move to the next instruction, unless we have a branch. fetch_PC = next_PC; + if (instruction->isQuiesce()) { + warn("%lli: Quiesce instruction encountered, halting fetch!", curTick); + fetchStatus[tid] = QuiescePending; + ++numInst; + status_change = true; + break; + } + offset+= instSize; } + } - fetch_nisn_dist.sample(fetched); + if (numInst > 0) { + wroteToTimeBuffer = true; } // Now that fetching is completed, update the PC to signify what the next - // cycle will be. Might want to move this to the beginning of this - // function so that the PC updates at the beginning of everything. - // Or might want to leave setting the PC to the main CPU, with fetch - // only changing the nextPC (will require correct determination of - // next PC). + // cycle will be. if (fault == NoFault) { - DPRINTF(Fetch, "Fetch: Setting PC to %08p.\n", next_PC); - cpu->setPC(next_PC); - cpu->setNextPC(next_PC + instSize); + + DPRINTF(Fetch, "[tid:%i]: Setting PC to %08p.\n",tid, next_PC); + + + PC[tid] = next_PC; + nextPC[tid] = next_PC + instSize; } else { // If the issue was an icache miss, then we can just return and // wait until it is handled. - if (_status == IcacheMissStall) { - return; + if (fetchStatus[tid] == IcacheMissStall) { + panic("Fetch should have exited prior to this!"); } // Handle the fault. @@ -601,17 +1028,169 @@ SimpleFetch::fetch() // have it handled by the upper level CPU class which peeks into the // time buffer and sees if a squash comes along, in which case it // changes the status. - - DPRINTF(Fetch, "Fetch: Blocked, need to handle the trap.\n"); - - _status = Blocked; #if FULL_SYSTEM + // Tell the commit stage the fault we had. 
+ toDecode->fetchFault = fault; + toDecode->fetchFaultSN = cpu->globalSeqNum; + + DPRINTF(Fetch, "[tid:%i]: Blocked, need to handle the trap.\n",tid); + + fetchStatus[tid] = TrapPending; + status_change = true; + + warn("%lli fault (%d) detected @ PC %08p", curTick, fault, PC[tid]); // cpu->trap(fault); // Send a signal to the ROB indicating that there's a trap from the // fetch stage that needs to be handled. Need to indicate that // there's a fault, and the fault type. #else // !FULL_SYSTEM - fatal("fault (%d) detected @ PC %08p", fault, cpu->readPC()); + fatal("fault (%d) detected @ PC %08p", fault, PC[tid]); #endif // FULL_SYSTEM } } + + +/////////////////////////////////////// +// // +// SMT FETCH POLICY MAINTAINED HERE // +// // +/////////////////////////////////////// +template +int +DefaultFetch::getFetchingThread(FetchPriority &fetch_priority) +{ + if (numThreads > 1) { + switch (fetch_priority) { + + case SingleThread: + return 0; + + case RoundRobin: + return roundRobin(); + + case IQ: + return iqCount(); + + case LSQ: + return lsqCount(); + + case Branch: + return branchCount(); + + default: + return -1; + } + } else { + int tid = *((*activeThreads).begin()); + + if (fetchStatus[tid] == Running || + fetchStatus[tid] == IcacheMissComplete || + fetchStatus[tid] == Idle) { + return tid; + } else { + return -1; + } + } + +} + + +template +int +DefaultFetch::roundRobin() +{ + list::iterator pri_iter = priorityList.begin(); + list::iterator end = priorityList.end(); + + int high_pri; + + while (pri_iter != end) { + high_pri = *pri_iter; + + assert(high_pri <= numThreads); + + if (fetchStatus[high_pri] == Running || + fetchStatus[high_pri] == IcacheMissComplete || + fetchStatus[high_pri] == Idle) { + + priorityList.erase(pri_iter); + priorityList.push_back(high_pri); + + return high_pri; + } + + pri_iter++; + } + + return -1; +} + +template +int +DefaultFetch::iqCount() +{ + priority_queue PQ; + + list::iterator threads = (*activeThreads).begin(); + + while 
(threads != (*activeThreads).end()) { + unsigned tid = *threads++; + + PQ.push(fromIEW->iewInfo[tid].iqCount); + } + + while (!PQ.empty()) { + + unsigned high_pri = PQ.top(); + + if (fetchStatus[high_pri] == Running || + fetchStatus[high_pri] == IcacheMissComplete || + fetchStatus[high_pri] == Idle) + return high_pri; + else + PQ.pop(); + + } + + return -1; +} + +template +int +DefaultFetch::lsqCount() +{ + priority_queue PQ; + + + list::iterator threads = (*activeThreads).begin(); + + while (threads != (*activeThreads).end()) { + unsigned tid = *threads++; + + PQ.push(fromIEW->iewInfo[tid].ldstqCount); + } + + while (!PQ.empty()) { + + unsigned high_pri = PQ.top(); + + if (fetchStatus[high_pri] == Running || + fetchStatus[high_pri] == IcacheMissComplete || + fetchStatus[high_pri] == Idle) + return high_pri; + else + PQ.pop(); + + } + + return -1; +} + +template +int +DefaultFetch::branchCount() +{ + list::iterator threads = (*activeThreads).begin(); + + return *threads; +} diff --git a/cpu/o3/free_list.cc b/cpu/o3/free_list.cc index 6f0b4be1e..bd0f4f034 100644 --- a/cpu/o3/free_list.cc +++ b/cpu/o3/free_list.cc @@ -30,7 +30,8 @@ #include "cpu/o3/free_list.hh" -SimpleFreeList::SimpleFreeList(unsigned _numLogicalIntRegs, +SimpleFreeList::SimpleFreeList(unsigned activeThreads, + unsigned _numLogicalIntRegs, unsigned _numPhysicalIntRegs, unsigned _numLogicalFloatRegs, unsigned _numPhysicalFloatRegs) @@ -40,43 +41,30 @@ SimpleFreeList::SimpleFreeList(unsigned _numLogicalIntRegs, numPhysicalFloatRegs(_numPhysicalFloatRegs), numPhysicalRegs(numPhysicalIntRegs + numPhysicalFloatRegs) { - DPRINTF(FreeList, "FreeList: Creating new free list object.\n"); - - // DEBUG stuff. 
- freeIntRegsScoreboard.resize(numPhysicalIntRegs); - - freeFloatRegsScoreboard.resize(numPhysicalRegs); - - for (PhysRegIndex i = 0; i < numLogicalIntRegs; ++i) { - freeIntRegsScoreboard[i] = 0; - } + DPRINTF(FreeList, "Creating new free list object.\n"); // Put all of the extra physical registers onto the free list. This // means excluding all of the base logical registers. - for (PhysRegIndex i = numLogicalIntRegs; + for (PhysRegIndex i = numLogicalIntRegs * activeThreads; i < numPhysicalIntRegs; ++i) { freeIntRegs.push(i); - - freeIntRegsScoreboard[i] = 1; - } - - for (PhysRegIndex i = 0; i < numPhysicalIntRegs + numLogicalFloatRegs; - ++i) - { - freeFloatRegsScoreboard[i] = 0; } // Put all of the extra physical registers onto the free list. This // means excluding all of the base logical registers. Because the // float registers' indices start where the physical registers end, // some math must be done to determine where the free registers start. - for (PhysRegIndex i = numPhysicalIntRegs + numLogicalFloatRegs; - i < numPhysicalRegs; ++i) + PhysRegIndex i = numPhysicalIntRegs + (numLogicalFloatRegs * activeThreads); + + for ( ; i < numPhysicalRegs; ++i) { freeFloatRegs.push(i); - - freeFloatRegsScoreboard[i] = 1; } } +std::string +SimpleFreeList::name() const +{ + return "cpu.freelist"; +} diff --git a/cpu/o3/free_list.hh b/cpu/o3/free_list.hh index 0b85dba1e..29e84cd44 100644 --- a/cpu/o3/free_list.hh +++ b/cpu/o3/free_list.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __CPU_O3_CPU_FREE_LIST_HH__ -#define __CPU_O3_CPU_FREE_LIST_HH__ +#ifndef __CPU_O3_FREE_LIST_HH__ +#define __CPU_O3_FREE_LIST_HH__ #include #include @@ -45,10 +45,9 @@ * other classes, it assumes that the indices for the floating point * registers starts after the integer registers end. Hence the variable * numPhysicalIntRegs is logically equivalent to the baseFP dependency. 
- * Note that - * while this most likely should be called FreeList, the name "FreeList" - * is used in a typedef within the CPU Policy, and therefore no class - * can be named simply "FreeList". + * Note that while this most likely should be called FreeList, the name + * "FreeList" is used in a typedef within the CPU Policy, and therefore no + * class can be named simply "FreeList". * @todo: Give a better name to the base FP dependency. */ class SimpleFreeList @@ -75,36 +74,51 @@ class SimpleFreeList /** Total number of physical registers. */ int numPhysicalRegs; - /** DEBUG stuff below. */ - std::vector freeIntRegsScoreboard; - - std::vector freeFloatRegsScoreboard; - public: - SimpleFreeList(unsigned _numLogicalIntRegs, + /** Constructs a free list. + * @param activeThreads Number of active threads. + * @param _numLogicalIntRegs Number of logical integer registers. + * @param _numPhysicalIntRegs Number of physical integer registers. + * @param _numLogicalFloatRegs Number of logical fp registers. + * @param _numPhysicalFloatRegs Number of physical fp registers. + */ + SimpleFreeList(unsigned activeThreads, + unsigned _numLogicalIntRegs, unsigned _numPhysicalIntRegs, unsigned _numLogicalFloatRegs, unsigned _numPhysicalFloatRegs); + /** Gives the name of the freelist. */ + std::string name() const; + + /** Gets a free integer register. */ inline PhysRegIndex getIntReg(); + /** Gets a free fp register. */ inline PhysRegIndex getFloatReg(); + /** Adds a register back to the free list. */ inline void addReg(PhysRegIndex freed_reg); + /** Adds an integer register back to the free list. */ inline void addIntReg(PhysRegIndex freed_reg); + /** Adds a fp register back to the free list. */ inline void addFloatReg(PhysRegIndex freed_reg); + /** Checks if there are any free integer registers. */ bool hasFreeIntRegs() { return !freeIntRegs.empty(); } + /** Checks if there are any free fp registers. 
*/ bool hasFreeFloatRegs() { return !freeFloatRegs.empty(); } + /** Returns the number of free integer registers. */ int numFreeIntRegs() { return freeIntRegs.size(); } + /** Returns the number of free fp registers. */ int numFreeFloatRegs() { return freeFloatRegs.size(); } }; @@ -112,7 +126,8 @@ class SimpleFreeList inline PhysRegIndex SimpleFreeList::getIntReg() { - DPRINTF(Rename, "FreeList: Trying to get free integer register.\n"); + DPRINTF(FreeList, "Trying to get free integer register.\n"); + if (freeIntRegs.empty()) { panic("No free integer registers!"); } @@ -121,17 +136,14 @@ SimpleFreeList::getIntReg() freeIntRegs.pop(); - // DEBUG - assert(freeIntRegsScoreboard[free_reg]); - freeIntRegsScoreboard[free_reg] = 0; - return(free_reg); } inline PhysRegIndex SimpleFreeList::getFloatReg() { - DPRINTF(Rename, "FreeList: Trying to get free float register.\n"); + DPRINTF(FreeList, "Trying to get free float register.\n"); + if (freeFloatRegs.empty()) { panic("No free integer registers!"); } @@ -140,42 +152,28 @@ SimpleFreeList::getFloatReg() freeFloatRegs.pop(); - // DEBUG - assert(freeFloatRegsScoreboard[free_reg]); - freeFloatRegsScoreboard[free_reg] = 0; - return(free_reg); } inline void SimpleFreeList::addReg(PhysRegIndex freed_reg) { - DPRINTF(Rename, "Freelist: Freeing register %i.\n", freed_reg); + DPRINTF(FreeList,"Freeing register %i.\n", freed_reg); //Might want to add in a check for whether or not this register is //already in there. A bit vector or something similar would be useful. 
if (freed_reg < numPhysicalIntRegs) { - freeIntRegs.push(freed_reg); - - // DEBUG - assert(freeIntRegsScoreboard[freed_reg] == false); - freeIntRegsScoreboard[freed_reg] = 1; + if (freed_reg != TheISA::ZeroReg) + freeIntRegs.push(freed_reg); } else if (freed_reg < numPhysicalRegs) { - freeFloatRegs.push(freed_reg); - - // DEBUG - assert(freeFloatRegsScoreboard[freed_reg] == false); - freeFloatRegsScoreboard[freed_reg] = 1; + if (freed_reg != (TheISA::ZeroReg + numPhysicalIntRegs)) + freeFloatRegs.push(freed_reg); } } inline void SimpleFreeList::addIntReg(PhysRegIndex freed_reg) { - DPRINTF(Rename, "Freelist: Freeing int register %i.\n", freed_reg); - - // DEBUG - assert(!freeIntRegsScoreboard[freed_reg]); - freeIntRegsScoreboard[freed_reg] = 1; + DPRINTF(FreeList,"Freeing int register %i.\n", freed_reg); freeIntRegs.push(freed_reg); } @@ -183,13 +181,9 @@ SimpleFreeList::addIntReg(PhysRegIndex freed_reg) inline void SimpleFreeList::addFloatReg(PhysRegIndex freed_reg) { - DPRINTF(Rename, "Freelist: Freeing float register %i.\n", freed_reg); - - // DEBUG - assert(!freeFloatRegsScoreboard[freed_reg]); - freeFloatRegsScoreboard[freed_reg] = 1; + DPRINTF(FreeList,"Freeing float register %i.\n", freed_reg); freeFloatRegs.push(freed_reg); } -#endif // __CPU_O3_CPU_FREE_LIST_HH__ +#endif // __CPU_O3_FREE_LIST_HH__ diff --git a/cpu/o3/fu_pool.cc b/cpu/o3/fu_pool.cc new file mode 100644 index 000000000..9b6ac15d9 --- /dev/null +++ b/cpu/o3/fu_pool.cc @@ -0,0 +1,281 @@ +/* + * Copyright (c) 2002-2005 The Regents of The University of Michigan + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include + +#include "cpu/o3/fu_pool.hh" +#include "encumbered/cpu/full/fu_pool.hh" +#include "sim/builder.hh" + +using namespace std; + +//////////////////////////////////////////////////////////////////////////// +// +// A pool of function units +// + +inline void +FUPool::FUIdxQueue::addFU(int fu_idx) +{ + funcUnitsIdx.push_back(fu_idx); + ++size; +} + +inline int +FUPool::FUIdxQueue::getFU() +{ + int retval = funcUnitsIdx[idx++]; + + if (idx == size) + idx = 0; + + return retval; +} + +FUPool::~FUPool() +{ + fuListIterator i = funcUnits.begin(); + fuListIterator end = funcUnits.end(); + for (; i != end; ++i) + delete *i; +} + + +// Constructor +FUPool::FUPool(string name, vector paramList) + : SimObject(name) +{ + numFU = 0; + + funcUnits.clear(); + + for (int i = 0; i < Num_OpClasses; ++i) { + maxOpLatencies[i] = 0; + maxIssueLatencies[i] = 0; + } + + // + // Iterate through the list of FUDescData structures + // + for (FUDDiterator i = paramList.begin(); i != paramList.end(); ++i) { + + // + // Don't bother with this if we're not going to create any FU's + // + if ((*i)->number) { + // + // Create the FuncUnit object from this structure + // - add the capabilities listed in the FU's operation + // description + // + // We create the first unit, then duplicate it as needed + // + FuncUnit *fu = new FuncUnit; + + OPDDiterator j = (*i)->opDescList.begin(); + OPDDiterator end = (*i)->opDescList.end(); + for (; j != end; ++j) { + // indicate that this pool has this capability + capabilityList.set((*j)->opClass); + + // Add each of the FU's that will have this capability to the + // appropriate queue. 
+ for (int k = 0; k < (*i)->number; ++k) + fuPerCapList[(*j)->opClass].addFU(numFU + k); + + // indicate that this FU has the capability + fu->addCapability((*j)->opClass, (*j)->opLat, (*j)->issueLat); + + if ((*j)->opLat > maxOpLatencies[(*j)->opClass]) + maxOpLatencies[(*j)->opClass] = (*j)->opLat; + + if ((*j)->issueLat > maxIssueLatencies[(*j)->opClass]) + maxIssueLatencies[(*j)->opClass] = (*j)->issueLat; + } + + numFU++; + + // Add the appropriate number of copies of this FU to the list + ostringstream s; + + s << (*i)->name() << "(0)"; + fu->name = s.str(); + funcUnits.push_back(fu); + + for (int c = 1; c < (*i)->number; ++c) { + ostringstream s; + numFU++; + FuncUnit *fu2 = new FuncUnit(*fu); + + s << (*i)->name() << "(" << c << ")"; + fu2->name = s.str(); + funcUnits.push_back(fu2); + } + } + } + + unitBusy.resize(numFU); + + for (int i = 0; i < numFU; i++) { + unitBusy[i] = false; + } +} + +void +FUPool::annotateMemoryUnits(unsigned hit_latency) +{ + maxOpLatencies[MemReadOp] = hit_latency; + + fuListIterator i = funcUnits.begin(); + fuListIterator iend = funcUnits.end(); + for (; i != iend; ++i) { + if ((*i)->provides(MemReadOp)) + (*i)->opLatency(MemReadOp) = hit_latency; + + if ((*i)->provides(MemWriteOp)) + (*i)->opLatency(MemWriteOp) = hit_latency; + } +} + +int +FUPool::getUnit(OpClass capability) +{ + // If this pool doesn't have the specified capability, + // return this information to the caller + if (!capabilityList[capability]) + return -2; + + int fu_idx = fuPerCapList[capability].getFU(); + int start_idx = fu_idx; + + // Iterate through the circular queue if needed, stopping if we've reached + // the first element again. 
+ while (unitBusy[fu_idx]) { + fu_idx = fuPerCapList[capability].getFU(); + if (fu_idx == start_idx) { + // No FU available + return -1; + } + } + + unitBusy[fu_idx] = true; + + return fu_idx; +} + +void +FUPool::freeUnit(int fu_idx) +{ + assert(unitBusy[fu_idx]); + unitsToBeFreed.push_back(fu_idx); +} + +void +FUPool::processFreeUnits() +{ + while (!unitsToBeFreed.empty()) { + int fu_idx = unitsToBeFreed.back(); + unitsToBeFreed.pop_back(); + + assert(unitBusy[fu_idx]); + + unitBusy[fu_idx] = false; + } +} + +void +FUPool::dump() +{ + cout << "Function Unit Pool (" << name() << ")\n"; + cout << "======================================\n"; + cout << "Free List:\n"; + + for (int i = 0; i < numFU; ++i) { + if (unitBusy[i]) { + continue; + } + + cout << " [" << i << "] : "; + + cout << funcUnits[i]->name << " "; + + cout << "\n"; + } + + cout << "======================================\n"; + cout << "Busy List:\n"; + for (int i = 0; i < numFU; ++i) { + if (!unitBusy[i]) { + continue; + } + + cout << " [" << i << "] : "; + + cout << funcUnits[i]->name << " "; + + cout << "\n"; + } +} + +// + +//////////////////////////////////////////////////////////////////////////// +// +// The SimObjects we use to get the FU information into the simulator +// +//////////////////////////////////////////////////////////////////////////// + +// +// FUPool - Contails a list of FUDesc objects to make available +// + +// +// The FuPool object +// + +BEGIN_DECLARE_SIM_OBJECT_PARAMS(FUPool) + + SimObjectVectorParam FUList; + +END_DECLARE_SIM_OBJECT_PARAMS(FUPool) + + +BEGIN_INIT_SIM_OBJECT_PARAMS(FUPool) + + INIT_PARAM(FUList, "list of FU's for this pool") + +END_INIT_SIM_OBJECT_PARAMS(FUPool) + + +CREATE_SIM_OBJECT(FUPool) +{ + return new FUPool(getInstanceName(), FUList); +} + +REGISTER_SIM_OBJECT("FUPool", FUPool) + diff --git a/cpu/o3/fu_pool.hh b/cpu/o3/fu_pool.hh new file mode 100644 index 000000000..d7b7acadb --- /dev/null +++ b/cpu/o3/fu_pool.hh @@ -0,0 +1,159 @@ +/* + * Copyright (c) 
2002-2005 The Regents of The University of Michigan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __CPU_O3_FU_POOL_HH__ +#define __CPU_O3_FU_POOL_HH__ + +#include +#include +#include +#include + +#include "base/sched_list.hh" +#include "encumbered/cpu/full/op_class.hh" +#include "sim/sim_object.hh" + +class FUDesc; +class FuncUnit; + +/** + * Pool of FU's, specific to the new CPU model. 
The old FU pool had lists of + * free units and busy units, and whenever a FU was needed it would iterate + * through the free units to find a FU that provided the capability. This pool + * has lists of units specific to each of the capabilities, and whenever a FU + * is needed, it iterates through that list to find a free unit. The previous + * FU pool would have to be ticked each cycle to update which units became + * free. This FU pool lets the IEW stage handle freeing units, which frees + * them as their scheduled execution events complete. This limits units in this + * model to either have identical issue and op latencies, or 1 cycle issue + * latencies. + */ +class FUPool : public SimObject +{ + private: + /** Maximum op execution latencies, per op class. */ + unsigned maxOpLatencies[Num_OpClasses]; + /** Maximum issue latencies, per op class. */ + unsigned maxIssueLatencies[Num_OpClasses]; + + /** Bitvector listing capabilities of this FU pool. */ + std::bitset capabilityList; + + /** Bitvector listing which FUs are busy. */ + std::vector unitBusy; + + /** List of units to be freed at the end of this cycle. */ + std::vector unitsToBeFreed; + + /** + * Class that implements a circular queue to hold FU indices. The hope is + * that FUs that have been just used will be moved to the end of the queue + * by iterating through it, thus leaving free units at the head of the + * queue. + */ + class FUIdxQueue { + public: + /** Constructs a circular queue of FU indices. */ + FUIdxQueue() + : idx(0), size(0) + { } + + /** Adds a FU to the queue. */ + inline void addFU(int fu_idx); + + /** Returns the index of the FU at the head of the queue, and changes + * the index to the next element. + */ + inline int getFU(); + + private: + /** Circular queue index. */ + int idx; + + /** Size of the queue. */ + int size; + + /** Queue of FU indices. */ + std::vector funcUnitsIdx; + }; + + /** Per op class queues of FUs that provide that capability. 
*/ + FUIdxQueue fuPerCapList[Num_OpClasses]; + + /** Number of FUs. */ + int numFU; + + /** Functional units. */ + std::vector funcUnits; + + typedef std::vector::iterator fuListIterator; + + public: + + /** Constructs a FU pool. */ + FUPool(std::string name, std::vector l); + ~FUPool(); + + /** Annotates units that provide memory operations. Included only because + * old FU pool provided this function. + */ + void annotateMemoryUnits(unsigned hit_latency); + + /** + * Gets a FU providing the requested capability. Will mark the unit as busy, + * but leaves the freeing of the unit up to the IEW stage. + * @param capability The capability requested. + * @return Returns -2 if the FU pool does not have the capability, -1 if + * there is no free FU, and the FU's index otherwise. + */ + int getUnit(OpClass capability); + + /** Frees a FU at the end of this cycle. */ + void freeUnit(int fu_idx); + + /** Frees all FUs on the list. */ + void processFreeUnits(); + + /** Returns the total number of FUs. */ + int size() { return numFU; } + + /** Debugging function used to dump FU information. */ + void dump(); + + /** Returns the operation execution latency of the given capability. */ + unsigned getOpLatency(OpClass capability) { + return maxOpLatencies[capability]; + } + + /** Returns the issue latency of the given capability. */ + unsigned getIssueLatency(OpClass capability) { + return maxIssueLatencies[capability]; + } +}; + +#endif // __CPU_O3_FU_POOL_HH__ diff --git a/cpu/o3/iew.cc b/cpu/o3/iew.cc index 45b5610e7..90d035f71 100644 --- a/cpu/o3/iew.cc +++ b/cpu/o3/iew.cc @@ -31,4 +31,4 @@ #include "cpu/o3/iew_impl.hh" #include "cpu/o3/inst_queue.hh" -template class SimpleIEW; +template class DefaultIEW; diff --git a/cpu/o3/iew.hh b/cpu/o3/iew.hh index 1e370d4e6..e55837812 100644 --- a/cpu/o3/iew.hh +++ b/cpu/o3/iew.hh @@ -26,22 +26,38 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -//Todo: Update with statuses. 
-//Need to handle delaying writes to the writeback bus if it's full at the -//given time. - -#ifndef __CPU_O3_CPU_SIMPLE_IEW_HH__ -#define __CPU_O3_CPU_SIMPLE_IEW_HH__ +#ifndef __CPU_O3_IEW_HH__ +#define __CPU_O3_IEW_HH__ #include -#include "config/full_system.hh" #include "base/statistics.hh" #include "base/timebuf.hh" +#include "config/full_system.hh" #include "cpu/o3/comm.hh" +#include "cpu/o3/scoreboard.hh" +#include "cpu/o3/lsq.hh" +class FUPool; + +/** + * DefaultIEW handles both single threaded and SMT IEW(issue/execute/writeback). + * It handles the dispatching of instructions to the LSQ/IQ as part of the issue + * stage, and has the IQ try to issue instructions each cycle. The execute + * latency is actually tied into the issue latency to allow the IQ to be able to + * do back-to-back scheduling without having to speculatively schedule + * instructions. This happens by having the IQ have access to the functional + * units, and the IQ gets the execution latencies from the FUs when it issues + * instructions. Instructions reach the execute stage on the last cycle of + * their execution, which is when the IQ knows to wake up any dependent + * instructions, allowing back to back scheduling. The execute portion of IEW + * separates memory instructions from non-memory instructions, either telling + * the LSQ to execute the instruction, or executing the instruction directly. + * The writeback portion of IEW completes the instructions by waking up any + * dependents, and marking the register ready on the scoreboard. 
+ */ template -class SimpleIEW +class DefaultIEW { private: //Typedefs from Impl @@ -52,7 +68,7 @@ class SimpleIEW typedef typename CPUPol::IQ IQ; typedef typename CPUPol::RenameMap RenameMap; - typedef typename CPUPol::LDSTQ LDSTQ; + typedef typename CPUPol::LSQ LSQ; typedef typename CPUPol::TimeStruct TimeStruct; typedef typename CPUPol::IEWStruct IEWStruct; @@ -60,77 +76,214 @@ class SimpleIEW typedef typename CPUPol::IssueStruct IssueStruct; friend class Impl::FullCPU; + friend class CPUPol::IQ; + public: + /** Overall IEW stage status. Used to determine if the CPU can + * deschedule itself due to a lack of activity. + */ enum Status { + Active, + Inactive + }; + + /** Status for Issue, Execute, and Writeback stages. */ + enum StageStatus { Running, Blocked, Idle, + StartSquash, Squashing, Unblocking }; private: + /** Overall stage status. */ Status _status; - Status _issueStatus; - Status _exeStatus; - Status _wbStatus; + /** Dispatch status. */ + StageStatus dispatchStatus[Impl::MaxThreads]; + /** Execute status. */ + StageStatus exeStatus; + /** Writeback status. */ + StageStatus wbStatus; public: - class WritebackEvent : public Event { + /** LdWriteback event for a load completion. */ + class LdWritebackEvent : public Event { private: + /** Instruction that is writing back data to the register file. */ DynInstPtr inst; - SimpleIEW *iewStage; + /** Pointer to IEW stage. */ + DefaultIEW *iewStage; public: - WritebackEvent(DynInstPtr &_inst, SimpleIEW *_iew); + /** Constructs a load writeback event. */ + LdWritebackEvent(DynInstPtr &_inst, DefaultIEW *_iew); + /** Processes writeback event. */ virtual void process(); + /** Returns the description of the writeback event. */ virtual const char *description(); }; public: - SimpleIEW(Params ¶ms); + /** Constructs a DefaultIEW with the given parameters. */ + DefaultIEW(Params *params); + /** Returns the name of the DefaultIEW stage. */ + std::string name() const; + + /** Registers statistics. 
*/ void regStats(); + /** Initializes stage; sends back the number of free IQ and LSQ entries. */ + void initStage(); + + /** Sets CPU pointer for IEW, IQ, and LSQ. */ void setCPU(FullCPU *cpu_ptr); + /** Sets main time buffer used for backwards communication. */ void setTimeBuffer(TimeBuffer *tb_ptr); + /** Sets time buffer for getting instructions coming from rename. */ void setRenameQueue(TimeBuffer *rq_ptr); + /** Sets time buffer to pass on instructions to commit. */ void setIEWQueue(TimeBuffer *iq_ptr); - void setRenameMap(RenameMap *rm_ptr); + /** Sets pointer to list of active threads. */ + void setActiveThreads(std::list *at_ptr); - void squash(); + /** Sets pointer to the scoreboard. */ + void setScoreboard(Scoreboard *sb_ptr); - void squashDueToBranch(DynInstPtr &inst); + /** Sets page table pointer within LSQ. */ +// void setPageTable(PageTable *pt_ptr); - void squashDueToMem(DynInstPtr &inst); - - void block(); - - inline void unblock(); + /** Squashes instructions in IEW for a specific thread. */ + void squash(unsigned tid); + /** Wakes all dependents of a completed instruction. */ void wakeDependents(DynInstPtr &inst); + /** Tells memory dependence unit that a memory instruction needs to be + * rescheduled. It will re-execute once replayMemInst() is called. + */ + void rescheduleMemInst(DynInstPtr &inst); + + /** Re-executes all rescheduled memory instructions. */ + void replayMemInst(DynInstPtr &inst); + + /** Sends an instruction to commit through the time buffer. */ void instToCommit(DynInstPtr &inst); - private: - void dispatchInsts(); + /** Inserts unused instructions of a thread into the skid buffer. */ + void skidInsert(unsigned tid); + /** Returns the max of the number of entries in all of the skid buffers. */ + int skidCount(); + + /** Returns if all of the skid buffers are empty. */ + bool skidsEmpty(); + + /** Updates overall IEW status based on all of the stages' statuses. 
*/ + void updateStatus(); + + /** Resets entries of the IQ and the LSQ. */ + void resetEntries(); + + /** Tells the CPU to wakeup if it has descheduled itself due to no + * activity. Used mainly by the LdWritebackEvent. + */ + void wakeCPU(); + + /** Reports to the CPU that there is activity this cycle. */ + void activityThisCycle(); + + /** Tells CPU that the IEW stage is active and running. */ + inline void activateStage(); + + /** Tells CPU that the IEW stage is inactive and idle. */ + inline void deactivateStage(); + +//#if !FULL_SYSTEM + /** Returns if the LSQ has any stores to writeback. */ + bool hasStoresToWB() { return ldstQueue.hasStoresToWB(); } +//#endif + + private: + /** Sends commit proper information for a squash due to a branch + * mispredict. + */ + void squashDueToBranch(DynInstPtr &inst, unsigned thread_id); + + /** Sends commit proper information for a squash due to a memory order + * violation. + */ + void squashDueToMemOrder(DynInstPtr &inst, unsigned thread_id); + + /** Sends commit proper information for a squash due to memory becoming + * blocked (younger issued instructions must be retried). + */ + void squashDueToMemBlocked(DynInstPtr &inst, unsigned thread_id); + + /** Sets Dispatch to blocked, and signals back to other stages to block. */ + void block(unsigned thread_id); + + /** Unblocks Dispatch if the skid buffer is empty, and signals back to + * other stages to unblock. + */ + void unblock(unsigned thread_id); + + /** Determines proper actions to take given Dispatch's status. */ + void dispatch(unsigned tid); + + /** Dispatches instructions to IQ and LSQ. */ + void dispatchInsts(unsigned tid); + + /** Executes instructions. In the case of memory operations, it informs the + * LSQ to execute the instructions. Also handles any redirects that occur + * due to the executed instructions. + */ void executeInsts(); + /** Writebacks instructions. 
In our model, the instruction's execute() + * function atomically reads registers, executes, and writes registers. + * Thus this writeback only wakes up dependent instructions, and informs + * the scoreboard of registers becoming ready. + */ + void writebackInsts(); + + /** Returns the number of valid, non-squashed instructions coming from + * rename to dispatch. + */ + unsigned validInstsFromRename(); + + /** Reads the stall signals. */ + void readStallSignals(unsigned tid); + + /** Checks if any of the stall conditions are currently true. */ + bool checkStall(unsigned tid); + + /** Processes inputs and changes state accordingly. */ + void checkSignalsAndUpdate(unsigned tid); + + /** Sorts instructions coming from rename into lists separated by thread. */ + void sortInsts(); + public: + /** Ticks IEW stage, causing Dispatch, the IQ, the LSQ, Execute, and + * Writeback to run for one cycle. + */ void tick(); - void iew(); - - //Interfaces to objects inside and outside of IEW. - /** Time buffer interface. */ + private: + /** Pointer to main time buffer used for backwards communication. */ TimeBuffer *timeBuffer; + /** Wire to write information heading to previous stages. */ + typename TimeBuffer::wire toFetch; + /** Wire to get commit's output from backwards time buffer. */ typename TimeBuffer::wire fromCommit; @@ -158,32 +311,67 @@ class SimpleIEW /** Wire to write infromation heading to commit. */ typename TimeBuffer::wire toCommit; - //Will need internal queue to hold onto instructions coming from - //the rename stage in case of a stall. - /** Skid buffer between rename and IEW. */ - std::queue skidBuffer; + /** Queue of all instructions coming from rename this cycle. */ + std::queue insts[Impl::MaxThreads]; - protected: + /** Skid buffer between rename and IEW. */ + std::queue skidBuffer[Impl::MaxThreads]; + + /** Scoreboard pointer. */ + Scoreboard* scoreboard; + + public: /** Instruction queue. */ IQ instQueue; - LDSTQ ldstQueue; + /** Load / store queue. 
*/ + LSQ ldstQueue; -#if !FULL_SYSTEM - public: - void lsqWriteback(); -#endif + /** Pointer to the functional unit pool. */ + FUPool *fuPool; private: - /** Pointer to rename map. Might not want this stage to directly - * access this though... - */ - RenameMap *renameMap; - - /** CPU interface. */ + /** CPU pointer. */ FullCPU *cpu; + /** Records if IEW has written to the time buffer this cycle, so that the + * CPU can deschedule itself if there is no activity. + */ + bool wroteToTimeBuffer; + + /** Source of possible stalls. */ + struct Stalls { + bool commit; + }; + + /** Stages that are telling IEW to stall. */ + Stalls stalls[Impl::MaxThreads]; + + /** Debug function to print instructions that are issued this cycle. */ + void printAvailableInsts(); + + public: + /** Records if the LSQ needs to be updated on the next cycle, so that + * IEW knows if there will be activity on the next cycle. + */ + bool updateLSQNextCycle; + private: + /** Records if there is a fetch redirect on this cycle for each thread. */ + bool fetchRedirect[Impl::MaxThreads]; + + /** Used to track if all instructions have been dispatched this cycle. + * If they have not, then blocking must have occurred, and the instructions + * would already be added to the skid buffer. + * @todo: Fix this hack. + */ + bool dispatchedAllInsts; + + /** Records if the queues have been changed (inserted or issued insts), + * so that IEW knows to broadcast the updated amount of free entries. + */ + bool updatedQueues; + /** Commit to IEW delay, in ticks. */ unsigned commitToIEWDelay; @@ -211,29 +399,63 @@ class SimpleIEW */ unsigned executeWidth; - /** Number of cycles stage has been squashing. Used so that the stage - * knows when it can start unblocking, which is when the previous stage - * has received the stall signal and clears up its outputs. - */ - unsigned cyclesSquashing; + /** Index into queue of instructions being written back. 
*/ + unsigned wbNumInst; + /** Cycle number within the queue of instructions being written back. + * Used in case there are too many instructions writing back at the current + * cycle and writesbacks need to be scheduled for the future. See comments + * in instToCommit(). + */ + unsigned wbCycle; + + /** Number of active threads. */ + unsigned numThreads; + + /** Pointer to list of active threads. */ + std::list *activeThreads; + + /** Maximum size of the skid buffer. */ + unsigned skidBufferMax; + + /** Stat for total number of idle cycles. */ Stats::Scalar<> iewIdleCycles; + /** Stat for total number of squashing cycles. */ Stats::Scalar<> iewSquashCycles; + /** Stat for total number of blocking cycles. */ Stats::Scalar<> iewBlockCycles; + /** Stat for total number of unblocking cycles. */ Stats::Scalar<> iewUnblockCycles; -// Stats::Scalar<> iewWBInsts; + /** Stat for total number of instructions dispatched. */ Stats::Scalar<> iewDispatchedInsts; + /** Stat for total number of squashed instructions dispatch skips. */ Stats::Scalar<> iewDispSquashedInsts; + /** Stat for total number of dispatched load instructions. */ Stats::Scalar<> iewDispLoadInsts; + /** Stat for total number of dispatched store instructions. */ Stats::Scalar<> iewDispStoreInsts; + /** Stat for total number of dispatched non speculative instructions. */ Stats::Scalar<> iewDispNonSpecInsts; + /** Stat for number of times the IQ becomes full. */ Stats::Scalar<> iewIQFullEvents; + /** Stat for number of times the LSQ becomes full. */ + Stats::Scalar<> iewLSQFullEvents; + /** Stat for total number of executed instructions. */ Stats::Scalar<> iewExecutedInsts; + /** Stat for total number of executed load instructions. */ Stats::Scalar<> iewExecLoadInsts; + /** Stat for total number of executed store instructions. */ Stats::Scalar<> iewExecStoreInsts; + /** Stat for total number of squashed instructions skipped at execute. 
*/ Stats::Scalar<> iewExecSquashedInsts; + /** Stat for total number of memory ordering violation events. */ Stats::Scalar<> memOrderViolationEvents; + /** Stat for total number of incorrect predicted taken branches. */ Stats::Scalar<> predictedTakenIncorrect; + /** Stat for total number of incorrect predicted not taken branches. */ + Stats::Scalar<> predictedNotTakenIncorrect; + /** Stat for total number of mispredicted branches detected at execute. */ + Stats::Formula branchMispredicts; }; -#endif // __CPU_O3_CPU_IEW_HH__ +#endif // __CPU_O3_IEW_HH__ diff --git a/cpu/o3/iew_impl.hh b/cpu/o3/iew_impl.hh index 85217dd10..21eb7dcf8 100644 --- a/cpu/o3/iew_impl.hh +++ b/cpu/o3/iew_impl.hh @@ -29,59 +29,84 @@ // @todo: Fix the instantaneous communication among all the stages within // iew. There's a clear delay between issue and execute, yet backwards // communication happens simultaneously. -// Update the statuses for each stage. #include #include "base/timebuf.hh" +#include "cpu/o3/fu_pool.hh" #include "cpu/o3/iew.hh" +using namespace std; + template -SimpleIEW::WritebackEvent::WritebackEvent(DynInstPtr &_inst, - SimpleIEW *_iew) - : Event(&mainEventQueue, CPU_Tick_Pri), inst(_inst), iewStage(_iew) +DefaultIEW::LdWritebackEvent::LdWritebackEvent(DynInstPtr &_inst, + DefaultIEW *_iew) + : Event(&mainEventQueue), inst(_inst), iewStage(_iew) { this->setFlags(Event::AutoDelete); } template void -SimpleIEW::WritebackEvent::process() +DefaultIEW::LdWritebackEvent::process() { - DPRINTF(IEW, "IEW: WRITEBACK EVENT!!!!\n"); + DPRINTF(IEW, "Load writeback event [sn:%lli]\n", inst->seqNum); + DPRINTF(Activity, "Activity: Ld Writeback event [sn:%lli]\n", inst->seqNum); + + //iewStage->ldstQueue.removeMSHR(inst->threadNumber,inst->seqNum); + + iewStage->wakeCPU(); + + if (inst->isSquashed()) { + inst = NULL; + return; + } + + if (!inst->isExecuted()) { + inst->setExecuted(); + + // Execute again to copy data to proper place. 
+ if (inst->isStore()) { + inst->completeAcc(); + } + } // Need to insert instruction into queue to commit iewStage->instToCommit(inst); - // Need to execute second half of the instruction, do actual writing to - // registers and such - inst->execute(); + + //wroteToTimeBuffer = true; + iewStage->activityThisCycle(); + + inst = NULL; } template const char * -SimpleIEW::WritebackEvent::description() +DefaultIEW::LdWritebackEvent::description() { - return "LSQ writeback event"; + return "Load writeback event"; } template -SimpleIEW::SimpleIEW(Params ¶ms) +DefaultIEW::DefaultIEW(Params *params) : // Just make this time buffer really big for now + // @todo: Make this into a parameter. issueToExecQueue(5, 5), instQueue(params), ldstQueue(params), - commitToIEWDelay(params.commitToIEWDelay), - renameToIEWDelay(params.renameToIEWDelay), - issueToExecuteDelay(params.issueToExecuteDelay), - issueReadWidth(params.issueWidth), - issueWidth(params.issueWidth), - executeWidth(params.executeWidth) + fuPool(params->fuPool), + commitToIEWDelay(params->commitToIEWDelay), + renameToIEWDelay(params->renameToIEWDelay), + issueToExecuteDelay(params->issueToExecuteDelay), + issueReadWidth(params->issueWidth), + issueWidth(params->issueWidth), + executeWidth(params->executeWidth), + numThreads(params->numberOfThreads) { - DPRINTF(IEW, "IEW: executeIntWidth: %i.\n", params.executeIntWidth); - _status = Idle; - _issueStatus = Idle; - _exeStatus = Idle; - _wbStatus = Idle; + DPRINTF(IEW, "executeIntWidth: %i.\n", params->executeIntWidth); + _status = Active; + exeStatus = Running; + wbStatus = Idle; // Setup wire to read instructions coming from issue. fromIssue = issueToExecQueue.getWire(-issueToExecuteDelay); @@ -89,15 +114,36 @@ SimpleIEW::SimpleIEW(Params ¶ms) // Instruction queue needs the queue between issue and execute. 
instQueue.setIssueToExecuteQueue(&issueToExecQueue); + instQueue.setIEW(this); ldstQueue.setIEW(this); + + for (int i=0; i < numThreads; i++) { + dispatchStatus[i] = Running; + stalls[i].commit = false; + fetchRedirect[i] = false; + } + + updateLSQNextCycle = false; + + // @todo: Make into a parameter + skidBufferMax = (3 * (renameToIEWDelay * params->renameWidth)) + issueWidth; +} + +template +std::string +DefaultIEW::name() const +{ + return cpu->name() + ".iew"; } template void -SimpleIEW::regStats() +DefaultIEW::regStats() { instQueue.regStats(); + //ldstQueue.regStats(); + iewIdleCycles .name(name() + ".iewIdleCycles") .desc("Number of cycles IEW is idle"); @@ -140,6 +186,10 @@ SimpleIEW::regStats() .name(name() + ".iewIQFullEvents") .desc("Number of times the IQ has become full, causing a stall"); + iewLSQFullEvents + .name(name() + ".iewLSQFullEvents") + .desc("Number of times the LSQ has become full, causing a stall"); + iewExecutedInsts .name(name() + ".iewExecutedInsts") .desc("Number of executed instructions"); @@ -163,24 +213,51 @@ SimpleIEW::regStats() predictedTakenIncorrect .name(name() + ".predictedTakenIncorrect") .desc("Number of branches that were predicted taken incorrectly"); + + predictedNotTakenIncorrect + .name(name() + ".predictedNotTakenIncorrect") + .desc("Number of branches that were predicted not taken incorrectly"); + + branchMispredicts + .name(name() + ".branchMispredicts") + .desc("Number of branch mispredicts detected at execute"); + + branchMispredicts = predictedTakenIncorrect + predictedNotTakenIncorrect; } template void -SimpleIEW::setCPU(FullCPU *cpu_ptr) +DefaultIEW::initStage() { - DPRINTF(IEW, "IEW: Setting CPU pointer.\n"); + for (int tid=0; tid < numThreads; tid++) { + toRename->iewInfo[tid].usedIQ = true; + toRename->iewInfo[tid].freeIQEntries = + instQueue.numFreeEntries(tid); + + toRename->iewInfo[tid].usedLSQ = true; + toRename->iewInfo[tid].freeLSQEntries = + ldstQueue.numFreeEntries(tid); + } +} + +template +void 
+DefaultIEW::setCPU(FullCPU *cpu_ptr) +{ + DPRINTF(IEW, "Setting CPU pointer.\n"); cpu = cpu_ptr; instQueue.setCPU(cpu_ptr); ldstQueue.setCPU(cpu_ptr); + + cpu->activateStage(FullCPU::IEWIdx); } template void -SimpleIEW::setTimeBuffer(TimeBuffer *tb_ptr) +DefaultIEW::setTimeBuffer(TimeBuffer *tb_ptr) { - DPRINTF(IEW, "IEW: Setting time buffer pointer.\n"); + DPRINTF(IEW, "Setting time buffer pointer.\n"); timeBuffer = tb_ptr; // Setup wire to read information from time buffer, from commit. @@ -189,15 +266,17 @@ SimpleIEW::setTimeBuffer(TimeBuffer *tb_ptr) // Setup wire to write information back to previous stages. toRename = timeBuffer->getWire(0); + toFetch = timeBuffer->getWire(0); + // Instruction queue also needs main time buffer. instQueue.setTimeBuffer(tb_ptr); } template void -SimpleIEW::setRenameQueue(TimeBuffer *rq_ptr) +DefaultIEW::setRenameQueue(TimeBuffer *rq_ptr) { - DPRINTF(IEW, "IEW: Setting rename queue pointer.\n"); + DPRINTF(IEW, "Setting rename queue pointer.\n"); renameQueue = rq_ptr; // Setup wire to read information from rename queue. @@ -206,9 +285,9 @@ SimpleIEW::setRenameQueue(TimeBuffer *rq_ptr) template void -SimpleIEW::setIEWQueue(TimeBuffer *iq_ptr) +DefaultIEW::setIEWQueue(TimeBuffer *iq_ptr) { - DPRINTF(IEW, "IEW: Setting IEW queue pointer.\n"); + DPRINTF(IEW, "Setting IEW queue pointer.\n"); iewQueue = iq_ptr; // Setup wire to write instructions to commit. 
@@ -217,355 +296,900 @@ SimpleIEW::setIEWQueue(TimeBuffer *iq_ptr) template void -SimpleIEW::setRenameMap(RenameMap *rm_ptr) +DefaultIEW::setActiveThreads(list *at_ptr) { - DPRINTF(IEW, "IEW: Setting rename map pointer.\n"); - renameMap = rm_ptr; + DPRINTF(IEW, "Setting active threads list pointer.\n"); + activeThreads = at_ptr; + + ldstQueue.setActiveThreads(at_ptr); + instQueue.setActiveThreads(at_ptr); } template void -SimpleIEW::squash() +DefaultIEW::setScoreboard(Scoreboard *sb_ptr) { - DPRINTF(IEW, "IEW: Squashing all instructions.\n"); - _status = Squashing; + DPRINTF(IEW, "Setting scoreboard pointer.\n"); + scoreboard = sb_ptr; +} + +#if 0 +template +void +DefaultIEW::setPageTable(PageTable *pt_ptr) +{ + ldstQueue.setPageTable(pt_ptr); +} +#endif + +template +void +DefaultIEW::squash(unsigned tid) +{ + DPRINTF(IEW, "[tid:%i]: Squashing all instructions.\n", + tid); // Tell the IQ to start squashing. - instQueue.squash(); + instQueue.squash(tid); // Tell the LDSTQ to start squashing. - ldstQueue.squash(fromCommit->commitInfo.doneSeqNum); -} + ldstQueue.squash(fromCommit->commitInfo[tid].doneSeqNum,tid); -template -void -SimpleIEW::squashDueToBranch(DynInstPtr &inst) -{ - DPRINTF(IEW, "IEW: Squashing from a specific instruction, PC: %#x.\n", - inst->PC); - // Perhaps leave the squashing up to the ROB stage to tell it when to - // squash? - _status = Squashing; + updatedQueues = true; - // Tell rename to squash through the time buffer. - toCommit->squash = true; - // Also send PC update information back to prior stages. - toCommit->squashedSeqNum = inst->seqNum; - toCommit->mispredPC = inst->readPC(); - toCommit->nextPC = inst->readNextPC(); - toCommit->branchMispredict = true; - // Prediction was incorrect, so send back inverse. - toCommit->branchTaken = inst->readNextPC() != - (inst->readPC() + sizeof(TheISA::MachInst)); -} + // Clear the skid buffer in case it has any data in it. 
+ while (!skidBuffer[tid].empty()) { -template -void -SimpleIEW::squashDueToMem(DynInstPtr &inst) -{ - DPRINTF(IEW, "IEW: Squashing from a specific instruction, PC: %#x.\n", - inst->PC); - // Perhaps leave the squashing up to the ROB stage to tell it when to - // squash? - _status = Squashing; + if (skidBuffer[tid].front()->isLoad() || + skidBuffer[tid].front()->isStore() ) { + toRename->iewInfo[tid].dispatchedToLSQ++; + } - // Tell rename to squash through the time buffer. - toCommit->squash = true; - // Also send PC update information back to prior stages. - toCommit->squashedSeqNum = inst->seqNum; - toCommit->nextPC = inst->readNextPC(); -} + toRename->iewInfo[tid].dispatched++; -template -void -SimpleIEW::block() -{ - DPRINTF(IEW, "IEW: Blocking.\n"); - // Set the status to Blocked. - _status = Blocked; + skidBuffer[tid].pop(); + } - // Add the current inputs to the skid buffer so they can be - // reprocessed when this stage unblocks. - skidBuffer.push(*fromRename); + while (!insts[tid].empty()) { + if (insts[tid].front()->isLoad() || + insts[tid].front()->isStore() ) { + toRename->iewInfo[tid].dispatchedToLSQ++; + } - // Note that this stage only signals previous stages to stall when - // it is the cause of the stall originates at this stage. Otherwise - // the previous stages are expected to check all possible stall signals. -} + toRename->iewInfo[tid].dispatched++; -template -inline void -SimpleIEW::unblock() -{ - // Check if there's information in the skid buffer. If there is, then - // set status to unblocking, otherwise set it directly to running. - DPRINTF(IEW, "IEW: Reading instructions out of the skid " - "buffer.\n"); - // Remove the now processed instructions from the skid buffer. - skidBuffer.pop(); - - // If there's still information in the skid buffer, then - // continue to tell previous stages to stall. They will be - // able to restart once the skid buffer is empty. 
- if (!skidBuffer.empty()) { - toRename->iewInfo.stall = true; - } else { - DPRINTF(IEW, "IEW: Stage is done unblocking.\n"); - _status = Running; + insts[tid].pop(); } } template void -SimpleIEW::wakeDependents(DynInstPtr &inst) +DefaultIEW::squashDueToBranch(DynInstPtr &inst, unsigned tid) +{ + DPRINTF(IEW, "[tid:%i]: Squashing from a specific instruction, PC: %#x " + "[sn:%i].\n", tid, inst->readPC(), inst->seqNum); + + // Tell rename to squash through the time buffer. + toCommit->squash[tid] = true; + toCommit->squashedSeqNum[tid] = inst->seqNum; + toCommit->mispredPC[tid] = inst->readPC(); + toCommit->nextPC[tid] = inst->readNextPC(); + toCommit->branchMispredict[tid] = true; + // Prediction was incorrect, so send back inverse. + toCommit->branchTaken[tid] = inst->readNextPC() != + (inst->readPC() + sizeof(TheISA::MachInst)); + + toCommit->includeSquashInst[tid] = false; + //toCommit->iewSquashNum[tid] = inst->seqNum; + + wroteToTimeBuffer = true; +} + +template +void +DefaultIEW::squashDueToMemOrder(DynInstPtr &inst, unsigned tid) +{ + DPRINTF(IEW, "[tid:%i]: Squashing from a specific instruction, " + "PC: %#x [sn:%i].\n", tid, inst->readPC(), inst->seqNum); + + // Tell rename to squash through the time buffer. 
+ toCommit->squash[tid] = true; + toCommit->squashedSeqNum[tid] = inst->seqNum; + toCommit->nextPC[tid] = inst->readNextPC(); + + toCommit->includeSquashInst[tid] = false; + //toCommit->iewSquashNum[tid] = inst->seqNum; + + wroteToTimeBuffer = true; +} + +template +void +DefaultIEW::squashDueToMemBlocked(DynInstPtr &inst, unsigned tid) +{ + DPRINTF(IEW, "[tid:%i]: Memory blocked, squashing load and younger insts, " + "PC: %#x [sn:%i].\n", tid, inst->readPC(), inst->seqNum); + + toCommit->squash[tid] = true; + toCommit->squashedSeqNum[tid] = inst->seqNum; + toCommit->nextPC[tid] = inst->readPC(); + + toCommit->includeSquashInst[tid] = true; + + ldstQueue.setLoadBlockedHandled(tid); + + wroteToTimeBuffer = true; +} + +template +void +DefaultIEW::block(unsigned tid) +{ + DPRINTF(IEW, "[tid:%u]: Blocking.\n", tid); + + if (dispatchStatus[tid] != Blocked && + dispatchStatus[tid] != Unblocking) { + toRename->iewBlock[tid] = true; + wroteToTimeBuffer = true; + } + + // Add the current inputs to the skid buffer so they can be + // reprocessed when this stage unblocks. + skidInsert(tid); + + // Set the status to Blocked. + dispatchStatus[tid] = Blocked; +} + +template +void +DefaultIEW::unblock(unsigned tid) +{ + DPRINTF(IEW, "[tid:%i]: Reading instructions out of the skid " + "buffer %u.\n",tid, tid); + + // If the skid bufffer is empty, signal back to previous stages to unblock. + // Also switch status to running. 
+ if (skidBuffer[tid].empty()) { + toRename->iewUnblock[tid] = true; + wroteToTimeBuffer = true; + DPRINTF(IEW, "[tid:%i]: Done unblocking.\n",tid); + dispatchStatus[tid] = Running; + } +} + +template +void +DefaultIEW::wakeDependents(DynInstPtr &inst) { instQueue.wakeDependents(inst); } +template +void +DefaultIEW::rescheduleMemInst(DynInstPtr &inst) +{ + instQueue.rescheduleMemInst(inst); +} template void -SimpleIEW::instToCommit(DynInstPtr &inst) +DefaultIEW::replayMemInst(DynInstPtr &inst) { + instQueue.replayMemInst(inst); +} +template +void +DefaultIEW::instToCommit(DynInstPtr &inst) +{ + // First check the time slot that this instruction will write + // to. If there are free write ports at the time, then go ahead + // and write the instruction to that time. If there are not, + // keep looking back to see where's the first time there's a + // free slot. What happens if you run out of free spaces? + // For now naively assume that all instructions take one cycle. + // Otherwise would have to look into the time buffer based on the + // latency of the instruction. + while ((*iewQueue)[wbCycle].insts[wbNumInst]) { + ++wbNumInst; + if (wbNumInst == issueWidth) { + ++wbCycle; + wbNumInst = 0; + } + + assert(wbCycle < 5); + } + + // Add finished instruction to queue to commit. 
+ (*iewQueue)[wbCycle].insts[wbNumInst] = inst; + (*iewQueue)[wbCycle].size++; +} + +template +unsigned +DefaultIEW::validInstsFromRename() +{ + unsigned inst_count = 0; + + for (int i=0; isize; i++) { + if (!fromRename->insts[i]->squashed) + inst_count++; + } + + return inst_count; +} + +template +void +DefaultIEW::skidInsert(unsigned tid) +{ + DynInstPtr inst = NULL; + + while (!insts[tid].empty()) { + inst = insts[tid].front(); + + insts[tid].pop(); + + DPRINTF(Decode,"[tid:%i]: Inserting [sn:%lli] PC:%#x into " + "dispatch skidBuffer %i\n",tid, inst->seqNum, + inst->readPC(),tid); + + skidBuffer[tid].push(inst); + } + + assert(skidBuffer[tid].size() <= skidBufferMax && + "Skidbuffer Exceeded Max Size"); +} + +template +int +DefaultIEW::skidCount() +{ + int max=0; + + list::iterator threads = (*activeThreads).begin(); + + while (threads != (*activeThreads).end()) { + unsigned thread_count = skidBuffer[*threads++].size(); + if (max < thread_count) + max = thread_count; + } + + return max; +} + +template +bool +DefaultIEW::skidsEmpty() +{ + list::iterator threads = (*activeThreads).begin(); + + while (threads != (*activeThreads).end()) { + if (!skidBuffer[*threads++].empty()) + return false; + } + + return true; } template void -SimpleIEW::dispatchInsts() +DefaultIEW::updateStatus() { - //////////////////////////////////////// - // DISPATCH/ISSUE stage - //////////////////////////////////////// + bool any_unblocking = false; - //Put into its own function? - //Add instructions to IQ if there are any instructions there + list::iterator threads = (*activeThreads).begin(); - // Check if there are any instructions coming from rename, and we're. - // not squashing. - if (fromRename->size > 0) { - int insts_to_add = fromRename->size; + threads = (*activeThreads).begin(); - // Loop through the instructions, putting them in the instruction - // queue. 
- for (int inst_num = 0; inst_num < insts_to_add; ++inst_num) - { - DynInstPtr inst = fromRename->insts[inst_num]; + while (threads != (*activeThreads).end()) { + unsigned tid = *threads++; - // Make sure there's a valid instruction there. - assert(inst); - - DPRINTF(IEW, "IEW: Issue: Adding PC %#x to IQ.\n", - inst->readPC()); - - // Be sure to mark these instructions as ready so that the - // commit stage can go ahead and execute them, and mark - // them as issued so the IQ doesn't reprocess them. - if (inst->isSquashed()) { - ++iewDispSquashedInsts; - continue; - } else if (instQueue.isFull()) { - DPRINTF(IEW, "IEW: Issue: IQ has become full.\n"); - // Call function to start blocking. - block(); - // Tell previous stage to stall. - toRename->iewInfo.stall = true; - - ++iewIQFullEvents; - break; - } else if (inst->isLoad()) { - DPRINTF(IEW, "IEW: Issue: Memory instruction " - "encountered, adding to LDSTQ.\n"); - - // Reserve a spot in the load store queue for this - // memory access. - ldstQueue.insertLoad(inst); - - ++iewDispLoadInsts; - } else if (inst->isStore()) { - ldstQueue.insertStore(inst); - - ++iewDispStoreInsts; - } else if (inst->isNonSpeculative()) { - DPRINTF(IEW, "IEW: Issue: Nonspeculative instruction " - "encountered, skipping.\n"); - - // Same hack as with stores. - inst->setCanCommit(); - - // Specificall insert it as nonspeculative. 
- instQueue.insertNonSpec(inst); - - ++iewDispNonSpecInsts; - - continue; - } else if (inst->isNop()) { - DPRINTF(IEW, "IEW: Issue: Nop instruction encountered " - ", skipping.\n"); - - inst->setIssued(); - inst->setExecuted(); - inst->setCanCommit(); - - instQueue.advanceTail(inst); - - continue; - } else if (inst->isExecuted()) { - assert(0 && "Instruction shouldn't be executed.\n"); - DPRINTF(IEW, "IEW: Issue: Executed branch encountered, " - "skipping.\n"); - - inst->setIssued(); - inst->setCanCommit(); - - instQueue.advanceTail(inst); - - continue; - } - - // If the instruction queue is not full, then add the - // instruction. - instQueue.insert(fromRename->insts[inst_num]); - - ++iewDispatchedInsts; + if (dispatchStatus[tid] == Unblocking) { + any_unblocking = true; + break; } } + + // If there are no ready instructions waiting to be scheduled by the IQ, + // and there's no stores waiting to write back, and dispatch is not + // unblocking, then there is no internal activity for the IEW stage. + if (_status == Active && !instQueue.hasReadyInsts() && + !ldstQueue.willWB() && !any_unblocking) { + DPRINTF(IEW, "IEW switching to idle\n"); + + deactivateStage(); + + _status = Inactive; + } else if (_status == Inactive && (instQueue.hasReadyInsts() || + ldstQueue.willWB() || + any_unblocking)) { + // Otherwise there is internal activity. Set to active. + DPRINTF(IEW, "IEW switching to active\n"); + + activateStage(); + + _status = Active; + } } template void -SimpleIEW::executeInsts() +DefaultIEW::resetEntries() { - //////////////////////////////////////// - //EXECUTE/WRITEBACK stage - //////////////////////////////////////// + instQueue.resetEntries(); + ldstQueue.resetEntries(); +} - //Put into its own function? - //Similarly should probably have separate execution for int vs FP. - // Above comment is handled by the issue queue only issuing a valid - // mix of int/fp instructions. 
- //Actually okay to just have one execution, buuuuuut will need - //somewhere that defines the execution latency of all instructions. - // @todo: Move to the FU pool used in the current full cpu. +template +void +DefaultIEW::readStallSignals(unsigned tid) +{ + if (fromCommit->commitBlock[tid]) { + stalls[tid].commit = true; + } - int fu_usage = 0; - bool fetch_redirect = false; - int inst_slot = 0; - int time_slot = 0; + if (fromCommit->commitUnblock[tid]) { + assert(stalls[tid].commit); + stalls[tid].commit = false; + } +} + +template +bool +DefaultIEW::checkStall(unsigned tid) +{ + bool ret_val(false); + + if (stalls[tid].commit) { + DPRINTF(IEW,"[tid:%i]: Stall from Commit stage detected.\n",tid); + ret_val = true; + } else if (instQueue.isFull(tid)) { + DPRINTF(IEW,"[tid:%i]: Stall: IQ is full.\n",tid); + ret_val = true; + } else if (ldstQueue.isFull(tid)) { + DPRINTF(IEW,"[tid:%i]: Stall: LSQ is full\n",tid); + + if (ldstQueue.numLoads(tid) > 0 ) { + + DPRINTF(IEW,"[tid:%i]: LSQ oldest load: [sn:%i] \n", + tid,ldstQueue.getLoadHeadSeqNum(tid)); + } + + if (ldstQueue.numStores(tid) > 0) { + + DPRINTF(IEW,"[tid:%i]: LSQ oldest store: [sn:%i] \n", + tid,ldstQueue.getStoreHeadSeqNum(tid)); + } + + ret_val = true; + } else if (ldstQueue.isStalled(tid)) { + DPRINTF(IEW,"[tid:%i]: Stall: LSQ stall detected.\n",tid); + ret_val = true; + } + + return ret_val; +} + +template +void +DefaultIEW::checkSignalsAndUpdate(unsigned tid) +{ + // Check if there's a squash signal, squash if there is + // Check stall signals, block if there is. + // If status was Blocked + // if so then go to unblocking + // If status was Squashing + // check if squashing is not high. Switch to running this cycle. 
+ + readStallSignals(tid); + + if (fromCommit->commitInfo[tid].squash) { + squash(tid); + + if (dispatchStatus[tid] == Blocked || + dispatchStatus[tid] == Unblocking) { + toRename->iewUnblock[tid] = true; + wroteToTimeBuffer = true; + } + + dispatchStatus[tid] = Squashing; + + fetchRedirect[tid] = false; + return; + } + + if (fromCommit->commitInfo[tid].robSquashing) { + DPRINTF(IEW, "[tid:%i]: ROB is still squashing.\n"); + + dispatchStatus[tid] = Squashing; + + return; + } + + if (checkStall(tid)) { + block(tid); + dispatchStatus[tid] = Blocked; + return; + } + + if (dispatchStatus[tid] == Blocked) { + // Status from previous cycle was blocked, but there are no more stall + // conditions. Switch over to unblocking. + DPRINTF(IEW, "[tid:%i]: Done blocking, switching to unblocking.\n", + tid); + + dispatchStatus[tid] = Unblocking; + + unblock(tid); + + return; + } + + if (dispatchStatus[tid] == Squashing) { + // Switch status to running if rename isn't being told to block or + // squash this cycle. 
+ DPRINTF(IEW, "[tid:%i]: Done squashing, switching to running.\n", + tid); + + dispatchStatus[tid] = Running; + + return; + } +} + +template +void +DefaultIEW::sortInsts() +{ + int insts_from_rename = fromRename->size; + + for (int i = 0; i < numThreads; i++) + assert(insts[i].empty()); + + for (int i = 0; i < insts_from_rename; ++i) { + insts[fromRename->insts[i]->threadNumber].push(fromRename->insts[i]); + } +} + +template +void +DefaultIEW::wakeCPU() +{ + cpu->wakeCPU(); +} + +template +void +DefaultIEW::activityThisCycle() +{ + DPRINTF(Activity, "Activity this cycle.\n"); + cpu->activityThisCycle(); +} + +template +inline void +DefaultIEW::activateStage() +{ + DPRINTF(Activity, "Activating stage.\n"); + cpu->activateStage(FullCPU::IEWIdx); +} + +template +inline void +DefaultIEW::deactivateStage() +{ + DPRINTF(Activity, "Deactivating stage.\n"); + cpu->deactivateStage(FullCPU::IEWIdx); +} + +template +void +DefaultIEW::dispatch(unsigned tid) +{ + // If status is Running or idle, + // call dispatchInsts() + // If status is Unblocking, + // buffer any instructions coming from rename + // continue trying to empty skid buffer + // check if stall conditions have passed + + if (dispatchStatus[tid] == Blocked) { + ++iewBlockCycles; + + } else if (dispatchStatus[tid] == Squashing) { + ++iewSquashCycles; + } + + // Dispatch should try to dispatch as many instructions as its bandwidth + // will allow, as long as it is not currently blocked. + if (dispatchStatus[tid] == Running || + dispatchStatus[tid] == Idle) { + DPRINTF(IEW, "[tid:%i] Not blocked, so attempting to run " + "dispatch.\n", tid); + + dispatchInsts(tid); + } else if (dispatchStatus[tid] == Unblocking) { + // Make sure that the skid buffer has something in it if the + // status is unblocking. + assert(!skidsEmpty()); + + // If the status was unblocking, then instructions from the skid + // buffer were used. Remove those instructions and handle + // the rest of unblocking. 
+ dispatchInsts(tid); + + ++iewUnblockCycles; + + if (validInstsFromRename() && dispatchedAllInsts) { + // Add the current inputs to the skid buffer so they can be + // reprocessed when this stage unblocks. + skidInsert(tid); + } + + unblock(tid); + } +} + +template +void +DefaultIEW::dispatchInsts(unsigned tid) +{ + dispatchedAllInsts = true; + + // Obtain instructions from skid buffer if unblocking, or queue from rename + // otherwise. + std::queue &insts_to_dispatch = + dispatchStatus[tid] == Unblocking ? + skidBuffer[tid] : insts[tid]; + + int insts_to_add = insts_to_dispatch.size(); + + DynInstPtr inst; + bool add_to_iq = false; + int dis_num_inst = 0; + + // Loop through the instructions, putting them in the instruction + // queue. + for ( ; dis_num_inst < insts_to_add && + dis_num_inst < issueReadWidth; + ++dis_num_inst) + { + inst = insts_to_dispatch.front(); + + if (dispatchStatus[tid] == Unblocking) { + DPRINTF(IEW, "[tid:%i]: Issue: Examining instruction from skid " + "buffer\n", tid); + } + + // Make sure there's a valid instruction there. + assert(inst); + + DPRINTF(IEW, "[tid:%i]: Issue: Adding PC %#x [sn:%lli] [tid:%i] to " + "IQ.\n", + tid, inst->readPC(), inst->seqNum, inst->threadNumber); + + // Be sure to mark these instructions as ready so that the + // commit stage can go ahead and execute them, and mark + // them as issued so the IQ doesn't reprocess them. + // ------------- + // @TODO: What happens if the ldstqueue is full? + // Do we process the other instructions? + + // Check for squashed instructions. + if (inst->isSquashed()) { + DPRINTF(IEW, "[tid:%i]: Issue: Squashed instruction encountered, " + "not adding to IQ.\n", tid); + + ++iewDispSquashedInsts; + + insts_to_dispatch.pop(); + + //Tell Rename That An Instruction has been processed + if (inst->isLoad() || inst->isStore()) { + toRename->iewInfo[tid].dispatchedToLSQ++; + } + toRename->iewInfo[tid].dispatched++; + + continue; + } + + // Check for full conditions. 
+ if (instQueue.isFull(tid)) { + DPRINTF(IEW, "[tid:%i]: Issue: IQ has become full.\n", tid); + + // Call function to start blocking. + block(tid); + + // Set unblock to false. Special case where we are using + // skidbuffer (unblocking) instructions but then we still + // get full in the IQ. + toRename->iewUnblock[tid] = false; + + dispatchedAllInsts = false; + + ++iewIQFullEvents; + break; + } else if (ldstQueue.isFull(tid)) { + DPRINTF(IEW, "[tid:%i]: Issue: LSQ has become full.\n",tid); + + // Call function to start blocking. + block(tid); + + // Set unblock to false. Special case where we are using + // skidbuffer (unblocking) instructions but then we still + // get full in the IQ. + toRename->iewUnblock[tid] = false; + + dispatchedAllInsts = false; + + ++iewLSQFullEvents; + break; + } + + // Otherwise issue the instruction just fine. + if (inst->isLoad()) { + DPRINTF(IEW, "[tid:%i]: Issue: Memory instruction " + "encountered, adding to LSQ.\n", tid); + + // Reserve a spot in the load store queue for this + // memory access. + ldstQueue.insertLoad(inst); + + ++iewDispLoadInsts; + + add_to_iq = true; + + toRename->iewInfo[tid].dispatchedToLSQ++; + } else if (inst->isStore()) { + DPRINTF(IEW, "[tid:%i]: Issue: Memory instruction " + "encountered, adding to LSQ.\n", tid); + + ldstQueue.insertStore(inst); + + ++iewDispStoreInsts; + + if (inst->isNonSpeculative()) { + inst->setCanCommit(); + instQueue.insertNonSpec(inst); + add_to_iq = false; + + ++iewDispNonSpecInsts; + } else { + add_to_iq = true; + } + + toRename->iewInfo[tid].dispatchedToLSQ++; +#if FULL_SYSTEM + } else if (inst->isMemBarrier() || inst->isWriteBarrier()) { + inst->setCanCommit(); + instQueue.insertBarrier(inst); + add_to_iq = false; +#endif + } else if (inst->isNonSpeculative()) { + DPRINTF(IEW, "[tid:%i]: Issue: Nonspeculative instruction " + "encountered, skipping.\n", tid); + + // Same hack as with stores. + inst->setCanCommit(); + + // Specifically insert it as nonspeculative. 
+ instQueue.insertNonSpec(inst); + + ++iewDispNonSpecInsts; + + add_to_iq = false; + } else if (inst->isNop()) { + DPRINTF(IEW, "[tid:%i]: Issue: Nop instruction encountered, " + "skipping.\n", tid); + + inst->setIssued(); + inst->setExecuted(); + inst->setCanCommit(); + + instQueue.advanceTail(inst); + + add_to_iq = false; + } else if (inst->isExecuted()) { + assert(0 && "Instruction shouldn't be executed.\n"); + DPRINTF(IEW, "Issue: Executed branch encountered, " + "skipping.\n"); + + inst->setIssued(); + inst->setCanCommit(); + + instQueue.advanceTail(inst); + + add_to_iq = false; + } else { + add_to_iq = true; + } + + // If the instruction queue is not full, then add the + // instruction. + if (add_to_iq) { + instQueue.insert(inst); + } + + insts_to_dispatch.pop(); + + toRename->iewInfo[tid].dispatched++; + + ++iewDispatchedInsts; + } + + if (!insts_to_dispatch.empty()) { + DPRINTF(IEW,"[tid:%i]: Issue: Bandwidth Full. Blocking.\n"); + block(tid); + toRename->iewUnblock[tid] = false; + } + + if (dispatchStatus[tid] == Idle && dis_num_inst) { + dispatchStatus[tid] = Running; + + updatedQueues = true; + } + + dis_num_inst = 0; +} + +template +void +DefaultIEW::printAvailableInsts() +{ + int inst = 0; + + cout << "Available Instructions: "; + + while (fromIssue->insts[inst]) { + + if (inst%3==0) cout << "\n\t"; + + cout << "PC: " << fromIssue->insts[inst]->readPC() + << " TN: " << fromIssue->insts[inst]->threadNumber + << " SN: " << fromIssue->insts[inst]->seqNum << " | "; + + inst++; + + } + + cout << "\n"; +} + +template +void +DefaultIEW::executeInsts() +{ + //bool fetch_redirect[(*activeThreads).size()]; + wbNumInst = 0; + wbCycle = 0; + + list::iterator threads = (*activeThreads).begin(); + + while (threads != (*activeThreads).end()) { + unsigned tid = *threads++; + fetchRedirect[tid] = false; + } + +#if 0 + printAvailableInsts(); +#endif // Execute/writeback any instructions that are available. 
- for (int inst_num = 0; - fu_usage < executeWidth && /* Haven't exceeded available FU's. */ - inst_num < issueWidth && - fromIssue->insts[inst_num]; + int inst_num = 0; + for ( ; inst_num < issueWidth && /* Haven't exceeded issue bandwidth */ + fromIssue->insts[inst_num]; ++inst_num) { - DPRINTF(IEW, "IEW: Execute: Executing instructions from IQ.\n"); + DPRINTF(IEW, "Execute: Executing instructions from IQ.\n"); // Get instruction from issue's queue. DynInstPtr inst = fromIssue->insts[inst_num]; - DPRINTF(IEW, "IEW: Execute: Processing PC %#x.\n", inst->readPC()); + DPRINTF(IEW, "Execute: Processing PC %#x, [tid:%i] [sn:%i].\n", + inst->readPC(), inst->threadNumber,inst->seqNum); // Check if the instruction is squashed; if so then skip it // and don't count it towards the FU usage. if (inst->isSquashed()) { - DPRINTF(IEW, "IEW: Execute: Instruction was squashed.\n"); + DPRINTF(IEW, "Execute: Instruction was squashed.\n"); // Consider this instruction executed so that commit can go // ahead and retire the instruction. inst->setExecuted(); - toCommit->insts[inst_num] = inst; + // Not sure if I should set this here or just let commit try to + // commit any squashed instructions. I like the latter a bit more. + inst->setCanCommit(); ++iewExecSquashedInsts; continue; } - inst->setExecuted(); - - // If an instruction is executed, then count it towards FU usage. - ++fu_usage; + Fault fault = NoFault; // Execute instruction. // Note that if the instruction faults, it will be handled // at the commit stage. - if (inst->isMemRef()) { - DPRINTF(IEW, "IEW: Execute: Calculating address for memory " + if (inst->isMemRef() && + (!inst->isDataPrefetch() && !inst->isInstPrefetch())) { + DPRINTF(IEW, "Execute: Calculating address for memory " "reference.\n"); // Tell the LDSTQ to execute this instruction (if it is a load). 
if (inst->isLoad()) { - ldstQueue.executeLoad(inst); + // Loads will mark themselves as executed, and their writeback + // event adds the instruction to the queue to commit + fault = ldstQueue.executeLoad(inst); ++iewExecLoadInsts; } else if (inst->isStore()) { ldstQueue.executeStore(inst); ++iewExecStoreInsts; + + // If the store had a fault then it may not have a mem req + if (inst->req && !(inst->req->flags & LOCKED)) { + inst->setExecuted(); + + instToCommit(inst); + } + // Store conditionals will mark themselves as executed, and + // their writeback event will add the instruction to the queue + // to commit. } else { - panic("IEW: Unexpected memory type!\n"); + panic("Unexpected memory type!\n"); } } else { inst->execute(); ++iewExecutedInsts; + + inst->setExecuted(); + + instToCommit(inst); } - // First check the time slot that this instruction will write - // to. If there are free write ports at the time, then go ahead - // and write the instruction to that time. If there are not, - // keep looking back to see where's the first time there's a - // free slot. What happens if you run out of free spaces? - // For now naively assume that all instructions take one cycle. - // Otherwise would have to look into the time buffer based on the - // latency of the instruction. - (*iewQueue)[time_slot].insts[inst_slot]; - while ((*iewQueue)[time_slot].insts[inst_slot]) { - if (inst_slot < issueWidth) { - ++inst_slot; - } else { - ++time_slot; - inst_slot = 0; - } - - assert(time_slot < 5); - } - - // May actually have to work this out, especially with loads and stores - - // Add finished instruction to queue to commit. - (*iewQueue)[time_slot].insts[inst_slot] = inst; - (*iewQueue)[time_slot].size++; - // Check if branch was correct. This check happens after the // instruction is added to the queue because even if the branch // is mispredicted, the branch instruction itself is still valid. 
// Only handle this if there hasn't already been something that // redirects fetch in this group of instructions. - if (!fetch_redirect) { - if (inst->mispredicted()) { - fetch_redirect = true; - DPRINTF(IEW, "IEW: Execute: Branch mispredict detected.\n"); - DPRINTF(IEW, "IEW: Execute: Redirecting fetch to PC: %#x.\n", + // This probably needs to prioritize the redirects if a different + // scheduler is used. Currently the scheduler schedules the oldest + // instruction first, so the branch resolution order will be correct. + unsigned tid = inst->threadNumber; + + if (!fetchRedirect[tid]) { + + if (inst->mispredicted()) { + fetchRedirect[tid] = true; + + DPRINTF(IEW, "Execute: Branch mispredict detected.\n"); + DPRINTF(IEW, "Execute: Redirecting fetch to PC: %#x.\n", inst->nextPC); // If incorrect, then signal the ROB that it must be squashed. - squashDueToBranch(inst); + squashDueToBranch(inst, tid); if (inst->predTaken()) { predictedTakenIncorrect++; + } else { + predictedNotTakenIncorrect++; } - } else if (ldstQueue.violation()) { - fetch_redirect = true; + } else if (ldstQueue.violation(tid)) { + fetchRedirect[tid] = true; - // Get the DynInst that caused the violation. - DynInstPtr violator = ldstQueue.getMemDepViolator(); + // Get the DynInst that caused the violation. Note that this + // clears the violation signal. + DynInstPtr violator; + violator = ldstQueue.getMemDepViolator(tid); - DPRINTF(IEW, "IEW: LDSTQ detected a violation. Violator PC: " + DPRINTF(IEW, "LDSTQ detected a violation. Violator PC: " "%#x, inst PC: %#x. Addr is: %#x.\n", violator->readPC(), inst->readPC(), inst->physEffAddr); @@ -573,129 +1197,42 @@ SimpleIEW::executeInsts() instQueue.violation(inst, violator); // Squash. 
- squashDueToMem(inst); + squashDueToMemOrder(inst,tid); ++memOrderViolationEvents; + } else if (ldstQueue.loadBlocked(tid) && + !ldstQueue.isLoadBlockedHandled(tid)) { + fetchRedirect[tid] = true; + + DPRINTF(IEW, "Load operation couldn't execute because the " + "memory system is blocked. PC: %#x [sn:%lli]\n", + inst->readPC(), inst->seqNum); + + squashDueToMemBlocked(inst, tid); } } } + + if (inst_num) { + if (exeStatus == Idle) { + exeStatus = Running; + } + + updatedQueues = true; + + cpu->activityThisCycle(); + } + + // Need to reset this in case a writeback event needs to write into the + // iew queue. That way the writeback event will write into the correct + // spot in the queue. + wbNumInst = 0; } -template +template void -SimpleIEW::tick() +DefaultIEW::writebackInsts() { - // Considering putting all the state-determining stuff in this section. - - // Try to fill up issue queue with as many instructions as bandwidth - // allows. - // Decode should try to execute as many instructions as its bandwidth - // will allow, as long as it is not currently blocked. - - // Check if the stage is in a running status. - if (_status != Blocked && _status != Squashing) { - DPRINTF(IEW, "IEW: Status is not blocked, attempting to run " - "stage.\n"); - iew(); - - // If it's currently unblocking, check to see if it should switch - // to running. - if (_status == Unblocking) { - unblock(); - - ++iewUnblockCycles; - } - } else if (_status == Squashing) { - - DPRINTF(IEW, "IEW: Still squashing.\n"); - - // Check if stage should remain squashing. Stop squashing if the - // squash signal clears. - if (!fromCommit->commitInfo.squash && - !fromCommit->commitInfo.robSquashing) { - DPRINTF(IEW, "IEW: Done squashing, changing status to " - "running.\n"); - - _status = Running; - instQueue.stopSquash(); - } else { - instQueue.doSquash(); - } - - ++iewSquashCycles; - } else if (_status == Blocked) { - // Continue to tell previous stage to stall. 
- toRename->iewInfo.stall = true; - - // Check if possible stall conditions have cleared. - if (!fromCommit->commitInfo.stall && - !instQueue.isFull()) { - DPRINTF(IEW, "IEW: Stall signals cleared, going to unblock.\n"); - _status = Unblocking; - } - - // If there's still instructions coming from rename, continue to - // put them on the skid buffer. - if (fromRename->size == 0) { - block(); - } - - if (fromCommit->commitInfo.squash || - fromCommit->commitInfo.robSquashing) { - squash(); - } - - ++iewBlockCycles; - } - - // @todo: Maybe put these at the beginning, so if it's idle it can - // return early. - // Write back number of free IQ entries here. - toRename->iewInfo.freeIQEntries = instQueue.numFreeEntries(); - - ldstQueue.writebackStores(); - - // Check the committed load/store signals to see if there's a load - // or store to commit. Also check if it's being told to execute a - // nonspeculative instruction. - // This is pretty inefficient... - if (!fromCommit->commitInfo.squash && - !fromCommit->commitInfo.robSquashing) { - ldstQueue.commitStores(fromCommit->commitInfo.doneSeqNum); - ldstQueue.commitLoads(fromCommit->commitInfo.doneSeqNum); - } - - if (fromCommit->commitInfo.nonSpecSeqNum != 0) { - instQueue.scheduleNonSpec(fromCommit->commitInfo.nonSpecSeqNum); - } - - DPRINTF(IEW, "IEW: IQ has %i free entries.\n", - instQueue.numFreeEntries()); -} - -template -void -SimpleIEW::iew() -{ - // Might want to put all state checks in the tick() function. - // Check if being told to stall from commit. - if (fromCommit->commitInfo.stall) { - block(); - return; - } else if (fromCommit->commitInfo.squash || - fromCommit->commitInfo.robSquashing) { - // Also check if commit is telling this stage to squash. - squash(); - return; - } - - dispatchInsts(); - - // Have the instruction queue try to schedule any ready instructions. - instQueue.scheduleReadyInsts(); - - executeInsts(); - // Loop through the head of the time buffer and wake any dependents. 
// These instructions are about to write back. In the simple model // this loop can really happen within the previous loop, but when @@ -704,33 +1241,152 @@ SimpleIEW::iew() // Either have IEW have direct access to rename map, or have this as // part of backwards communication. for (int inst_num = 0; inst_num < issueWidth && - toCommit->insts[inst_num]; inst_num++) - { + toCommit->insts[inst_num]; inst_num++) { DynInstPtr inst = toCommit->insts[inst_num]; - DPRINTF(IEW, "IEW: Sending instructions to commit, PC %#x.\n", + DPRINTF(IEW, "Sending instructions to commit, PC %#x.\n", inst->readPC()); - if(!inst->isSquashed()) { + // Some instructions will be sent to commit without having + // executed because they need commit to handle them. + // E.g. Uncached loads have not actually executed when they + // are first sent to commit. Instead commit must tell the LSQ + // when it's ready to execute the uncached load. + if (!inst->isSquashed() && inst->isExecuted()) { instQueue.wakeDependents(inst); - for (int i = 0; i < inst->numDestRegs(); i++) - { - renameMap->markAsReady(inst->renamedDestRegIdx(i)); + for (int i = 0; i < inst->numDestRegs(); i++) { + //mark as Ready + DPRINTF(IEW,"Setting Destination Register %i\n", + inst->renamedDestRegIdx(i)); + scoreboard->setReg(inst->renamedDestRegIdx(i)); } } } - - // Also should advance its own time buffers if the stage ran. - // Not the best place for it, but this works (hopefully). - issueToExecQueue.advance(); } -#if !FULL_SYSTEM template void -SimpleIEW::lsqWriteback() +DefaultIEW::tick() { - ldstQueue.writebackAllInsts(); + // Try to fill up issue queue with as many instructions as bandwidth + // allows. + wbNumInst = 0; + wbCycle = 0; + + wroteToTimeBuffer = false; + updatedQueues = false; + + sortInsts(); + + list::iterator threads = (*activeThreads).begin(); + + // Check stall and squash signals. 
+ while (threads != (*activeThreads).end()) { + unsigned tid = *threads++; + + DPRINTF(IEW,"Issue: Processing [tid:%i]\n",tid); + + checkSignalsAndUpdate(tid); + dispatch(tid); + + } + + if (exeStatus != Squashing) { + executeInsts(); + + writebackInsts(); + + // Have the instruction queue try to schedule any ready instructions. + // (In actuality, this scheduling is for instructions that will + // be executed next cycle.) + instQueue.scheduleReadyInsts(); + + // Also should advance its own time buffers if the stage ran. + // Not the best place for it, but this works (hopefully). + issueToExecQueue.advance(); + } + + bool broadcast_free_entries = false; + + if (updatedQueues || exeStatus == Running || updateLSQNextCycle) { + exeStatus = Idle; + updateLSQNextCycle = false; + + broadcast_free_entries = true; + } + + // Writeback any stores using any leftover bandwidth. + ldstQueue.writebackStores(); + + // Free function units marked as being freed this cycle. + fuPool->processFreeUnits(); + + // Check the committed load/store signals to see if there's a load + // or store to commit. Also check if it's being told to execute a + // nonspeculative instruction. + // This is pretty inefficient... 
+ + threads = (*activeThreads).begin(); + while (threads != (*activeThreads).end()) { + unsigned tid = (*threads++); + + DPRINTF(IEW,"Processing [tid:%i]\n",tid); + + if (fromCommit->commitInfo[tid].doneSeqNum != 0 && + !fromCommit->commitInfo[tid].squash && + !fromCommit->commitInfo[tid].robSquashing) { + + ldstQueue.commitStores(fromCommit->commitInfo[tid].doneSeqNum,tid); + + ldstQueue.commitLoads(fromCommit->commitInfo[tid].doneSeqNum,tid); + + updateLSQNextCycle = true; + instQueue.commit(fromCommit->commitInfo[tid].doneSeqNum,tid); + } + + if (fromCommit->commitInfo[tid].nonSpecSeqNum != 0) { + + //DPRINTF(IEW,"NonspecInst from thread %i",tid); + if (fromCommit->commitInfo[tid].uncached) { + instQueue.replayMemInst(fromCommit->commitInfo[tid].uncachedLoad); + } else { + instQueue.scheduleNonSpec( + fromCommit->commitInfo[tid].nonSpecSeqNum); + } + } + + if (broadcast_free_entries) { + toFetch->iewInfo[tid].iqCount = + instQueue.getCount(tid); + toFetch->iewInfo[tid].ldstqCount = + ldstQueue.getCount(tid); + + toRename->iewInfo[tid].usedIQ = true; + toRename->iewInfo[tid].freeIQEntries = + instQueue.numFreeEntries(); + toRename->iewInfo[tid].usedLSQ = true; + toRename->iewInfo[tid].freeLSQEntries = + ldstQueue.numFreeEntries(tid); + + wroteToTimeBuffer = true; + } + + DPRINTF(IEW, "[tid:%i], Dispatch dispatched %i instructions.\n", + tid, toRename->iewInfo[tid].dispatched); + + //thread_queue.pop(); + } + + DPRINTF(IEW, "IQ has %i free entries (Can schedule: %i). " + "LSQ has %i free entries.\n", + instQueue.numFreeEntries(), instQueue.hasReadyInsts(), + ldstQueue.numFreeEntries()); + + updateStatus(); + + if (wroteToTimeBuffer) { + DPRINTF(Activity, "Activity this cycle.\n"); + cpu->activityThisCycle(); + } } -#endif diff --git a/cpu/o3/inst_queue.hh b/cpu/o3/inst_queue.hh index 43fe96c49..283bbdc22 100644 --- a/cpu/o3/inst_queue.hh +++ b/cpu/o3/inst_queue.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __CPU_O3_CPU_INST_QUEUE_HH__ -#define __CPU_O3_CPU_INST_QUEUE_HH__ +#ifndef __CPU_O3_INST_QUEUE_HH__ +#define __CPU_O3_INST_QUEUE_HH__ #include #include @@ -37,8 +37,12 @@ #include "base/statistics.hh" #include "base/timebuf.hh" #include "cpu/inst_seq.hh" +#include "encumbered/cpu/full/op_class.hh" #include "sim/host.hh" +class FUPool; +class MemInterface; + /** * A standard instruction queue class. It holds ready instructions, in * order, in seperate priority queues to facilitate the scheduling of @@ -47,7 +51,14 @@ * floating point registers have their indices start after the integer * registers (ie with 96 int and 96 fp registers, regs 0-95 are integer * and 96-191 are fp). This remains true even for both logical and - * physical register indices. + * physical register indices. The IQ depends on the memory dependence unit to + * track when memory operations are ready in terms of ordering; register + * dependencies are tracked normally. Right now the IQ also handles the + * execution timing; this is mainly to allow back-to-back scheduling without + * requiring IEW to be able to peek into the IQ. At the end of the execution + * latency, the instruction is put into the queue to execute, where it will + * have the execute() function called on it. + * @todo: Make IQ able to handle multiple FU pools. */ template class InstructionQueue @@ -58,87 +69,178 @@ class InstructionQueue typedef typename Impl::DynInstPtr DynInstPtr; typedef typename Impl::Params Params; + typedef typename Impl::CPUPol::IEW IEW; typedef typename Impl::CPUPol::MemDepUnit MemDepUnit; typedef typename Impl::CPUPol::IssueStruct IssueStruct; typedef typename Impl::CPUPol::TimeStruct TimeStruct; - // Typedef of iterator through the list of instructions. Might be - // better to untie this from the FullCPU or pass its information to - // the stages. + // Typedef of iterator through the list of instructions. 
typedef typename std::list::iterator ListIt; - /** - * Struct for comparing entries to be added to the priority queue. This - * gives reverse ordering to the instructions in terms of sequence - * numbers: the instructions with smaller sequence numbers (and hence - * are older) will be at the top of the priority queue. - */ - struct pqCompare - { - bool operator() (const DynInstPtr &lhs, const DynInstPtr &rhs) const - { - return lhs->seqNum > rhs->seqNum; - } + friend class Impl::FullCPU; + + /** FU completion event class. */ + class FUCompletion : public Event { + private: + /** Executing instruction. */ + DynInstPtr inst; + + /** Index of the FU used for executing. */ + int fuIdx; + + /** Pointer back to the instruction queue. */ + InstructionQueue *iqPtr; + + public: + /** Construct a FU completion event. */ + FUCompletion(DynInstPtr &_inst, int fu_idx, + InstructionQueue *iq_ptr); + + virtual void process(); + virtual const char *description(); }; - /** - * Struct for comparing entries to be added to the set. This gives - * standard ordering in terms of sequence numbers. - */ - struct setCompare - { - bool operator() (const DynInstPtr &lhs, const DynInstPtr &rhs) const - { - return lhs->seqNum < rhs->seqNum; - } - }; + /** Constructs an IQ. */ + InstructionQueue(Params *params); - typedef std::priority_queue, pqCompare> - ReadyInstQueue; + /** Destructs the IQ. */ + ~InstructionQueue(); - InstructionQueue(Params ¶ms); + /** Returns the name of the IQ. */ + std::string name() const; + /** Registers statistics. */ void regStats(); - void setCPU(FullCPU *cpu); + /** Sets CPU pointer. */ + void setCPU(FullCPU *_cpu) { cpu = _cpu; } + /** Sets active threads list. */ + void setActiveThreads(std::list *at_ptr); + + /** Sets the IEW pointer. */ + void setIEW(IEW *iew_ptr) { iewStage = iew_ptr; } + + /** Sets the timer buffer between issue and execute. */ void setIssueToExecuteQueue(TimeBuffer *i2eQueue); + /** Sets the global time buffer. 
*/ void setTimeBuffer(TimeBuffer *tb_ptr); + /** Number of entries needed for given amount of threads. */ + int entryAmount(int num_threads); + + /** Resets max entries for all threads. */ + void resetEntries(); + + /** Returns total number of free entries. */ unsigned numFreeEntries(); + /** Returns number of free entries for a thread. */ + unsigned numFreeEntries(unsigned tid); + + /** Returns whether or not the IQ is full. */ bool isFull(); + /** Returns whether or not the IQ is full for a specific thread. */ + bool isFull(unsigned tid); + + /** Returns if there are any ready instructions in the IQ. */ + bool hasReadyInsts(); + + /** Inserts a new instruction into the IQ. */ void insert(DynInstPtr &new_inst); + /** Inserts a new, non-speculative instruction into the IQ. */ void insertNonSpec(DynInstPtr &new_inst); + /** Inserts a memory or write barrier into the IQ to make sure + * loads and stores are ordered properly. + */ + void insertBarrier(DynInstPtr &barr_inst); + + /** + * Advances the tail of the IQ, used if an instruction is not added to the + * IQ for scheduling. + * @todo: Rename this function. + */ void advanceTail(DynInstPtr &inst); + /** Process FU completion event. */ + void processFUCompletion(DynInstPtr &inst, int fu_idx); + + /** + * Schedules ready instructions, adding the ready ones (oldest first) to + * the queue to execute. + */ void scheduleReadyInsts(); + /** Schedules a single specific non-speculative instruction. */ void scheduleNonSpec(const InstSeqNum &inst); + /** + * Commits all instructions up to and including the given sequence number, + * for a specific thread. + */ + void commit(const InstSeqNum &inst, unsigned tid = 0); + + /** Wakes all dependents of a completed instruction. */ void wakeDependents(DynInstPtr &completed_inst); + /** Adds a ready memory instruction to the ready list. */ + void addReadyMemInst(DynInstPtr &ready_inst); + + /** + * Reschedules a memory instruction. 
It will be ready to issue once + * replayMemInst() is called. + */ + void rescheduleMemInst(DynInstPtr &resched_inst); + + /** Replays a memory instruction. It must be rescheduled first. */ + void replayMemInst(DynInstPtr &replay_inst); + + /** Completes a memory operation. */ + void completeMemInst(DynInstPtr &completed_inst); + + /** Indicates an ordering violation between a store and a load. */ void violation(DynInstPtr &store, DynInstPtr &faulting_load); - // Change this to take in the sequence number - void squash(); + /** + * Squashes instructions for a thread. Squashing information is obtained + * from the time buffer. + */ + void squash(unsigned tid); - void doSquash(); + /** Returns the number of used entries for a thread. */ + unsigned getCount(unsigned tid) { return count[tid]; }; - void stopSquash(); + /** Updates the number of free entries. */ + void updateFreeEntries(int num) { freeEntries += num; } + + /** Debug function to print all instructions. */ + void printInsts(); private: + /** Does the actual squashing. */ + void doSquash(unsigned tid); + + ///////////////////////// + // Various pointers + ///////////////////////// + /** Pointer to the CPU. */ FullCPU *cpu; + /** Cache interface. */ + MemInterface *dcacheInterface; + + /** Pointer to IEW stage. */ + IEW *iewStage; + /** The memory dependence unit, which tracks/predicts memory dependences * between instructions. */ - MemDepUnit memDepUnit; + MemDepUnit memDepUnit[Impl::MaxThreads]; /** The queue to the execute stage. Issued instructions will be written * into it. @@ -151,36 +253,45 @@ class InstructionQueue /** Wire to read information from timebuffer. */ typename TimeBuffer::wire fromCommit; - enum InstList { - Int, - Float, - Branch, - Memory, - Misc, - Squashed, - None + /** Function unit pool. 
*/ + FUPool *fuPool; + + ////////////////////////////////////// + // Instruction lists, ready queues, and ordering + ////////////////////////////////////// + + /** List of all the instructions in the IQ (some of which may be issued). */ + std::list instList[Impl::MaxThreads]; + + /** + * Struct for comparing entries to be added to the priority queue. This + * gives reverse ordering to the instructions in terms of sequence + * numbers: the instructions with smaller sequence numbers (and hence + * are older) will be at the top of the priority queue. + */ + struct pqCompare { + bool operator() (const DynInstPtr &lhs, const DynInstPtr &rhs) const + { + return lhs->seqNum > rhs->seqNum; + } }; - /** List of ready int instructions. Used to keep track of the order in - * which instructions should issue. + /** + * Struct for an IQ entry. It includes the instruction and an iterator + * to the instruction's spot in the IQ. */ - ReadyInstQueue readyIntInsts; + struct IQEntry { + DynInstPtr inst; + ListIt iqIt; + }; - /** List of ready floating point instructions. */ - ReadyInstQueue readyFloatInsts; + typedef std::priority_queue, pqCompare> + ReadyInstQueue; - /** List of ready branch instructions. */ - ReadyInstQueue readyBranchInsts; - - /** List of ready miscellaneous instructions. */ - ReadyInstQueue readyMiscInsts; - - /** List of squashed instructions (which are still valid and in IQ). - * Implemented using a priority queue; the entries must contain both - * the IQ index and sequence number of each instruction so that - * ordering based on sequence numbers can be used. + /** List of ready instructions, per op class. They are separated by op + * class to allow for easy mapping to FUs. */ - ReadyInstQueue squashedInsts; + ReadyInstQueue readyInsts[Num_OpClasses]; /** List of non-speculative instructions that will be scheduled * once the IQ gets a signal from commit. 
While it's redundant to @@ -188,10 +299,68 @@ class InstructionQueue * inside of DynInst), when these instructions are woken up only * the sequence number will be available. Thus it is most efficient to be * able to search by the sequence number alone. + * @todo: Maybe change this to a priority queue per thread. */ std::map nonSpecInsts; - typedef typename std::map::iterator non_spec_it_t; + typedef typename std::map::iterator NonSpecMapIt; + + /** Entry for the list age ordering by op class. */ + struct ListOrderEntry { + OpClass queueType; + InstSeqNum oldestInst; + }; + + /** List that contains the age order of the oldest instruction of each + * ready queue. Used to select the oldest instruction available + * among op classes. + */ + std::list listOrder; + + typedef typename std::list::iterator ListOrderIt; + + /** Tracks if each ready queue is on the age order list. */ + bool queueOnList[Num_OpClasses]; + + /** Iterators of each ready queue. Points to their spot in the age order + * list. + */ + ListOrderIt readyIt[Num_OpClasses]; + + /** Add an op class to the age order list. */ + void addToOrderList(OpClass op_class); + + /** + * Called when the oldest instruction has been removed from a ready queue; + * this places that ready queue into the proper spot in the age order list. + */ + void moveToYoungerInst(ListOrderIt age_order_it); + + ////////////////////////////////////// + // Various parameters + ////////////////////////////////////// + + /** IQ Resource Sharing Policy */ + enum IQPolicy { + Dynamic, + Partitioned, + Threshold + }; + + /** IQ sharing policy for SMT. */ + IQPolicy iqPolicy; + + /** Number of Total Threads*/ + unsigned numThreads; + + /** Pointer to list of active threads. */ + std::list *activeThreads; + + /** Per Thread IQ count */ + unsigned count[Impl::MaxThreads]; + + /** Max IQ Entries Per Thread */ + unsigned maxEntries[Impl::MaxThreads]; /** Number of free IQ entries left. 
*/ unsigned freeEntries; @@ -199,26 +368,10 @@ class InstructionQueue /** The number of entries in the instruction queue. */ unsigned numEntries; - /** The number of integer instructions that can be issued in one - * cycle. - */ - unsigned intWidth; - - /** The number of floating point instructions that can be issued - * in one cycle. - */ - unsigned floatWidth; - - /** The number of branches that can be issued in one cycle. */ - unsigned branchWidth; - - /** The number of memory instructions that can be issued in one cycle. */ - unsigned memoryWidth; - /** The total number of instructions that can be issued in one cycle. */ unsigned totalWidth; - //The number of physical registers in the CPU. + /** The number of physical registers in the CPU. */ unsigned numPhysRegs; /** The number of physical integer registers in the CPU. */ @@ -237,15 +390,12 @@ class InstructionQueue ////////////////////////////////// /** The sequence number of the squashed instruction. */ - InstSeqNum squashedSeqNum; - - /** Iterator that points to the youngest instruction in the IQ. */ - ListIt tail; + InstSeqNum squashedSeqNum[Impl::MaxThreads]; /** Iterator that points to the last instruction that has been squashed. * This will not be valid unless the IQ is in the process of squashing. */ - ListIt squashIt; + ListIt squashIt[Impl::MaxThreads]; /////////////////////////////////// // Dependency graph stuff @@ -254,6 +404,10 @@ class InstructionQueue class DependencyEntry { public: + DependencyEntry() + : inst(NULL), next(NULL) + { } + DynInstPtr inst; //Might want to include data about what arch. register the //dependence is waiting on. @@ -288,15 +442,17 @@ class InstructionQueue * is basically a secondary scoreboard, and should pretty much mirror * the scoreboard that exists in the rename map. */ - vector regScoreboard; + std::vector regScoreboard; + /** Adds an instruction to the dependency graph, as a producer. 
*/ bool addToDependents(DynInstPtr &new_inst); - void insertDependency(DynInstPtr &new_inst); + + /** Adds an instruction to the dependency graph, as a consumer. */ void createDependency(DynInstPtr &new_inst); + /** Moves an instruction to the ready queue if it is ready. */ void addIfReady(DynInstPtr &inst); - private: /** Debugging function to count how many entries are in the IQ. It does * a linear walk through the instructions, so do not call this function * during normal execution. @@ -313,24 +469,42 @@ class InstructionQueue */ void dumpLists(); + /** Debugging function to dump out all instructions that are in the + * IQ. + */ + void dumpInsts(); + + /** Stat for number of instructions added. */ Stats::Scalar<> iqInstsAdded; + /** Stat for number of non-speculative instructions added. */ Stats::Scalar<> iqNonSpecInstsAdded; // Stats::Scalar<> iqIntInstsAdded; + /** Stat for number of integer instructions issued. */ Stats::Scalar<> iqIntInstsIssued; // Stats::Scalar<> iqFloatInstsAdded; + /** Stat for number of floating point instructions issued. */ Stats::Scalar<> iqFloatInstsIssued; // Stats::Scalar<> iqBranchInstsAdded; + /** Stat for number of branch instructions issued. */ Stats::Scalar<> iqBranchInstsIssued; // Stats::Scalar<> iqMemInstsAdded; + /** Stat for number of memory instructions issued. */ Stats::Scalar<> iqMemInstsIssued; // Stats::Scalar<> iqMiscInstsAdded; + /** Stat for number of miscellaneous instructions issued. */ Stats::Scalar<> iqMiscInstsIssued; + /** Stat for number of squashed instructions that were ready to issue. */ Stats::Scalar<> iqSquashedInstsIssued; - Stats::Scalar<> iqLoopSquashStalls; + /** Stat for number of squashed instructions examined when squashing. */ Stats::Scalar<> iqSquashedInstsExamined; + /** Stat for number of squashed instruction operands examined when + * squashing. + */ Stats::Scalar<> iqSquashedOperandsExamined; + /** Stat for number of non-speculative instructions removed due to a squash. 
+ */ Stats::Scalar<> iqSquashedNonSpecRemoved; }; -#endif //__CPU_O3_CPU_INST_QUEUE_HH__ +#endif //__CPU_O3_INST_QUEUE_HH__ diff --git a/cpu/o3/inst_queue_impl.hh b/cpu/o3/inst_queue_impl.hh index 048dc7c00..cfdd25cd5 100644 --- a/cpu/o3/inst_queue_impl.hh +++ b/cpu/o3/inst_queue_impl.hh @@ -39,32 +39,63 @@ #include "sim/root.hh" +#include "cpu/o3/fu_pool.hh" #include "cpu/o3/inst_queue.hh" -// Either compile error or max int due to sign extension. -// Hack to avoid compile warnings. -const InstSeqNum MaxInstSeqNum = std::numeric_limits::max(); +using namespace std; template -InstructionQueue::InstructionQueue(Params ¶ms) - : memDepUnit(params), - numEntries(params.numIQEntries), - intWidth(params.executeIntWidth), - floatWidth(params.executeFloatWidth), - branchWidth(params.executeBranchWidth), - memoryWidth(params.executeMemoryWidth), - totalWidth(params.issueWidth), - numPhysIntRegs(params.numPhysIntRegs), - numPhysFloatRegs(params.numPhysFloatRegs), - commitToIEWDelay(params.commitToIEWDelay) +InstructionQueue::FUCompletion::FUCompletion(DynInstPtr &_inst, + int fu_idx, + InstructionQueue *iq_ptr) + : Event(&mainEventQueue, Stat_Event_Pri), + inst(_inst), fuIdx(fu_idx), iqPtr(iq_ptr) { + this->setFlags(Event::AutoDelete); +} + +template +void +InstructionQueue::FUCompletion::process() +{ + iqPtr->processFUCompletion(inst, fuIdx); + inst = NULL; +} + + +template +const char * +InstructionQueue::FUCompletion::description() +{ + return "Functional unit completion event"; +} + +template +InstructionQueue::InstructionQueue(Params *params) + : dcacheInterface(params->dcacheInterface), + fuPool(params->fuPool), + numEntries(params->numIQEntries), + totalWidth(params->issueWidth), + numPhysIntRegs(params->numPhysIntRegs), + numPhysFloatRegs(params->numPhysFloatRegs), + commitToIEWDelay(params->commitToIEWDelay) +{ + assert(fuPool); + + numThreads = params->numberOfThreads; + + //Initialize thread IQ counts + for (int i = 0; i ::InstructionQueue(Params ¶ms) // Resize 
the register scoreboard. regScoreboard.resize(numPhysRegs); + //Initialize Mem Dependence Units + for (int i = 0; i < numThreads; i++) { + memDepUnit[i].init(params,i); + memDepUnit[i].setIQ(this); + } + // Initialize all the head pointers to point to NULL, and all the // entries as unready. // Note that in actuality, the registers corresponding to the logical @@ -80,13 +117,107 @@ InstructionQueue::InstructionQueue(Params ¶ms) // IQ as the instruction should have been correctly told if those // registers are ready in rename. Thus it can all be initialized as // unready. - for (int i = 0; i < numPhysRegs; ++i) - { + for (int i = 0; i < numPhysRegs; ++i) { dependGraph[i].next = NULL; dependGraph[i].inst = NULL; regScoreboard[i] = false; } + for (int i = 0; i < numThreads; ++i) { + squashedSeqNum[i] = 0; + } + + for (int i = 0; i < Num_OpClasses; ++i) { + queueOnList[i] = false; + readyIt[i] = listOrder.end(); + } + + string policy = params->smtIQPolicy; + + //Convert string to lowercase + std::transform(policy.begin(), policy.end(), policy.begin(), + (int(*)(int)) tolower); + + //Figure out resource sharing policy + if (policy == "dynamic") { + iqPolicy = Dynamic; + + //Set Max Entries to Total ROB Capacity + for (int i = 0; i < numThreads; i++) { + maxEntries[i] = numEntries; + } + + } else if (policy == "partitioned") { + iqPolicy = Partitioned; + + //@todo:make work if part_amt doesnt divide evenly. 
+ int part_amt = numEntries / numThreads; + + //Divide ROB up evenly + for (int i = 0; i < numThreads; i++) { + maxEntries[i] = part_amt; + } + + DPRINTF(Fetch, "IQ sharing policy set to Partitioned:" + "%i entries per thread.\n",part_amt); + + } else if (policy == "threshold") { + iqPolicy = Threshold; + + double threshold = (double)params->smtIQThreshold / 100; + + int thresholdIQ = (int)((double)threshold * numEntries); + + //Divide up by threshold amount + for (int i = 0; i < numThreads; i++) { + maxEntries[i] = thresholdIQ; + } + + DPRINTF(Fetch, "IQ sharing policy set to Threshold:" + "%i entries per thread.\n",thresholdIQ); + } else { + assert(0 && "Invalid IQ Sharing Policy.Options Are:{Dynamic," + "Partitioned, Threshold}"); + } +} + +template +InstructionQueue::~InstructionQueue() +{ + // Clear the dependency graph + DependencyEntry *curr; + DependencyEntry *prev; + + for (int i = 0; i < numPhysRegs; ++i) { + curr = dependGraph[i].next; + + while (curr) { + DependencyEntry::mem_alloc_counter--; + + prev = curr; + curr = prev->next; + prev->inst = NULL; + + delete prev; + } + + if (dependGraph[i].inst) { + dependGraph[i].inst = NULL; + } + + dependGraph[i].next = NULL; + } + + assert(DependencyEntry::mem_alloc_counter == 0); + + delete [] dependGraph; +} + +template +std::string +InstructionQueue::name() const +{ + return cpu->name() + ".iq"; } template @@ -143,12 +274,6 @@ InstructionQueue::regStats() .desc("Number of squashed instructions issued") .prereq(iqSquashedInstsIssued); - iqLoopSquashStalls - .name(name() + ".iqLoopSquashStalls") - .desc("Number of times issue loop had to restart due to squashed " - "inst; mainly for profiling") - .prereq(iqLoopSquashStalls); - iqSquashedInstsExamined .name(name() + ".iqSquashedInstsExamined") .desc("Number of squashed instructions iterated over during squash;" @@ -166,25 +291,25 @@ InstructionQueue::regStats() .desc("Number of squashed non-spec instructions that were removed") .prereq(iqSquashedNonSpecRemoved); 
- // Tell mem dependence unit to reg stats as well. - memDepUnit.regStats(); + for ( int i=0; i < numThreads; i++) { + // Tell mem dependence unit to reg stats as well. + memDepUnit[i].regStats(); + } } template void -InstructionQueue::setCPU(FullCPU *cpu_ptr) +InstructionQueue::setActiveThreads(list *at_ptr) { - cpu = cpu_ptr; - - tail = cpu->instList.begin(); + DPRINTF(IQ, "Setting active threads list pointer.\n"); + activeThreads = at_ptr; } template void -InstructionQueue::setIssueToExecuteQueue( - TimeBuffer *i2e_ptr) +InstructionQueue::setIssueToExecuteQueue(TimeBuffer *i2e_ptr) { - DPRINTF(IQ, "IQ: Set the issue to execute queue.\n"); + DPRINTF(IQ, "Set the issue to execute queue.\n"); issueToExecuteQueue = i2e_ptr; } @@ -192,12 +317,44 @@ template void InstructionQueue::setTimeBuffer(TimeBuffer *tb_ptr) { - DPRINTF(IQ, "IQ: Set the time buffer.\n"); + DPRINTF(IQ, "Set the time buffer.\n"); timeBuffer = tb_ptr; fromCommit = timeBuffer->getWire(-commitToIEWDelay); } +template +int +InstructionQueue::entryAmount(int num_threads) +{ + if (iqPolicy == Partitioned) { + return numEntries / num_threads; + } else { + return 0; + } +} + + +template +void +InstructionQueue::resetEntries() +{ + if (iqPolicy != Dynamic || numThreads > 1) { + int active_threads = (*activeThreads).size(); + + list::iterator threads = (*activeThreads).begin(); + list::iterator list_end = (*activeThreads).end(); + + while (threads != list_end) { + if (iqPolicy == Partitioned) { + maxEntries[*threads++] = numEntries / active_threads; + } else if(iqPolicy == Threshold && active_threads == 1) { + maxEntries[*threads++] = numEntries; + } + } + } +} + template unsigned InstructionQueue::numFreeEntries() @@ -205,6 +362,13 @@ InstructionQueue::numFreeEntries() return freeEntries; } +template +unsigned +InstructionQueue::numFreeEntries(unsigned tid) +{ + return maxEntries[tid] - count[tid]; +} + // Might want to do something more complex if it knows how many instructions // will be issued this 
cycle. template @@ -218,6 +382,34 @@ InstructionQueue::isFull() } } +template +bool +InstructionQueue::isFull(unsigned tid) +{ + if (numFreeEntries(tid) == 0) { + return(true); + } else { + return(false); + } +} + +template +bool +InstructionQueue::hasReadyInsts() +{ + if (!listOrder.empty()) { + return true; + } + + for (int i = 0; i < Num_OpClasses; ++i) { + if (!readyInsts[i].empty()) { + return true; + } + } + + return false; +} + template void InstructionQueue::insert(DynInstPtr &new_inst) @@ -225,7 +417,7 @@ InstructionQueue::insert(DynInstPtr &new_inst) // Make sure the instruction is valid assert(new_inst); - DPRINTF(IQ, "IQ: Adding instruction PC %#x to the IQ.\n", + DPRINTF(IQ, "Adding instruction PC %#x to the IQ.\n", new_inst->readPC()); // Check if there are any free entries. Panic if there are none. @@ -233,26 +425,14 @@ InstructionQueue::insert(DynInstPtr &new_inst) // panicing. assert(freeEntries != 0); - // If the IQ currently has nothing in it, then there's a possibility - // that the tail iterator is invalid (might have been pointing at an - // instruction that was retired). Reset the tail iterator. - if (freeEntries == numEntries) { - tail = cpu->instList.begin(); - } - - // Move the tail iterator. Instructions may not have been issued - // to the IQ, so we may have to increment the iterator more than once. - while ((*tail) != new_inst) { - tail++; - - // Make sure the tail iterator points at something legal. - assert(tail != cpu->instList.end()); - } - + instList[new_inst->threadNumber].push_back(new_inst); // Decrease the number of free entries. --freeEntries; + //Mark Instruction as in IQ + new_inst->setInIQ(); + // Look through its source registers (physical regs), and mark any // dependencies. addToDependents(new_inst); @@ -264,9 +444,7 @@ InstructionQueue::insert(DynInstPtr &new_inst) // If it's a memory instruction, add it to the memory dependency // unit. 
if (new_inst->isMemRef()) { - memDepUnit.insert(new_inst); - // Uh..forgot to look it up and put it on the proper dependency list - // if the instruction should not go yet. + memDepUnit[new_inst->threadNumber].insert(new_inst); } else { // If the instruction is ready then add it to the ready list. addIfReady(new_inst); @@ -274,105 +452,145 @@ InstructionQueue::insert(DynInstPtr &new_inst) ++iqInstsAdded; + + //Update Thread IQ Count + count[new_inst->threadNumber]++; + assert(freeEntries == (numEntries - countInsts())); } template void -InstructionQueue::insertNonSpec(DynInstPtr &inst) +InstructionQueue::insertNonSpec(DynInstPtr &new_inst) { - nonSpecInsts[inst->seqNum] = inst; - // @todo: Clean up this code; can do it by setting inst as unable // to issue, then calling normal insert on the inst. // Make sure the instruction is valid - assert(inst); + assert(new_inst); - DPRINTF(IQ, "IQ: Adding instruction PC %#x to the IQ.\n", - inst->readPC()); + nonSpecInsts[new_inst->seqNum] = new_inst; + + DPRINTF(IQ, "Adding instruction PC %#x to the IQ.\n", + new_inst->readPC()); // Check if there are any free entries. Panic if there are none. // Might want to have this return a fault in the future instead of // panicing. assert(freeEntries != 0); - // If the IQ currently has nothing in it, then there's a possibility - // that the tail iterator is invalid (might have been pointing at an - // instruction that was retired). Reset the tail iterator. - if (freeEntries == numEntries) { - tail = cpu->instList.begin(); - } - - // Move the tail iterator. Instructions may not have been issued - // to the IQ, so we may have to increment the iterator more than once. - while ((*tail) != inst) { - tail++; - - // Make sure the tail iterator points at something legal. - assert(tail != cpu->instList.end()); - } + instList[new_inst->threadNumber].push_back(new_inst); // Decrease the number of free entries. 
--freeEntries; + //Mark Instruction as in IQ + new_inst->setInIQ(); + // Have this instruction set itself as the producer of its destination // register(s). - createDependency(inst); + createDependency(new_inst); // If it's a memory instruction, add it to the memory dependency // unit. - if (inst->isMemRef()) { - memDepUnit.insertNonSpec(inst); + if (new_inst->isMemRef()) { + memDepUnit[new_inst->threadNumber].insertNonSpec(new_inst); } ++iqNonSpecInstsAdded; + + //Update Thread IQ Count + count[new_inst->threadNumber]++; + + assert(freeEntries == (numEntries - countInsts())); +} + +template +void +InstructionQueue::insertBarrier(DynInstPtr &barr_inst) +{ + memDepUnit[barr_inst->threadNumber].insertBarrier(barr_inst); + + insertNonSpec(barr_inst); } -// Slightly hack function to advance the tail iterator in the case that -// the IEW stage issues an instruction that is not added to the IQ. This -// is needed in case a long chain of such instructions occurs. -// I don't think this is used anymore. template void InstructionQueue::advanceTail(DynInstPtr &inst) { - // Make sure the instruction is valid - assert(inst); - - DPRINTF(IQ, "IQ: Adding instruction PC %#x to the IQ.\n", - inst->readPC()); - - // Check if there are any free entries. Panic if there are none. - // Might want to have this return a fault in the future instead of - // panicing. - assert(freeEntries != 0); - - // If the IQ currently has nothing in it, then there's a possibility - // that the tail iterator is invalid (might have been pointing at an - // instruction that was retired). Reset the tail iterator. - if (freeEntries == numEntries) { - tail = cpu->instList.begin(); - } - - // Move the tail iterator. Instructions may not have been issued - // to the IQ, so we may have to increment the iterator more than once. - while ((*tail) != inst) { - tail++; - - // Make sure the tail iterator points at something legal. 
- assert(tail != cpu->instList.end()); - } - - assert(freeEntries <= numEntries); - // Have this instruction set itself as the producer of its destination // register(s). createDependency(inst); } -// Need to make sure the number of float and integer instructions -// issued does not exceed the total issue bandwidth. +template +void +InstructionQueue::addToOrderList(OpClass op_class) +{ + assert(!readyInsts[op_class].empty()); + + ListOrderEntry queue_entry; + + queue_entry.queueType = op_class; + + queue_entry.oldestInst = readyInsts[op_class].top()->seqNum; + + ListOrderIt list_it = listOrder.begin(); + ListOrderIt list_end_it = listOrder.end(); + + while (list_it != list_end_it) { + if ((*list_it).oldestInst > queue_entry.oldestInst) { + break; + } + + list_it++; + } + + readyIt[op_class] = listOrder.insert(list_it, queue_entry); + queueOnList[op_class] = true; +} + +template +void +InstructionQueue::moveToYoungerInst(ListOrderIt list_order_it) +{ + // Get iterator of next item on the list + // Delete the original iterator + // Determine if the next item is either the end of the list or younger + // than the new instruction. If so, then add in a new iterator right here. + // If not, then move along. + ListOrderEntry queue_entry; + OpClass op_class = (*list_order_it).queueType; + ListOrderIt next_it = list_order_it; + + ++next_it; + + queue_entry.queueType = op_class; + queue_entry.oldestInst = readyInsts[op_class].top()->seqNum; + + while (next_it != listOrder.end() && + (*next_it).oldestInst < queue_entry.oldestInst) { + ++next_it; + } + + readyIt[op_class] = listOrder.insert(next_it, queue_entry); +} + +template +void +InstructionQueue::processFUCompletion(DynInstPtr &inst, int fu_idx) +{ + // The CPU could have been sleeping until this op completed (*extremely* + // long latency op). Wake it if it was. This may be overkill. 
+ iewStage->wakeCPU(); + + fuPool->freeUnit(fu_idx); + + int &size = issueToExecuteQueue->access(0)->size; + + issueToExecuteQueue->access(0)->insts[size++] = inst; +} + // @todo: Figure out a better way to remove the squashed items from the // lists. Checking the top item of each list to see if it's squashed // wastes time and forces jumps. @@ -380,258 +598,181 @@ template void InstructionQueue::scheduleReadyInsts() { - DPRINTF(IQ, "IQ: Attempting to schedule ready instructions from " - "the IQ.\n"); - - int int_issued = 0; - int float_issued = 0; - int branch_issued = 0; - int memory_issued = 0; - int squashed_issued = 0; - int total_issued = 0; + DPRINTF(IQ, "Attempting to schedule ready instructions from " + "the IQ.\n"); IssueStruct *i2e_info = issueToExecuteQueue->access(0); - bool insts_available = !readyBranchInsts.empty() || - !readyIntInsts.empty() || - !readyFloatInsts.empty() || - !memDepUnit.empty() || - !readyMiscInsts.empty() || - !squashedInsts.empty(); - - // Note: Requires a globally defined constant. - InstSeqNum oldest_inst = MaxInstSeqNum; - InstList list_with_oldest = None; - - // Temporary values. - DynInstPtr int_head_inst; - DynInstPtr float_head_inst; - DynInstPtr branch_head_inst; - DynInstPtr mem_head_inst; - DynInstPtr misc_head_inst; - DynInstPtr squashed_head_inst; - - // Somewhat nasty code to look at all of the lists where issuable - // instructions are located, and choose the oldest instruction among - // those lists. Consider a rewrite in the future. - while (insts_available && total_issued < totalWidth) - { - // Set this to false. Each if-block is required to set it to true - // if there were instructions available this check. This will cause - // this loop to run once more than necessary, but avoids extra calls. 
- insts_available = false; - - oldest_inst = MaxInstSeqNum; - - list_with_oldest = None; - - if (!readyIntInsts.empty() && - int_issued < intWidth) { - - insts_available = true; - - int_head_inst = readyIntInsts.top(); - - if (int_head_inst->isSquashed()) { - readyIntInsts.pop(); - - ++iqLoopSquashStalls; - - continue; - } - - oldest_inst = int_head_inst->seqNum; - - list_with_oldest = Int; - } - - if (!readyFloatInsts.empty() && - float_issued < floatWidth) { - - insts_available = true; - - float_head_inst = readyFloatInsts.top(); - - if (float_head_inst->isSquashed()) { - readyFloatInsts.pop(); - - ++iqLoopSquashStalls; - - continue; - } else if (float_head_inst->seqNum < oldest_inst) { - oldest_inst = float_head_inst->seqNum; - - list_with_oldest = Float; + // Will need to reorder the list if either a queue is not on the list, + // or it has an older instruction than last time. + for (int i = 0; i < Num_OpClasses; ++i) { + if (!readyInsts[i].empty()) { + if (!queueOnList[i]) { + addToOrderList(OpClass(i)); + } else if (readyInsts[i].top()->seqNum < + (*readyIt[i]).oldestInst) { + listOrder.erase(readyIt[i]); + addToOrderList(OpClass(i)); } } - - if (!readyBranchInsts.empty() && - branch_issued < branchWidth) { - - insts_available = true; - - branch_head_inst = readyBranchInsts.top(); - - if (branch_head_inst->isSquashed()) { - readyBranchInsts.pop(); - - ++iqLoopSquashStalls; - - continue; - } else if (branch_head_inst->seqNum < oldest_inst) { - oldest_inst = branch_head_inst->seqNum; - - list_with_oldest = Branch; - } - - } - - if (!memDepUnit.empty() && - memory_issued < memoryWidth) { - - insts_available = true; - - mem_head_inst = memDepUnit.top(); - - if (mem_head_inst->isSquashed()) { - memDepUnit.pop(); - - ++iqLoopSquashStalls; - - continue; - } else if (mem_head_inst->seqNum < oldest_inst) { - oldest_inst = mem_head_inst->seqNum; - - list_with_oldest = Memory; - } - } - - if (!readyMiscInsts.empty()) { - - insts_available = true; - - misc_head_inst = 
readyMiscInsts.top(); - - if (misc_head_inst->isSquashed()) { - readyMiscInsts.pop(); - - ++iqLoopSquashStalls; - - continue; - } else if (misc_head_inst->seqNum < oldest_inst) { - oldest_inst = misc_head_inst->seqNum; - - list_with_oldest = Misc; - } - } - - if (!squashedInsts.empty()) { - - insts_available = true; - - squashed_head_inst = squashedInsts.top(); - - if (squashed_head_inst->seqNum < oldest_inst) { - list_with_oldest = Squashed; - } - - } - - DynInstPtr issuing_inst = NULL; - - switch (list_with_oldest) { - case None: - DPRINTF(IQ, "IQ: Not able to schedule any instructions. Issuing " - "inst is %#x.\n", issuing_inst); - break; - - case Int: - issuing_inst = int_head_inst; - readyIntInsts.pop(); - ++int_issued; - DPRINTF(IQ, "IQ: Issuing integer instruction PC %#x.\n", - issuing_inst->readPC()); - break; - - case Float: - issuing_inst = float_head_inst; - readyFloatInsts.pop(); - ++float_issued; - DPRINTF(IQ, "IQ: Issuing float instruction PC %#x.\n", - issuing_inst->readPC()); - break; - - case Branch: - issuing_inst = branch_head_inst; - readyBranchInsts.pop(); - ++branch_issued; - DPRINTF(IQ, "IQ: Issuing branch instruction PC %#x.\n", - issuing_inst->readPC()); - break; - - case Memory: - issuing_inst = mem_head_inst; - - memDepUnit.pop(); - ++memory_issued; - DPRINTF(IQ, "IQ: Issuing memory instruction PC %#x.\n", - issuing_inst->readPC()); - break; - - case Misc: - issuing_inst = misc_head_inst; - readyMiscInsts.pop(); - - ++iqMiscInstsIssued; - - DPRINTF(IQ, "IQ: Issuing a miscellaneous instruction PC %#x.\n", - issuing_inst->readPC()); - break; - - case Squashed: - assert(0 && "Squashed insts should not issue any more!"); - squashedInsts.pop(); - // Set the squashed instruction as able to commit so that commit - // can just drop it from the ROB. This is a bit faked. 
- ++squashed_issued; - ++freeEntries; - - DPRINTF(IQ, "IQ: Issuing squashed instruction PC %#x.\n", - squashed_head_inst->readPC()); - break; - } - - if (list_with_oldest != None && list_with_oldest != Squashed) { - i2e_info->insts[total_issued] = issuing_inst; - i2e_info->size++; - - issuing_inst->setIssued(); - - ++freeEntries; - ++total_issued; - } - - assert(freeEntries == (numEntries - countInsts())); } - iqIntInstsIssued += int_issued; - iqFloatInstsIssued += float_issued; - iqBranchInstsIssued += branch_issued; - iqMemInstsIssued += memory_issued; - iqSquashedInstsIssued += squashed_issued; + // Have iterator to head of the list + // While I haven't exceeded bandwidth or reached the end of the list, + // Try to get a FU that can do what this op needs. + // If successful, change the oldestInst to the new top of the list, put + // the queue in the proper place in the list. + // Increment the iterator. + // This will avoid trying to schedule a certain op class if there are no + // FUs that handle it. 
+ ListOrderIt order_it = listOrder.begin(); + ListOrderIt order_end_it = listOrder.end(); + int total_issued = 0; + int exec_queue_slot = i2e_info->size; + + while (exec_queue_slot < totalWidth && order_it != order_end_it) { + OpClass op_class = (*order_it).queueType; + + assert(!readyInsts[op_class].empty()); + + DynInstPtr issuing_inst = readyInsts[op_class].top(); + + assert(issuing_inst->seqNum == (*order_it).oldestInst); + + if (issuing_inst->isSquashed()) { + readyInsts[op_class].pop(); + + if (!readyInsts[op_class].empty()) { + moveToYoungerInst(order_it); + } else { + readyIt[op_class] = listOrder.end(); + queueOnList[op_class] = false; + } + + listOrder.erase(order_it++); + + ++iqSquashedInstsIssued; + + continue; + } + + int idx = fuPool->getUnit(op_class); + + if (idx == -2) { + assert(op_class == No_OpClass); + + i2e_info->insts[exec_queue_slot++] = issuing_inst; + i2e_info->size++; + + DPRINTF(IQ, "Thread %i: Issuing instruction PC that needs no FU" + " %#x [sn:%lli]\n", + issuing_inst->threadNumber, issuing_inst->readPC(), + issuing_inst->seqNum); + + readyInsts[op_class].pop(); + + if (!readyInsts[op_class].empty()) { + moveToYoungerInst(order_it); + } else { + readyIt[op_class] = listOrder.end(); + queueOnList[op_class] = false; + } + + issuing_inst->setIssued(); + ++total_issued; + + if (!issuing_inst->isMemRef()) { + // Memory instructions can not be freed from the IQ until they + // complete. + ++freeEntries; + count[issuing_inst->threadNumber]--; + issuing_inst->removeInIQ(); + } else { + memDepUnit[issuing_inst->threadNumber].issue(issuing_inst); + } + + listOrder.erase(order_it++); + + } else if (idx != -1) { + int op_latency = fuPool->getOpLatency(op_class); + + if (op_latency == 1) { + i2e_info->insts[exec_queue_slot++] = issuing_inst; + i2e_info->size++; + + // Add the FU onto the list of FU's to be freed next cycle. 
+ fuPool->freeUnit(idx); + } else { + int issue_latency = fuPool->getIssueLatency(op_class); + + if (issue_latency > 1) { + // Generate completion event for the FU + FUCompletion *execution = new FUCompletion(issuing_inst, + idx, this); + + execution->schedule(curTick + issue_latency - 1); + } else { + i2e_info->insts[exec_queue_slot++] = issuing_inst; + i2e_info->size++; + + // Add the FU onto the list of FU's to be freed next cycle. + fuPool->freeUnit(idx); + } + } + + DPRINTF(IQ, "Thread %i: Issuing instruction PC %#x " + "[sn:%lli]\n", + issuing_inst->threadNumber, issuing_inst->readPC(), + issuing_inst->seqNum); + + readyInsts[op_class].pop(); + + if (!readyInsts[op_class].empty()) { + moveToYoungerInst(order_it); + } else { + readyIt[op_class] = listOrder.end(); + queueOnList[op_class] = false; + } + + issuing_inst->setIssued(); + ++total_issued; + + if (!issuing_inst->isMemRef()) { + // Memory instructions can not be freed from the IQ until they + // complete. + ++freeEntries; + count[issuing_inst->threadNumber]--; + issuing_inst->removeInIQ(); + } else { + memDepUnit[issuing_inst->threadNumber].issue(issuing_inst); + } + + listOrder.erase(order_it++); + } else { + ++order_it; + } + } + + if (total_issued) { + cpu->activityThisCycle(); + } else { + DPRINTF(IQ, "Not able to schedule any instructions.\n"); + } } template void InstructionQueue::scheduleNonSpec(const InstSeqNum &inst) { - DPRINTF(IQ, "IQ: Marking nonspeculative instruction with sequence " - "number %i as ready to execute.\n", inst); + DPRINTF(IQ, "Marking nonspeculative instruction [sn:%lli] as ready " + "to execute.\n", inst); - non_spec_it_t inst_it = nonSpecInsts.find(inst); + NonSpecMapIt inst_it = nonSpecInsts.find(inst); assert(inst_it != nonSpecInsts.end()); + unsigned tid = (*inst_it).second->threadNumber; + // Mark this instruction as ready to issue. 
(*inst_it).second->setCanIssue(); @@ -639,27 +780,58 @@ InstructionQueue::scheduleNonSpec(const InstSeqNum &inst) if (!(*inst_it).second->isMemRef()) { addIfReady((*inst_it).second); } else { - memDepUnit.nonSpecInstReady((*inst_it).second); + memDepUnit[tid].nonSpecInstReady((*inst_it).second); } + (*inst_it).second = NULL; + nonSpecInsts.erase(inst_it); } +template +void +InstructionQueue::commit(const InstSeqNum &inst, unsigned tid) +{ + /*Need to go through each thread??*/ + DPRINTF(IQ, "[tid:%i]: Committing instructions older than [sn:%i]\n", + tid,inst); + + ListIt iq_it = instList[tid].begin(); + + while (iq_it != instList[tid].end() && + (*iq_it)->seqNum <= inst) { + ++iq_it; + instList[tid].pop_front(); + } + + assert(freeEntries == (numEntries - countInsts())); +} + template void InstructionQueue::wakeDependents(DynInstPtr &completed_inst) { - DPRINTF(IQ, "IQ: Waking dependents of completed instruction.\n"); - //Look at the physical destination register of the DynInst - //and look it up on the dependency graph. Then mark as ready - //any instructions within the instruction queue. + DPRINTF(IQ, "Waking dependents of completed instruction.\n"); + + assert(!completed_inst->isSquashed()); + // Look at the physical destination register of the DynInst + // and look it up on the dependency graph. Then mark as ready + // any instructions within the instruction queue. DependencyEntry *curr; + DependencyEntry *prev; // Tell the memory dependence unit to wake any dependents on this - // instruction if it is a memory instruction. - + // instruction if it is a memory instruction. Also complete the memory + // instruction at this point since we know it executed fine. + // @todo: Might want to rename "completeMemInst" to + // something that indicates that it won't need to be replayed, and call + // this earlier. Might not be a big deal. 
if (completed_inst->isMemRef()) { - memDepUnit.wakeDependents(completed_inst); + memDepUnit[completed_inst->threadNumber].wakeDependents(completed_inst); + completeMemInst(completed_inst); + } else if (completed_inst->isMemBarrier() || + completed_inst->isWriteBarrier()) { + memDepUnit[completed_inst->threadNumber].completeBarrier(completed_inst); } for (int dest_reg_idx = 0; @@ -676,17 +848,17 @@ InstructionQueue::wakeDependents(DynInstPtr &completed_inst) continue; } - DPRINTF(IQ, "IQ: Waking any dependents on register %i.\n", + DPRINTF(IQ, "Waking any dependents on register %i.\n", (int) dest_reg); //Maybe abstract this part into a function. //Go through the dependency chain, marking the registers as ready //within the waiting instructions. - while (dependGraph[dest_reg].next) { - curr = dependGraph[dest_reg].next; + curr = dependGraph[dest_reg].next; - DPRINTF(IQ, "IQ: Waking up a dependent instruction, PC%#x.\n", + while (curr) { + DPRINTF(IQ, "Waking up a dependent instruction, PC%#x.\n", curr->inst->readPC()); // Might want to give more information to the instruction @@ -697,13 +869,13 @@ InstructionQueue::wakeDependents(DynInstPtr &completed_inst) addIfReady(curr->inst); - dependGraph[dest_reg].next = curr->next; - DependencyEntry::mem_alloc_counter--; - curr->inst = NULL; + prev = curr; + curr = prev->next; + prev->inst = NULL; - delete curr; + delete prev; } // Reset the head node now that all of its dependents have been woken @@ -716,63 +888,116 @@ InstructionQueue::wakeDependents(DynInstPtr &completed_inst) } } +template +void +InstructionQueue::addReadyMemInst(DynInstPtr &ready_inst) +{ + OpClass op_class = ready_inst->opClass(); + + readyInsts[op_class].push(ready_inst); + + DPRINTF(IQ, "Instruction is ready to issue, putting it onto " + "the ready list, PC %#x opclass:%i [sn:%lli].\n", + ready_inst->readPC(), op_class, ready_inst->seqNum); +} + +template +void +InstructionQueue::rescheduleMemInst(DynInstPtr &resched_inst) +{ + 
memDepUnit[resched_inst->threadNumber].reschedule(resched_inst); +} + +template +void +InstructionQueue::replayMemInst(DynInstPtr &replay_inst) +{ + memDepUnit[replay_inst->threadNumber].replay(replay_inst); +} + +template +void +InstructionQueue::completeMemInst(DynInstPtr &completed_inst) +{ + int tid = completed_inst->threadNumber; + + DPRINTF(IQ, "Completing mem instruction PC:%#x [sn:%lli]\n", + completed_inst->readPC(), completed_inst->seqNum); + + ++freeEntries; + + completed_inst->memOpDone = true; + + memDepUnit[tid].completed(completed_inst); + + count[tid]--; +} + template void InstructionQueue::violation(DynInstPtr &store, DynInstPtr &faulting_load) { - memDepUnit.violation(store, faulting_load); + memDepUnit[store->threadNumber].violation(store, faulting_load); } template void -InstructionQueue::squash() +InstructionQueue::squash(unsigned tid) { - DPRINTF(IQ, "IQ: Starting to squash instructions in the IQ.\n"); + DPRINTF(IQ, "[tid:%i]: Starting to squash instructions in " + "the IQ.\n", tid); // Read instruction sequence number of last instruction out of the // time buffer. - squashedSeqNum = fromCommit->commitInfo.doneSeqNum; + squashedSeqNum[tid] = fromCommit->commitInfo[tid].doneSeqNum; // Setup the squash iterator to point to the tail. - squashIt = tail; + squashIt[tid] = instList[tid].end(); + --squashIt[tid]; // Call doSquash if there are insts in the IQ - if (freeEntries != numEntries) { - doSquash(); + if (count[tid] > 0) { + doSquash(tid); } // Also tell the memory dependence unit to squash. - memDepUnit.squash(squashedSeqNum); + memDepUnit[tid].squash(squashedSeqNum[tid], tid); } template void -InstructionQueue::doSquash() +InstructionQueue::doSquash(unsigned tid) { - // Make sure the squash iterator isn't pointing to nothing. - assert(squashIt != cpu->instList.end()); // Make sure the squashed sequence number is valid. 
- assert(squashedSeqNum != 0); +// assert(squashedSeqNum[tid] != 0); - DPRINTF(IQ, "IQ: Squashing instructions in the IQ.\n"); + DPRINTF(IQ, "[tid:%i]: Squashing until sequence number %i!\n", + tid, squashedSeqNum[tid]); // Squash any instructions younger than the squashed sequence number // given. - while ((*squashIt)->seqNum > squashedSeqNum) { - DynInstPtr squashed_inst = (*squashIt); + while (squashIt[tid] != instList[tid].end() && + (*squashIt[tid])->seqNum > squashedSeqNum[tid]) { + + DynInstPtr squashed_inst = (*squashIt[tid]); // Only handle the instruction if it actually is in the IQ and // hasn't already been squashed in the IQ. - if (!squashed_inst->isIssued() && - !squashed_inst->isSquashedInIQ()) { + if (squashed_inst->threadNumber != tid || + squashed_inst->isSquashedInIQ()) { + --squashIt[tid]; + continue; + } + + if (!squashed_inst->isIssued() || + (squashed_inst->isMemRef() && + !squashed_inst->memOpDone)) { // Remove the instruction from the dependency list. - // Hack for now: These below don't add themselves to the - // dependency list, so don't try to remove them. - if (!squashed_inst->isNonSpeculative()/* && - !squashed_inst->isStore()*/ - ) { + if (!squashed_inst->isNonSpeculative() && + !squashed_inst->isMemBarrier() && + !squashed_inst->isWriteBarrier()) { for (int src_reg_idx = 0; src_reg_idx < squashed_inst->numSrcRegs(); @@ -787,19 +1012,29 @@ InstructionQueue::doSquash() // dependency chain aren't informed that a specific src // register has become ready. This may not always be true // in the future. + // Instead of doing a linked list traversal, we can just + // remove these squashed instructions either at issue time, + // or when the register is overwritten. The only downside + // to this is it leaves more room for error. + if (!squashed_inst->isReadySrcRegIdx(src_reg_idx) && src_reg < numPhysRegs) { dependGraph[src_reg].remove(squashed_inst); } + ++iqSquashedOperandsExamined; } // Might want to remove producers as well. 
} else { - nonSpecInsts[squashed_inst->seqNum] = NULL; + NonSpecMapIt ns_inst_it = + nonSpecInsts.find(squashed_inst->seqNum); + assert(ns_inst_it != nonSpecInsts.end()); - nonSpecInsts.erase(squashed_inst->seqNum); + (*ns_inst_it).second = NULL; + + nonSpecInsts.erase(ns_inst_it); ++iqSquashedNonSpecRemoved; } @@ -809,37 +1044,30 @@ InstructionQueue::doSquash() // Mark it as squashed within the IQ. squashed_inst->setSquashedInIQ(); -// squashedInsts.push(squashed_inst); + // @todo: Remove this hack where several statuses are set so the + // inst will flow through the rest of the pipeline. squashed_inst->setIssued(); squashed_inst->setCanCommit(); + squashed_inst->removeInIQ(); + + //Update Thread IQ Count + count[squashed_inst->threadNumber]--; ++freeEntries; - DPRINTF(IQ, "IQ: Instruction PC %#x squashed.\n", - squashed_inst->readPC()); + if (numThreads > 1) { + DPRINTF(IQ, "[tid:%i]: Instruction [sn:%lli] PC %#x " + "squashed.\n", + tid, squashed_inst->seqNum, squashed_inst->readPC()); + } else { + DPRINTF(IQ, "Instruction [sn:%lli] PC %#x squashed.\n", + squashed_inst->seqNum, squashed_inst->readPC()); + } } - --squashIt; + instList[tid].erase(squashIt[tid]--); ++iqSquashedInstsExamined; } - - assert(freeEntries <= numEntries); - - if (freeEntries == numEntries) { - tail = cpu->instList.end(); - } - -} - -template -void -InstructionQueue::stopSquash() -{ - // Clear up the squash variables to ensure that squashing doesn't - // get called improperly. - squashedSeqNum = 0; - - squashIt = cpu->instList.end(); } template @@ -877,8 +1105,7 @@ InstructionQueue::DependencyEntry::remove(DynInstPtr &inst_to_remove) } // Find the instruction to remove within the dependency linked list. 
- while(curr->inst != inst_to_remove) - { + while (curr->inst != inst_to_remove) { prev = curr; curr = curr->next; @@ -920,7 +1147,7 @@ InstructionQueue::addToDependents(DynInstPtr &new_inst) if (src_reg >= numPhysRegs) { continue; } else if (regScoreboard[src_reg] == false) { - DPRINTF(IQ, "IQ: Instruction PC %#x has src reg %i that " + DPRINTF(IQ, "Instruction PC %#x has src reg %i that " "is being added to the dependency chain.\n", new_inst->readPC(), src_reg); @@ -930,7 +1157,7 @@ InstructionQueue::addToDependents(DynInstPtr &new_inst) // was added to the dependency graph. return_val = true; } else { - DPRINTF(IQ, "IQ: Instruction PC %#x has src reg %i that " + DPRINTF(IQ, "Instruction PC %#x has src reg %i that " "became ready before it reached the IQ.\n", new_inst->readPC(), src_reg); // Mark a register ready within the instruction. @@ -966,13 +1193,13 @@ InstructionQueue::createDependency(DynInstPtr &new_inst) continue; } - dependGraph[dest_reg].inst = new_inst; - if (dependGraph[dest_reg].next) { dumpDependGraph(); - panic("IQ: Dependency graph not empty!"); + panic("Dependency graph %i not empty!", dest_reg); } + dependGraph[dest_reg].inst = new_inst; + // Mark the scoreboard to say it's not yet ready. regScoreboard[dest_reg] = false; } @@ -987,96 +1214,62 @@ InstructionQueue::addIfReady(DynInstPtr &inst) if (inst->readyToIssue()) { //Add the instruction to the proper ready list. - if (inst->isControl()) { + if (inst->isMemRef()) { - DPRINTF(IQ, "IQ: Branch instruction is ready to issue, " - "putting it onto the ready list, PC %#x.\n", - inst->readPC()); - readyBranchInsts.push(inst); - - } else if (inst->isMemRef()) { - - DPRINTF(IQ, "IQ: Checking if memory instruction can issue.\n"); + DPRINTF(IQ, "Checking if memory instruction can issue.\n"); // Message to the mem dependence unit that this instruction has // its registers ready. 
- memDepUnit.regsReady(inst); + memDepUnit[inst->threadNumber].regsReady(inst); -#if 0 - if (memDepUnit.readyToIssue(inst)) { - DPRINTF(IQ, "IQ: Memory instruction is ready to issue, " - "putting it onto the ready list, PC %#x.\n", - inst->readPC()); - readyMemInsts.push(inst); - } else { - // Make dependent on the store. - // Will need some way to get the store instruction it should - // be dependent upon; then when the store issues it can - // put the instruction on the ready list. - // Yet another tree? - assert(0 && "Instruction has no way to actually issue"); - } -#endif - - } else if (inst->isInteger()) { - - DPRINTF(IQ, "IQ: Integer instruction is ready to issue, " - "putting it onto the ready list, PC %#x.\n", - inst->readPC()); - readyIntInsts.push(inst); - - } else if (inst->isFloating()) { - - DPRINTF(IQ, "IQ: Floating instruction is ready to issue, " - "putting it onto the ready list, PC %#x.\n", - inst->readPC()); - readyFloatInsts.push(inst); - - } else { - DPRINTF(IQ, "IQ: Miscellaneous instruction is ready to issue, " - "putting it onto the ready list, PC %#x..\n", - inst->readPC()); - - readyMiscInsts.push(inst); + return; } + + OpClass op_class = inst->opClass(); + + DPRINTF(IQ, "Instruction is ready to issue, putting it onto " + "the ready list, PC %#x opclass:%i [sn:%lli].\n", + inst->readPC(), op_class, inst->seqNum); + + readyInsts[op_class].push(inst); } } -/* - * Caution, this function must not be called prior to tail being updated at - * least once, otherwise it will fail the assertion. This is because - * instList.begin() actually changes upon the insertion of an element into the - * list when the list is empty. - */ template int InstructionQueue::countInsts() { - ListIt count_it = cpu->instList.begin(); + //ksewell:This works but definitely could use a cleaner write + //with a more intuitive way of counting. Right now it's + //just brute force .... 
+ +#if 0 int total_insts = 0; - if (tail == cpu->instList.end()) - return 0; + for (int i = 0; i < numThreads; ++i) { + ListIt count_it = instList[i].begin(); - while (count_it != tail) { - if (!(*count_it)->isIssued()) { - ++total_insts; + while (count_it != instList[i].end()) { + if (!(*count_it)->isSquashed() && !(*count_it)->isSquashedInIQ()) { + if (!(*count_it)->isIssued()) { + ++total_insts; + } else if ((*count_it)->isMemRef() && + !(*count_it)->memOpDone) { + // Loads that have not been marked as executed still count + // towards the total instructions. + ++total_insts; + } + } + + ++count_it; } - - ++count_it; - - assert(count_it != cpu->instList.end()); - } - - // Need to count the tail iterator as well. - if (count_it != cpu->instList.end() && - (*count_it) && - !(*count_it)->isIssued()) { - ++total_insts; } return total_insts; +#else + return numEntries - freeEntries; +#endif } template @@ -1090,8 +1283,8 @@ InstructionQueue::dumpDependGraph() curr = &dependGraph[i]; if (curr->inst) { - cprintf("dependGraph[%i]: producer: %#x consumer: ", i, - curr->inst->readPC()); + cprintf("dependGraph[%i]: producer: %#x [sn:%lli] consumer: ", + i, curr->inst->readPC(), curr->inst->seqNum); } else { cprintf("dependGraph[%i]: No producer. 
consumer: ", i); } @@ -1099,7 +1292,8 @@ InstructionQueue::dumpDependGraph() while (curr->next != NULL) { curr = curr->next; - cprintf("%#x ", curr->inst->readPC()); + cprintf("%#x [sn:%lli] ", + curr->inst->readPC(), curr->inst->seqNum); } cprintf("\n"); @@ -1110,27 +1304,87 @@ template void InstructionQueue::dumpLists() { - cprintf("Ready integer list size: %i\n", readyIntInsts.size()); + for (int i = 0; i < Num_OpClasses; ++i) { + cprintf("Ready list %i size: %i\n", i, readyInsts[i].size()); - cprintf("Ready float list size: %i\n", readyFloatInsts.size()); - - cprintf("Ready branch list size: %i\n", readyBranchInsts.size()); - - cprintf("Ready misc list size: %i\n", readyMiscInsts.size()); - - cprintf("Squashed list size: %i\n", squashedInsts.size()); + cprintf("\n"); + } cprintf("Non speculative list size: %i\n", nonSpecInsts.size()); - non_spec_it_t non_spec_it = nonSpecInsts.begin(); + NonSpecMapIt non_spec_it = nonSpecInsts.begin(); + NonSpecMapIt non_spec_end_it = nonSpecInsts.end(); cprintf("Non speculative list: "); - while (non_spec_it != nonSpecInsts.end()) { - cprintf("%#x ", (*non_spec_it).second->readPC()); + while (non_spec_it != non_spec_end_it) { + cprintf("%#x [sn:%lli]", (*non_spec_it).second->readPC(), + (*non_spec_it).second->seqNum); ++non_spec_it; } cprintf("\n"); + ListOrderIt list_order_it = listOrder.begin(); + ListOrderIt list_order_end_it = listOrder.end(); + int i = 1; + + cprintf("List order: "); + + while (list_order_it != list_order_end_it) { + cprintf("%i OpClass:%i [sn:%lli] ", i, (*list_order_it).queueType, + (*list_order_it).oldestInst); + + ++list_order_it; + ++i; + } + + cprintf("\n"); +} + + +template +void +InstructionQueue::dumpInsts() +{ + for (int i = 0; i < numThreads; ++i) { + int num = 0; + int valid_num = 0; + ListIt inst_list_it = instList[i].begin(); + + while (inst_list_it != instList[i].end()) + { + cprintf("Instruction:%i\n", + num); + if (!(*inst_list_it)->isSquashed()) { + if (!(*inst_list_it)->isIssued()) { + 
++valid_num; + cprintf("Count:%i\n", valid_num); + } else if ((*inst_list_it)->isMemRef() && + !(*inst_list_it)->memOpDone) { + // Loads that have not been marked as executed still count + // towards the total instructions. + ++valid_num; + cprintf("Count:%i\n", valid_num); + } + } + + cprintf("PC:%#x\n[sn:%lli]\n[tid:%i]\n" + "Issued:%i\nSquashed:%i\n", + (*inst_list_it)->readPC(), + (*inst_list_it)->seqNum, + (*inst_list_it)->threadNumber, + (*inst_list_it)->isIssued(), + (*inst_list_it)->isSquashed()); + + if ((*inst_list_it)->isMemRef()) { + cprintf("MemOpDone:%i\n", (*inst_list_it)->memOpDone); + } + + cprintf("\n"); + + inst_list_it++; + ++num; + } + } } diff --git a/cpu/o3/lsq.cc b/cpu/o3/lsq.cc new file mode 100644 index 000000000..8991ab8f8 --- /dev/null +++ b/cpu/o3/lsq.cc @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2004-2005 The Regents of The University of Michigan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "cpu/o3/alpha_dyn_inst.hh" +#include "cpu/o3/alpha_cpu.hh" +#include "cpu/o3/alpha_impl.hh" +#include "cpu/o3/lsq_impl.hh" + +// Force the instantiation of LDSTQ for all the implementations we care about. +template class LSQ; + diff --git a/cpu/o3/lsq.hh b/cpu/o3/lsq.hh new file mode 100644 index 000000000..c59b5f13b --- /dev/null +++ b/cpu/o3/lsq.hh @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2004-2005 The Regents of The University of Michigan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __CPU_O3_LSQ_HH__ +#define __CPU_O3_LSQ_HH__ + +#include +#include + +#include "base/hashmap.hh" +#include "config/full_system.hh" +#include "cpu/inst_seq.hh" +#include "cpu/o3/cpu_policy.hh" +#include "cpu/o3/lsq_unit.hh" +#include "mem/mem_interface.hh" +//#include "mem/page_table.hh" +#include "sim/sim_object.hh" + +template +class LSQ { + public: + typedef typename Impl::Params Params; + typedef typename Impl::FullCPU FullCPU; + typedef typename Impl::DynInstPtr DynInstPtr; + typedef typename Impl::CPUPol::IEW IEW; + typedef typename Impl::CPUPol::LSQUnit LSQUnit; + + enum LSQPolicy { + Dynamic, + Partitioned, + Threshold + }; + + /** Constructs an LSQ with the given parameters. */ + LSQ(Params *params); + + /** Returns the name of the LSQ. */ + std::string name() const; + + /** Sets the pointer to the list of active threads. */ + void setActiveThreads(std::list *at_ptr); + /** Sets the CPU pointer. */ + void setCPU(FullCPU *cpu_ptr); + /** Sets the IEW stage pointer. */ + void setIEW(IEW *iew_ptr); + /** Sets the page table pointer. */ +// void setPageTable(PageTable *pt_ptr); + + /** Number of entries needed for the given amount of threads.*/ + int entryAmount(int num_threads); + void removeEntries(unsigned tid); + /** Reset the max entries for each thread. */ + void resetEntries(); + /** Resize the max entries for a thread. */ + void resizeEntries(unsigned size, unsigned tid); + + /** Ticks the LSQ. 
*/ + void tick(); + /** Ticks a specific LSQ Unit. */ + void tick(unsigned tid); + + /** Inserts a load into the LSQ. */ + void insertLoad(DynInstPtr &load_inst); + /** Inserts a store into the LSQ. */ + void insertStore(DynInstPtr &store_inst); + + /** Executes a load. */ + Fault executeLoad(DynInstPtr &inst); + + Fault executeLoad(int lq_idx, unsigned tid); + /** Executes a store. */ + Fault executeStore(DynInstPtr &inst); + + /** + * Commits loads up until the given sequence number for a specific thread. + */ + void commitLoads(InstSeqNum &youngest_inst, unsigned tid); + /** + * Commits stores up until the given sequence number for a specific thread. + */ + void commitStores(InstSeqNum &youngest_inst, unsigned tid); + + /** + * Attempts to write back stores until all cache ports are used or the + * interface becomes blocked. + */ + void writebackStores(); + /** Same as above, but only for one thread. */ + void writebackStores(unsigned tid); + + /** + * Squash instructions from a thread until the specified sequence number. + */ + void squash(const InstSeqNum &squashed_num, unsigned tid); + + /** Returns whether or not there was a memory ordering violation. */ + bool violation(); + /** + * Returns whether or not there was a memory ordering violation for a + * specific thread. + */ + bool violation(unsigned tid); + + /** Returns if a load is blocked due to the memory system for a specific + * thread. + */ + bool loadBlocked(unsigned tid); + + bool isLoadBlockedHandled(unsigned tid) + { return thread[tid].isLoadBlockedHandled(); } + + void setLoadBlockedHandled(unsigned tid) + { thread[tid].setLoadBlockedHandled(); } + + /** Gets the instruction that caused the memory ordering violation. */ + DynInstPtr getMemDepViolator(unsigned tid); + + /** Returns the head index of the load queue for a specific thread. */ + int getLoadHead(unsigned tid); + /** Returns the sequence number of the head of the load queue. 
*/ + InstSeqNum getLoadHeadSeqNum(unsigned tid) + { + return thread[tid].getLoadHeadSeqNum(); + } + + /** Returns the head index of the store queue. */ + int getStoreHead(unsigned tid); + /** Returns the sequence number of the head of the store queue. */ + InstSeqNum getStoreHeadSeqNum(unsigned tid) + { + return thread[tid].getStoreHeadSeqNum(); + } + + /** Returns the number of instructions in all of the queues. */ + int getCount(); + /** Returns the number of instructions in the queues of one thread. */ + int getCount(unsigned tid); + + /** Returns the total number of loads in the load queue. */ + int numLoads(); + /** Returns the total number of loads for a single thread. */ + int numLoads(unsigned tid); + + /** Returns the total number of stores in the store queue. */ + int numStores(); + /** Returns the total number of stores for a single thread. */ + int numStores(unsigned tid); + + /** Returns the total number of loads that are ready. */ + int numLoadsReady(); + /** Returns the number of loads that are ready for a single thread. */ + int numLoadsReady(unsigned tid); + + /** Returns the number of free entries. */ + unsigned numFreeEntries(); + /** Returns the number of free entries for a specific thread. */ + unsigned numFreeEntries(unsigned tid); + + /** Returns if the LSQ is full (either LQ or SQ is full). */ + bool isFull(); + /** + * Returns if the LSQ is full for a specific thread (either LQ or SQ is + * full). + */ + bool isFull(unsigned tid); + + /** Returns if any of the LQs are full. */ + bool lqFull(); + /** Returns if the LQ of a given thread is full. */ + bool lqFull(unsigned tid); + + /** Returns if any of the SQs are full. */ + bool sqFull(); + /** Returns if the SQ of a given thread is full. */ + bool sqFull(unsigned tid); + + /** + * Returns if the LSQ is stalled due to a memory operation that must be + * replayed. 
+ */ + bool isStalled(); + /** + * Returns if the LSQ of a specific thread is stalled due to a memory + * operation that must be replayed. + */ + bool isStalled(unsigned tid); + + /** Returns whether or not there are any stores to write back to memory. */ + bool hasStoresToWB(); + /** Returns whether or not a specific thread has any stores to write back + * to memory. + */ + bool hasStoresToWB(unsigned tid); + /** Returns the number of stores a specific thread has to write back. */ + int numStoresToWB(unsigned tid); + + /** Returns if the LSQ will write back to memory this cycle. */ + bool willWB(); + /** Returns if the LSQ of a specific thread will write back to memory this + * cycle. + */ + bool willWB(unsigned tid); + + /** Debugging function to print out all instructions. */ + void dumpInsts(); + /** Debugging function to print out instructions from a specific thread. */ + void dumpInsts(unsigned tid); + + /** Executes a read operation, using the load specified at the load index. */ + template + Fault read(MemReqPtr &req, T &data, int load_idx); + + /** Executes a store operation, using the store specified at the store + * index. + */ + template + Fault write(MemReqPtr &req, T &data, int store_idx); + + private: + /** The LSQ policy for SMT mode. */ + LSQPolicy lsqPolicy; + + /** The LSQ units for individual threads. */ + LSQUnit thread[Impl::MaxThreads]; + + /** The CPU pointer. */ + FullCPU *cpu; + + /** The IEW stage pointer. */ + IEW *iewStage; + + /** The pointer to the page table. */ +// PageTable *pTable; + + /** List of Active Threads in System. */ + std::list *activeThreads; + + /** Total Size of LQ Entries. */ + unsigned LQEntries; + /** Total Size of SQ Entries. */ + unsigned SQEntries; + + /** Max LQ Size - Used to Enforce Sharing Policies. */ + unsigned maxLQEntries; + + /** Max SQ Size - Used to Enforce Sharing Policies. */ + unsigned maxSQEntries; + + /** Global Load Count. 
*/ + int loads; + + /** Global Store Count */ + int stores; + + /** Global Store To WB Count */ + int storesToWB; + + /** Number of Threads. */ + unsigned numThreads; +}; + +template +template +Fault +LSQ::read(MemReqPtr &req, T &data, int load_idx) +{ + unsigned tid = req->thread_num; + + return thread[tid].read(req, data, load_idx); +} + +template +template +Fault +LSQ::write(MemReqPtr &req, T &data, int store_idx) +{ + unsigned tid = req->thread_num; + + return thread[tid].write(req, data, store_idx); +} + +#endif // __CPU_O3_LSQ_HH__ diff --git a/cpu/o3/lsq_impl.hh b/cpu/o3/lsq_impl.hh new file mode 100644 index 000000000..523517869 --- /dev/null +++ b/cpu/o3/lsq_impl.hh @@ -0,0 +1,645 @@ +/* + * Copyright (c) 2004-2005 The Regents of The University of Michigan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "cpu/o3/lsq.hh" + +using namespace std; + +template +LSQ::LSQ(Params *params) + : LQEntries(params->LQEntries), SQEntries(params->SQEntries), + loads(0), stores(0), storesToWB(0), + numThreads(params->numberOfThreads) +{ + DPRINTF(LSQ, "Creating LSQ object.\n"); + + //**********************************************/ + //************ Handle SMT Parameters ***********/ + //**********************************************/ + string policy = params->smtLSQPolicy; + + //Convert string to lowercase + std::transform(policy.begin(), policy.end(), policy.begin(), + (int(*)(int)) tolower); + + //Figure out fetch policy + if (policy == "dynamic") { + lsqPolicy = Dynamic; + + maxLQEntries = LQEntries; + maxSQEntries = SQEntries; + + DPRINTF(LSQ, "LSQ sharing policy set to Dynamic\n"); + + } else if (policy == "partitioned") { + lsqPolicy = Partitioned; + + //@todo:make work if part_amt doesnt divide evenly. 
+ maxLQEntries = LQEntries / numThreads; + maxSQEntries = SQEntries / numThreads; + + DPRINTF(Fetch, "LSQ sharing policy set to Partitioned: " + "%i entries per LQ | %i entries per SQ", + maxLQEntries,maxSQEntries); + + } else if (policy == "threshold") { + lsqPolicy = Threshold; + + assert(params->smtLSQThreshold > LQEntries); + assert(params->smtLSQThreshold > SQEntries); + + //Divide up by threshold amount + //@todo: Should threads check the max and the total + //amount of the LSQ + maxLQEntries = params->smtLSQThreshold; + maxSQEntries = params->smtLSQThreshold; + + DPRINTF(LSQ, "LSQ sharing policy set to Threshold: " + "%i entries per LQ | %i entries per SQ", + maxLQEntries,maxSQEntries); + + } else { + assert(0 && "Invalid LSQ Sharing Policy.Options Are:{Dynamic," + "Partitioned, Threshold}"); + } + + //Initialize LSQs + for (int tid=0; tid < numThreads; tid++) { + thread[tid].init(params, maxLQEntries+1, maxSQEntries+1, tid); + } +} + + +template +std::string +LSQ::name() const +{ + return iewStage->name() + ".lsq"; +} + +template +void +LSQ::setActiveThreads(list *at_ptr) +{ + activeThreads = at_ptr; + assert(activeThreads != 0); +} + +template +void +LSQ::setCPU(FullCPU *cpu_ptr) +{ + cpu = cpu_ptr; + + for (int tid=0; tid < numThreads; tid++) { + thread[tid].setCPU(cpu_ptr); + } +} + +template +void +LSQ::setIEW(IEW *iew_ptr) +{ + iewStage = iew_ptr; + + for (int tid=0; tid < numThreads; tid++) { + thread[tid].setIEW(iew_ptr); + } +} + +#if 0 +template +void +LSQ::setPageTable(PageTable *pt_ptr) +{ + for (int tid=0; tid < numThreads; tid++) { + thread[tid].setPageTable(pt_ptr); + } +} +#endif + +template +int +LSQ::entryAmount(int num_threads) +{ + if (lsqPolicy == Partitioned) { + return LQEntries / num_threads; + } else { + return 0; + } +} + +template +void +LSQ::resetEntries() +{ + if (lsqPolicy != Dynamic || numThreads > 1) { + int active_threads = (*activeThreads).size(); + + list::iterator threads = (*activeThreads).begin(); + list::iterator 
list_end = (*activeThreads).end(); + + int maxEntries; + + if (lsqPolicy == Partitioned) { + maxEntries = LQEntries / active_threads; + } else if (lsqPolicy == Threshold && active_threads == 1) { + maxEntries = LQEntries; + } else { + maxEntries = LQEntries; + } + + while (threads != list_end) { + resizeEntries(maxEntries,*threads++); + } + } +} + +template +void +LSQ::removeEntries(unsigned tid) +{ + thread[tid].clearLQ(); + thread[tid].clearSQ(); +} + +template +void +LSQ::resizeEntries(unsigned size,unsigned tid) +{ + thread[tid].resizeLQ(size); + thread[tid].resizeSQ(size); +} + +template +void +LSQ::tick() +{ + list::iterator active_threads = (*activeThreads).begin(); + + while (active_threads != (*activeThreads).end()) { + unsigned tid = *active_threads++; + + thread[tid].tick(); + } +} + +template +void +LSQ::tick(unsigned tid) +{ + thread[tid].tick(); +} + +template +void +LSQ::insertLoad(DynInstPtr &load_inst) +{ + unsigned tid = load_inst->threadNumber; + + thread[tid].insertLoad(load_inst); +} + +template +void +LSQ::insertStore(DynInstPtr &store_inst) +{ + unsigned tid = store_inst->threadNumber; + + thread[tid].insertStore(store_inst); +} + +template +Fault +LSQ::executeLoad(DynInstPtr &inst) +{ + unsigned tid = inst->threadNumber; + + return thread[tid].executeLoad(inst); +} + +template +Fault +LSQ::executeLoad(int lq_idx, unsigned tid) +{ + return thread[tid].executeLoad(lq_idx); +} + +template +Fault +LSQ::executeStore(DynInstPtr &inst) +{ + unsigned tid = inst->threadNumber; + + return thread[tid].executeStore(inst); +} + +template +void +LSQ::commitLoads(InstSeqNum &youngest_inst,unsigned tid) +{ + thread[tid].commitLoads(youngest_inst); +} + +template +void +LSQ::commitStores(InstSeqNum &youngest_inst,unsigned tid) +{ + thread[tid].commitStores(youngest_inst); +} + +template +void +LSQ::writebackStores() +{ + list::iterator active_threads = (*activeThreads).begin(); + + while (active_threads != (*activeThreads).end()) { + unsigned tid = 
*active_threads++; + + if (numStoresToWB(tid) > 0) { + DPRINTF(Writeback,"[tid:%i] Writing back stores. %i stores available" + " for Writeback.\n", tid, numStoresToWB(tid)); + } + + thread[tid].writebackStores(); + } +} + +template +int +LSQ::numStoresToWB(unsigned tid) +{ + return thread[tid].numStoresToWB(); +} + +template +void +LSQ::squash(const InstSeqNum &squashed_num, unsigned tid) +{ + thread[tid].squash(squashed_num); +} + +template +bool +LSQ::violation() +{ + /* Answers: Does Anybody Have a Violation?*/ + list::iterator active_threads = (*activeThreads).begin(); + + while (active_threads != (*activeThreads).end()) { + unsigned tid = *active_threads++; + if (thread[tid].violation()) + return true; + } + + return false; +} + +template +bool +LSQ::violation(unsigned tid) +{ + return thread[tid].violation(); +} + +template +bool +LSQ::loadBlocked(unsigned tid) +{ + return thread[tid].loadBlocked(); +} + +template +typename Impl::DynInstPtr +LSQ::getMemDepViolator(unsigned tid) +{ + return thread[tid].getMemDepViolator(); +} + +template +int +LSQ::getLoadHead(unsigned tid) +{ + return thread[tid].getLoadHead(); +} + +template +int +LSQ::getStoreHead(unsigned tid) +{ + return thread[tid].getStoreHead(); +} + +template +int +LSQ::getCount() +{ + unsigned total = 0; + + list::iterator active_threads = (*activeThreads).begin(); + + while (active_threads != (*activeThreads).end()) { + unsigned tid = *active_threads++; + total += getCount(tid); + } + + return total; +} + +template +int +LSQ::getCount(unsigned tid) +{ + return thread[tid].getCount(); +} + +template +int +LSQ::numLoads() +{ + unsigned total = 0; + + list::iterator active_threads = (*activeThreads).begin(); + + while (active_threads != (*activeThreads).end()) { + unsigned tid = *active_threads++; + total += numLoads(tid); + } + + return total; +} + +template +int +LSQ::numLoads(unsigned tid) +{ + return thread[tid].numLoads(); +} + +template +int +LSQ::numStores() +{ + unsigned total = 0; + + 
list::iterator active_threads = (*activeThreads).begin(); + + while (active_threads != (*activeThreads).end()) { + unsigned tid = *active_threads++; + total += thread[tid].numStores(); + } + + return total; +} + +template +int +LSQ::numStores(unsigned tid) +{ + return thread[tid].numStores(); +} + +template +int +LSQ::numLoadsReady() +{ + unsigned total = 0; + + list::iterator active_threads = (*activeThreads).begin(); + + while (active_threads != (*activeThreads).end()) { + unsigned tid = *active_threads++; + total += thread[tid].numLoadsReady(); + } + + return total; +} + +template +int +LSQ::numLoadsReady(unsigned tid) +{ + return thread[tid].numLoadsReady(); +} + +template +unsigned +LSQ::numFreeEntries() +{ + unsigned total = 0; + + list::iterator active_threads = (*activeThreads).begin(); + + while (active_threads != (*activeThreads).end()) { + unsigned tid = *active_threads++; + total += thread[tid].numFreeEntries(); + } + + return total; +} + +template +unsigned +LSQ::numFreeEntries(unsigned tid) +{ + //if( lsqPolicy == Dynamic ) + //return numFreeEntries(); + //else + return thread[tid].numFreeEntries(); +} + +template +bool +LSQ::isFull() +{ + list::iterator active_threads = (*activeThreads).begin(); + + while (active_threads != (*activeThreads).end()) { + unsigned tid = *active_threads++; + if (! 
(thread[tid].lqFull() || thread[tid].sqFull()) ) + return false; + } + + return true; +} + +template +bool +LSQ::isFull(unsigned tid) +{ + //@todo: Change to Calculate All Entries for + //Dynamic Policy + if( lsqPolicy == Dynamic ) + return isFull(); + else + return thread[tid].lqFull() || thread[tid].sqFull(); +} + +template +bool +LSQ::lqFull() +{ + list::iterator active_threads = (*activeThreads).begin(); + + while (active_threads != (*activeThreads).end()) { + unsigned tid = *active_threads++; + if (!thread[tid].lqFull()) + return false; + } + + return true; +} + +template +bool +LSQ::lqFull(unsigned tid) +{ + //@todo: Change to Calculate All Entries for + //Dynamic Policy + if( lsqPolicy == Dynamic ) + return lqFull(); + else + return thread[tid].lqFull(); +} + +template +bool +LSQ::sqFull() +{ + list::iterator active_threads = (*activeThreads).begin(); + + while (active_threads != (*activeThreads).end()) { + unsigned tid = *active_threads++; + if (!sqFull(tid)) + return false; + } + + return true; +} + +template +bool +LSQ::sqFull(unsigned tid) +{ + //@todo: Change to Calculate All Entries for + //Dynamic Policy + if( lsqPolicy == Dynamic ) + return sqFull(); + else + return thread[tid].sqFull(); +} + +template +bool +LSQ::isStalled() +{ + list::iterator active_threads = (*activeThreads).begin(); + + while (active_threads != (*activeThreads).end()) { + unsigned tid = *active_threads++; + if (!thread[tid].isStalled()) + return false; + } + + return true; +} + +template +bool +LSQ::isStalled(unsigned tid) +{ + if( lsqPolicy == Dynamic ) + return isStalled(); + else + return thread[tid].isStalled(); +} + +template +bool +LSQ::hasStoresToWB() +{ + list::iterator active_threads = (*activeThreads).begin(); + + while (active_threads != (*activeThreads).end()) { + unsigned tid = *active_threads++; + if (!hasStoresToWB(tid)) + return false; + } + + return true; +} + + +template +bool +LSQ::hasStoresToWB(unsigned tid) +{ + return thread[tid].hasStoresToWB(); +} + 
+template +bool +LSQ::willWB() +{ + list::iterator active_threads = (*activeThreads).begin(); + + while (active_threads != (*activeThreads).end()) { + unsigned tid = *active_threads++; + if (!willWB(tid)) + return false; + } + + return true; +} + +template +bool +LSQ::willWB(unsigned tid) +{ + return thread[tid].willWB(); +} + +template +void +LSQ::dumpInsts() +{ + list::iterator active_threads = (*activeThreads).begin(); + + while (active_threads != (*activeThreads).end()) { + unsigned tid = *active_threads++; + thread[tid].dumpInsts(); + } +} + +template +void +LSQ::dumpInsts(unsigned tid) +{ + thread[tid].dumpInsts(); +} diff --git a/cpu/o3/lsq_unit.cc b/cpu/o3/lsq_unit.cc new file mode 100644 index 000000000..dd29007bc --- /dev/null +++ b/cpu/o3/lsq_unit.cc @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2004-2005 The Regents of The University of Michigan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "cpu/o3/alpha_dyn_inst.hh" +#include "cpu/o3/alpha_cpu.hh" +#include "cpu/o3/alpha_impl.hh" +#include "cpu/o3/lsq_unit_impl.hh" + +// Force the instantiation of LDSTQ for all the implementations we care about. +template class LSQUnit; + diff --git a/cpu/o3/lsq_unit.hh b/cpu/o3/lsq_unit.hh new file mode 100644 index 000000000..73c485ce9 --- /dev/null +++ b/cpu/o3/lsq_unit.hh @@ -0,0 +1,703 @@ +/* + * Copyright (c) 2004-2005 The Regents of The University of Michigan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __CPU_O3_LSQ_UNIT_HH__ +#define __CPU_O3_LSQ_UNIT_HH__ + +#include +#include +#include + +#include "config/full_system.hh" +#include "base/hashmap.hh" +#include "cpu/inst_seq.hh" +#include "mem/mem_interface.hh" +//#include "mem/page_table.hh" +#include "sim/sim_object.hh" +#include "arch/faults.hh" + +/** + * Class that implements the actual LQ and SQ for each specific thread. + * Both are circular queues; load entries are freed upon committing, while + * store entries are freed once they writeback. The LSQUnit tracks if there + * are memory ordering violations, and also detects partial load to store + * forwarding cases (a store only has part of a load's data) that requires + * the load to wait until the store writes back. In the former case it + * holds onto the instruction until the dependence unit looks at it, and + * in the latter it stalls the LSQ until the store writes back. At that + * point the load is replayed. 
+ */ +template +class LSQUnit { + protected: + typedef TheISA::IntReg IntReg; + public: + typedef typename Impl::Params Params; + typedef typename Impl::FullCPU FullCPU; + typedef typename Impl::DynInstPtr DynInstPtr; + typedef typename Impl::CPUPol::IEW IEW; + typedef typename Impl::CPUPol::IssueStruct IssueStruct; + + private: + class StoreCompletionEvent : public Event { + public: + /** Constructs a store completion event. */ + StoreCompletionEvent(int store_idx, Event *wb_event, LSQUnit *lsq_ptr); + + /** Processes the store completion event. */ + void process(); + + /** Returns the description of this event. */ + const char *description(); + + private: + /** The store index of the store being written back. */ + int storeIdx; + /** The writeback event for the store. Needed for store + * conditionals. + */ + Event *wbEvent; + /** The pointer to the LSQ unit that issued the store. */ + LSQUnit *lsqPtr; + }; + + friend class StoreCompletionEvent; + + public: + /** Constructs an LSQ unit. init() must be called prior to use. */ + LSQUnit(); + + /** Initializes the LSQ unit with the specified number of entries. */ + void init(Params *params, unsigned maxLQEntries, + unsigned maxSQEntries, unsigned id); + + /** Returns the name of the LSQ unit. */ + std::string name() const; + + /** Sets the CPU pointer. */ + void setCPU(FullCPU *cpu_ptr) + { cpu = cpu_ptr; } + + /** Sets the IEW stage pointer. */ + void setIEW(IEW *iew_ptr) + { iewStage = iew_ptr; } + + /** Sets the page table pointer. */ +// void setPageTable(PageTable *pt_ptr); + + /** Ticks the LSQ unit, which in this case only resets the number of + * used cache ports. + * @todo: Move the number of used ports up to the LSQ level so it can + * be shared by all LSQ units. + */ + void tick() { usedPorts = 0; } + + /** Inserts an instruction. */ + void insert(DynInstPtr &inst); + /** Inserts a load instruction. */ + void insertLoad(DynInstPtr &load_inst); + /** Inserts a store instruction. 
*/ + void insertStore(DynInstPtr &store_inst); + + /** Executes a load instruction. */ + Fault executeLoad(DynInstPtr &inst); + + Fault executeLoad(int lq_idx); + /** Executes a store instruction. */ + Fault executeStore(DynInstPtr &inst); + + /** Commits the head load. */ + void commitLoad(); + /** Commits a specific load, given by the sequence number. */ + void commitLoad(InstSeqNum &inst); + /** Commits loads older than a specific sequence number. */ + void commitLoads(InstSeqNum &youngest_inst); + + /** Commits stores older than a specific sequence number. */ + void commitStores(InstSeqNum &youngest_inst); + + /** Writes back stores. */ + void writebackStores(); + + // @todo: Include stats in the LSQ unit. + //void regStats(); + + /** Clears all the entries in the LQ. */ + void clearLQ(); + + /** Clears all the entries in the SQ. */ + void clearSQ(); + + /** Resizes the LQ to a given size. */ + void resizeLQ(unsigned size); + + /** Resizes the SQ to a given size. */ + void resizeSQ(unsigned size); + + /** Squashes all instructions younger than a specific sequence number. */ + void squash(const InstSeqNum &squashed_num); + + /** Returns if there is a memory ordering violation. Value is reset upon + * call to getMemDepViolator(). + */ + bool violation() { return memDepViolator; } + + /** Returns the memory ordering violator. */ + DynInstPtr getMemDepViolator(); + + /** Returns if a load became blocked due to the memory system. It clears + * the bool's value upon this being called. + */ + bool loadBlocked() + { return isLoadBlocked; } + + void clearLoadBlocked() + { isLoadBlocked = false; } + + bool isLoadBlockedHandled() + { return loadBlockedHandled; } + + void setLoadBlockedHandled() + { loadBlockedHandled = true; } + + /** Returns the number of free entries (min of free LQ and SQ entries). */ + unsigned numFreeEntries(); + + /** Returns the number of loads ready to execute. */ + int numLoadsReady(); + + /** Returns the number of loads in the LQ. 
*/ + int numLoads() { return loads; } + + /** Returns the number of stores in the SQ. */ + int numStores() { return stores; } + + /** Returns if either the LQ or SQ is full. */ + bool isFull() { return lqFull() || sqFull(); } + + /** Returns if the LQ is full. */ + bool lqFull() { return loads >= (LQEntries - 1); } + + /** Returns if the SQ is full. */ + bool sqFull() { return stores >= (SQEntries - 1); } + + /** Debugging function to dump instructions in the LSQ. */ + void dumpInsts(); + + /** Returns the number of instructions in the LSQ. */ + unsigned getCount() { return loads + stores; } + + /** Returns if there are any stores to writeback. */ + bool hasStoresToWB() { return storesToWB; } + + /** Returns the number of stores to writeback. */ + int numStoresToWB() { return storesToWB; } + + /** Returns if the LSQ unit will writeback on this cycle. */ + bool willWB() { return storeQueue[storeWBIdx].canWB && + !storeQueue[storeWBIdx].completed && + !dcacheInterface->isBlocked(); } + + private: + /** Completes the store at the specified index. */ + void completeStore(int store_idx); + + /** Increments the given store index (circular queue). */ + inline void incrStIdx(int &store_idx); + /** Decrements the given store index (circular queue). */ + inline void decrStIdx(int &store_idx); + /** Increments the given load index (circular queue). */ + inline void incrLdIdx(int &load_idx); + /** Decrements the given load index (circular queue). */ + inline void decrLdIdx(int &load_idx); + + private: + /** Pointer to the CPU. */ + FullCPU *cpu; + + /** Pointer to the IEW stage. */ + IEW *iewStage; + + /** Pointer to the D-cache. */ + MemInterface *dcacheInterface; + + /** Pointer to the page table. */ +// PageTable *pTable; + + public: + struct SQEntry { + /** Constructs an empty store queue entry. */ + SQEntry() + : inst(NULL), req(NULL), size(0), data(0), + canWB(0), committed(0), completed(0) + { } + + /** Constructs a store queue entry for a given instruction. 
*/ + SQEntry(DynInstPtr &_inst) + : inst(_inst), req(NULL), size(0), data(0), + canWB(0), committed(0), completed(0) + { } + + /** The store instruction. */ + DynInstPtr inst; + /** The memory request for the store. */ + MemReqPtr req; + /** The size of the store. */ + int size; + /** The store data. */ + IntReg data; + /** Whether or not the store can writeback. */ + bool canWB; + /** Whether or not the store is committed. */ + bool committed; + /** Whether or not the store is completed. */ + bool completed; + }; + + enum Status { + Running, + Idle, + DcacheMissStall, + DcacheMissSwitch + }; + + private: + /** The LSQUnit thread id. */ + unsigned lsqID; + + /** The status of the LSQ unit. */ + Status _status; + + /** The store queue. */ + std::vector storeQueue; + + /** The load queue. */ + std::vector loadQueue; + + // Consider making these 16 bits + /** The number of LQ entries. */ + unsigned LQEntries; + /** The number of SQ entries. */ + unsigned SQEntries; + + /** The number of load instructions in the LQ. */ + int loads; + /** The number of store instructions in the SQ (excludes those waiting to + * writeback). + */ + int stores; + /** The number of store instructions in the SQ waiting to writeback. */ + int storesToWB; + + /** The index of the head instruction in the LQ. */ + int loadHead; + /** The index of the tail instruction in the LQ. */ + int loadTail; + + /** The index of the head instruction in the SQ. */ + int storeHead; + /** The index of the first instruction that is ready to be written back, + * and has not yet been written back. + */ + int storeWBIdx; + /** The index of the tail instruction in the SQ. */ + int storeTail; + + /// @todo Consider moving to a more advanced model with write vs read ports + /** The number of cache ports available each cycle. */ + int cachePorts; + + /** The number of used cache ports in this cycle. 
*/ + int usedPorts; + + //list mshrSeqNums; + + //Stats::Scalar<> dcacheStallCycles; + Counter lastDcacheStall; + + /** Wire to read information from the issue stage time queue. */ + typename TimeBuffer::wire fromIssue; + + // Make these per thread? + /** Whether or not the LSQ is stalled. */ + bool stalled; + /** The store that causes the stall due to partial store to load + * forwarding. + */ + InstSeqNum stallingStoreIsn; + /** The index of the above store. */ + int stallingLoadIdx; + + /** Whether or not a load is blocked due to the memory system. It is + * cleared when this value is checked via loadBlocked(). + */ + bool isLoadBlocked; + + bool loadBlockedHandled; + + InstSeqNum blockedLoadSeqNum; + + /** The oldest faulting load instruction. */ + DynInstPtr loadFaultInst; + /** The oldest faulting store instruction. */ + DynInstPtr storeFaultInst; + + /** The oldest load that caused a memory ordering violation. */ + DynInstPtr memDepViolator; + + // Will also need how many read/write ports the Dcache has. Or keep track + // of that in stage that is one level up, and only call executeLoad/Store + // the appropriate number of times. + + public: + /** Executes the load at the given index. */ + template + Fault read(MemReqPtr &req, T &data, int load_idx); + + /** Executes the store at the given index. */ + template + Fault write(MemReqPtr &req, T &data, int store_idx); + + /** Returns the index of the head load instruction. */ + int getLoadHead() { return loadHead; } + /** Returns the sequence number of the head load instruction. */ + InstSeqNum getLoadHeadSeqNum() + { + if (loadQueue[loadHead]) { + return loadQueue[loadHead]->seqNum; + } else { + return 0; + } + + } + + /** Returns the index of the head store instruction. */ + int getStoreHead() { return storeHead; } + /** Returns the sequence number of the head store instruction. 
*/ + InstSeqNum getStoreHeadSeqNum() + { + if (storeQueue[storeHead].inst) { + return storeQueue[storeHead].inst->seqNum; + } else { + return 0; + } + + } + + /** Returns whether or not the LSQ unit is stalled. */ + bool isStalled() { return stalled; } +}; + +template +template +Fault +LSQUnit::read(MemReqPtr &req, T &data, int load_idx) +{ + //Depending on issue2execute delay a squashed load could + //execute if it is found to be squashed in the same + //cycle it is scheduled to execute + assert(loadQueue[load_idx]); + + if (loadQueue[load_idx]->isExecuted()) { + panic("Should not reach this point with split ops!"); + memcpy(&data,req->data,req->size); + + return NoFault; + } + + // Make sure this isn't an uncacheable access + // A bit of a hackish way to get uncached accesses to work only if they're + // at the head of the LSQ and are ready to commit (at the head of the ROB + // too). + // @todo: Fix uncached accesses. + if (req->flags & UNCACHEABLE && + (load_idx != loadHead || !loadQueue[load_idx]->reachedCommit)) { + iewStage->rescheduleMemInst(loadQueue[load_idx]); + return TheISA::genMachineCheckFault(); + } + + // Check the SQ for any previous stores that might lead to forwarding + int store_idx = loadQueue[load_idx]->sqIdx; + + int store_size = 0; + + DPRINTF(LSQUnit, "Read called, load idx: %i, store idx: %i, " + "storeHead: %i addr: %#x\n", + load_idx, store_idx, storeHead, req->paddr); + +#ifdef FULL_SYSTEM + if (req->flags & LOCKED) { + cpu->lockAddr = req->paddr; + cpu->lockFlag = true; + } +#endif + + while (store_idx != -1) { + // End once we've reached the top of the LSQ + if (store_idx == storeWBIdx) { + break; + } + + // Move the index to one younger + if (--store_idx < 0) + store_idx += SQEntries; + + assert(storeQueue[store_idx].inst); + + store_size = storeQueue[store_idx].size; + + if (store_size == 0) + continue; + + // Check if the store data is within the lower and upper bounds of + // addresses that the request needs. 
+ bool store_has_lower_limit = + req->vaddr >= storeQueue[store_idx].inst->effAddr; + bool store_has_upper_limit = + (req->vaddr + req->size) <= (storeQueue[store_idx].inst->effAddr + + store_size); + bool lower_load_has_store_part = + req->vaddr < (storeQueue[store_idx].inst->effAddr + + store_size); + bool upper_load_has_store_part = + (req->vaddr + req->size) > storeQueue[store_idx].inst->effAddr; + + // If the store's data has all of the data needed, we can forward. + if (store_has_lower_limit && store_has_upper_limit) { + + int shift_amt = req->vaddr & (store_size - 1); + // Assumes byte addressing + shift_amt = shift_amt << 3; + + // Cast this to type T? + data = storeQueue[store_idx].data >> shift_amt; + + req->cmd = Read; + assert(!req->completionEvent); + req->completionEvent = NULL; + req->time = curTick; + assert(!req->data); + req->data = new uint8_t[64]; + + memcpy(req->data, &data, req->size); + + DPRINTF(LSQUnit, "Forwarding from store idx %i to load to " + "addr %#x, data %#x\n", + store_idx, req->vaddr, *(req->data)); + + typename IEW::LdWritebackEvent *wb = + new typename IEW::LdWritebackEvent(loadQueue[load_idx], + iewStage); + + // We'll say this has a 1 cycle load-store forwarding latency + // for now. + // @todo: Need to make this a parameter. + wb->schedule(curTick); + + // Should keep track of stat for forwarded data + return NoFault; + } else if ((store_has_lower_limit && lower_load_has_store_part) || + (store_has_upper_limit && upper_load_has_store_part) || + (lower_load_has_store_part && upper_load_has_store_part)) { + // This is the partial store-load forwarding case where a store + // has only part of the load's data. + + // If it's already been written back, then don't worry about + // stalling on it. + if (storeQueue[store_idx].completed) { + continue; + } + + // Must stall load and force it to retry, so long as it's the oldest + // load that needs to do so. 
+ if (!stalled || + (stalled && + loadQueue[load_idx]->seqNum < + loadQueue[stallingLoadIdx]->seqNum)) { + stalled = true; + stallingStoreIsn = storeQueue[store_idx].inst->seqNum; + stallingLoadIdx = load_idx; + } + + // Tell IQ/mem dep unit that this instruction will need to be + // rescheduled eventually + iewStage->rescheduleMemInst(loadQueue[load_idx]); + + // Do not generate a writeback event as this instruction is not + // complete. + + DPRINTF(LSQUnit, "Load-store forwarding mis-match. " + "Store idx %i to load addr %#x\n", + store_idx, req->vaddr); + + return NoFault; + } + } + + + // If there's no forwarding case, then go access memory + DynInstPtr inst = loadQueue[load_idx]; + + DPRINTF(LSQUnit, "Doing functional access for inst PC %#x\n", + loadQueue[load_idx]->readPC()); + assert(!req->data); + req->data = new uint8_t[64]; + Fault fault = cpu->read(req, data); + memcpy(req->data, &data, sizeof(T)); + + ++usedPorts; + + // if we have a cache, do cache access too + if (fault == NoFault && dcacheInterface) { + if (dcacheInterface->isBlocked()) { + // There's an older load that's already going to squash. + if (isLoadBlocked && blockedLoadSeqNum < inst->seqNum) + return NoFault; + + isLoadBlocked = true; + loadBlockedHandled = false; + blockedLoadSeqNum = inst->seqNum; + // No fault occurred, even though the interface is blocked. + return NoFault; + } + DPRINTF(LSQUnit, "Doing timing access for inst PC %#x\n", + loadQueue[load_idx]->readPC()); + req->cmd = Read; + req->completionEvent = NULL; + req->time = curTick; + + assert(!req->completionEvent); + req->completionEvent = + new typename IEW::LdWritebackEvent(loadQueue[load_idx], iewStage); + MemAccessResult result = dcacheInterface->access(req); + + assert(dcacheInterface->doEvents()); + + // Ugly hack to get an event scheduled *only* if the access is + // a miss. We really should add first-class support for this + // at some point. 
+ if (result != MA_HIT) { + DPRINTF(LSQUnit, "LSQUnit: D-cache miss!\n"); + DPRINTF(Activity, "Activity: ld accessing mem miss [sn:%lli]\n", + inst->seqNum); + + lastDcacheStall = curTick; + + _status = DcacheMissStall; + + } else { + DPRINTF(Activity, "Activity: ld accessing mem hit [sn:%lli]\n", + inst->seqNum); + + DPRINTF(LSQUnit, "LSQUnit: D-cache hit!\n"); + } + } +#if 0 + // if we have a cache, do cache access too + if (dcacheInterface) { + if (dcacheInterface->isBlocked()) { + isLoadBlocked = true; + // No fault occurred, even though the interface is blocked. + return NoFault; + } + + DPRINTF(LSQUnit, "LSQUnit: D-cache: PC:%#x reading from paddr:%#x " + "vaddr:%#x flags:%i\n", + inst->readPC(), req->paddr, req->vaddr, req->flags); + + // Setup MemReq pointer + req->cmd = Read; + req->completionEvent = NULL; + req->time = curTick; + assert(!req->data); + req->data = new uint8_t[64]; + + assert(!req->completionEvent); + req->completionEvent = + new typename IEW::LdWritebackEvent(loadQueue[load_idx], iewStage); + + // Do Cache Access + MemAccessResult result = dcacheInterface->access(req); + + // Ugly hack to get an event scheduled *only* if the access is + // a miss. We really should add first-class support for this + // at some point. 
+ // @todo: Probably should support having no events + if (result != MA_HIT) { + DPRINTF(LSQUnit, "LSQUnit: D-cache miss!\n"); + DPRINTF(Activity, "Activity: ld accessing mem miss [sn:%lli]\n", + inst->seqNum); + + lastDcacheStall = curTick; + + _status = DcacheMissStall; + + } else { + DPRINTF(Activity, "Activity: ld accessing mem hit [sn:%lli]\n", + inst->seqNum); + + DPRINTF(LSQUnit, "LSQUnit: D-cache hit!\n"); + } + } else { + fatal("Must use D-cache with new memory system"); + } +#endif + + return fault; +} + +template +template +Fault +LSQUnit::write(MemReqPtr &req, T &data, int store_idx) +{ + assert(storeQueue[store_idx].inst); + + DPRINTF(LSQUnit, "Doing write to store idx %i, addr %#x data %#x" + " | storeHead:%i [sn:%i]\n", + store_idx, req->paddr, data, storeHead, + storeQueue[store_idx].inst->seqNum); +/* + if (req->flags & LOCKED) { + if (req->flags & UNCACHEABLE) { + req->result = 2; + } else { + req->result = 1; + } + } +*/ + storeQueue[store_idx].req = req; + storeQueue[store_idx].size = sizeof(T); + storeQueue[store_idx].data = data; + + // This function only writes the data to the store queue, so no fault + // can happen here. + return NoFault; +} + +#endif // __CPU_O3_LSQ_UNIT_HH__ diff --git a/cpu/o3/lsq_unit_impl.hh b/cpu/o3/lsq_unit_impl.hh new file mode 100644 index 000000000..d9a118b0e --- /dev/null +++ b/cpu/o3/lsq_unit_impl.hh @@ -0,0 +1,893 @@ +/* + * Copyright (c) 2004-2005 The Regents of The University of Michigan + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "cpu/o3/lsq_unit.hh" +#include "base/str.hh" + +template +LSQUnit::StoreCompletionEvent::StoreCompletionEvent(int store_idx, + Event *wb_event, + LSQUnit *lsq_ptr) + : Event(&mainEventQueue), + storeIdx(store_idx), + wbEvent(wb_event), + lsqPtr(lsq_ptr) +{ + this->setFlags(Event::AutoDelete); +} + +template +void +LSQUnit::StoreCompletionEvent::process() +{ + DPRINTF(LSQ, "Cache miss complete for store idx:%i\n", storeIdx); + DPRINTF(Activity, "Activity: st writeback event idx:%i\n", storeIdx); + + //lsqPtr->removeMSHR(lsqPtr->storeQueue[storeIdx].inst->seqNum); + + lsqPtr->cpu->wakeCPU(); + if (wbEvent) + wbEvent->process(); + lsqPtr->completeStore(storeIdx); +} + +template +const char * +LSQUnit::StoreCompletionEvent::description() +{ + return "LSQ store completion event"; +} + +template +LSQUnit::LSQUnit() + : loads(0), stores(0), storesToWB(0), stalled(false), isLoadBlocked(false), + loadBlockedHandled(false) +{ +} + +template +void +LSQUnit::init(Params *params, unsigned maxLQEntries, + unsigned maxSQEntries, unsigned id) + +{ + DPRINTF(LSQUnit, "Creating LSQUnit%i object.\n",id); + + lsqID = id; + + LQEntries = maxLQEntries; + SQEntries = maxSQEntries; + + loadQueue.resize(LQEntries); + storeQueue.resize(SQEntries); + + + // May want to initialize these entries to NULL + + loadHead = loadTail = 0; + + storeHead = storeWBIdx = storeTail = 0; + + usedPorts = 0; + cachePorts = params->cachePorts; + + dcacheInterface = params->dcacheInterface; + + loadFaultInst = storeFaultInst = memDepViolator = NULL; + + blockedLoadSeqNum = 0; +} + +template +std::string +LSQUnit::name() const +{ + if (Impl::MaxThreads == 1) { + return iewStage->name() + ".lsq"; + } else { + return iewStage->name() + ".lsq.thread." 
+ to_string(lsqID); + } +} + +template +void +LSQUnit::clearLQ() +{ + loadQueue.clear(); +} + +template +void +LSQUnit::clearSQ() +{ + storeQueue.clear(); +} + +#if 0 +template +void +LSQUnit::setPageTable(PageTable *pt_ptr) +{ + DPRINTF(LSQUnit, "Setting the page table pointer.\n"); + pTable = pt_ptr; +} +#endif + +template +void +LSQUnit::resizeLQ(unsigned size) +{ + assert( size >= LQEntries); + + if (size > LQEntries) { + while (size > loadQueue.size()) { + DynInstPtr dummy; + loadQueue.push_back(dummy); + LQEntries++; + } + } else { + LQEntries = size; + } + +} + +template +void +LSQUnit::resizeSQ(unsigned size) +{ + if (size > SQEntries) { + while (size > storeQueue.size()) { + SQEntry dummy; + storeQueue.push_back(dummy); + SQEntries++; + } + } else { + SQEntries = size; + } +} + +template +void +LSQUnit::insert(DynInstPtr &inst) +{ + // Make sure we really have a memory reference. + assert(inst->isMemRef()); + + // Make sure it's one of the two classes of memory references. + assert(inst->isLoad() || inst->isStore()); + + if (inst->isLoad()) { + insertLoad(inst); + } else { + insertStore(inst); + } + + inst->setInLSQ(); +} + +template +void +LSQUnit::insertLoad(DynInstPtr &load_inst) +{ + assert((loadTail + 1) % LQEntries != loadHead && loads < LQEntries); + + DPRINTF(LSQUnit, "Inserting load PC %#x, idx:%i [sn:%lli]\n", + load_inst->readPC(), loadTail, load_inst->seqNum); + + load_inst->lqIdx = loadTail; + + if (stores == 0) { + load_inst->sqIdx = -1; + } else { + load_inst->sqIdx = storeTail; + } + + loadQueue[loadTail] = load_inst; + + incrLdIdx(loadTail); + + ++loads; +} + +template +void +LSQUnit::insertStore(DynInstPtr &store_inst) +{ + // Make sure it is not full before inserting an instruction. 
+ assert((storeTail + 1) % SQEntries != storeHead); + assert(stores < SQEntries); + + DPRINTF(LSQUnit, "Inserting store PC %#x, idx:%i [sn:%lli]\n", + store_inst->readPC(), storeTail, store_inst->seqNum); + + store_inst->sqIdx = storeTail; + store_inst->lqIdx = loadTail; + + storeQueue[storeTail] = SQEntry(store_inst); + + incrStIdx(storeTail); + + ++stores; + +} + +template +typename Impl::DynInstPtr +LSQUnit::getMemDepViolator() +{ + DynInstPtr temp = memDepViolator; + + memDepViolator = NULL; + + return temp; +} + +template +unsigned +LSQUnit::numFreeEntries() +{ + unsigned free_lq_entries = LQEntries - loads; + unsigned free_sq_entries = SQEntries - stores; + + // Both the LQ and SQ entries have an extra dummy entry to differentiate + // empty/full conditions. Subtract 1 from the free entries. + if (free_lq_entries < free_sq_entries) { + return free_lq_entries - 1; + } else { + return free_sq_entries - 1; + } +} + +template +int +LSQUnit::numLoadsReady() +{ + int load_idx = loadHead; + int retval = 0; + + while (load_idx != loadTail) { + assert(loadQueue[load_idx]); + + if (loadQueue[load_idx]->readyToIssue()) { + ++retval; + } + } + + return retval; +} + +#if 0 +template +Fault +LSQUnit::executeLoad() +{ + Fault load_fault = NoFault; + DynInstPtr load_inst; + + assert(readyLoads.size() != 0); + + // Execute a ready load. + LdMapIt ready_it = readyLoads.begin(); + + load_inst = (*ready_it).second; + + // Execute the instruction, which is held in the data portion of the + // iterator. + load_fault = load_inst->execute(); + + // If it executed successfully, then switch it over to the executed + // loads list. + if (load_fault == NoFault) { + executedLoads[load_inst->seqNum] = load_inst; + + readyLoads.erase(ready_it); + } else { + loadFaultInst = load_inst; + } + + return load_fault; +} +#endif + +template +Fault +LSQUnit::executeLoad(DynInstPtr &inst) +{ + // Execute a specific load. 
+ Fault load_fault = NoFault; + + DPRINTF(LSQUnit, "Executing load PC %#x, [sn:%lli]\n", + inst->readPC(),inst->seqNum); + + // Make sure it's really in the list. + // Normally it should always be in the list. However, + /* due to a syscall it may not be the list. +#ifdef DEBUG + int i = loadHead; + while (1) { + if (i == loadTail && !find(inst)) { + assert(0 && "Load not in the queue!"); + } else if (loadQueue[i] == inst) { + break; + } + + i = i + 1; + if (i >= LQEntries) { + i = 0; + } + } +#endif // DEBUG*/ + +// load_fault = inst->initiateAcc(); + load_fault = inst->execute(); + + // If the instruction faulted, then we need to send it along to commit + // without the instruction completing. + if (load_fault != NoFault) { + // Maybe just set it as can commit here, although that might cause + // some other problems with sending traps to the ROB too quickly. + iewStage->instToCommit(inst); + iewStage->activityThisCycle(); + } + + return load_fault; +} + +template +Fault +LSQUnit::executeLoad(int lq_idx) +{ + // Very hackish. Not sure the best way to check that this + // instruction is at the head of the ROB. I should have some sort + // of extra information here so that I'm not overloading the + // canCommit signal for 15 different things. + loadQueue[lq_idx]->setCanCommit(); + Fault ret_fault = executeLoad(loadQueue[lq_idx]); + loadQueue[lq_idx]->clearCanCommit(); + return ret_fault; +} + +template +Fault +LSQUnit::executeStore(DynInstPtr &store_inst) +{ + using namespace TheISA; + // Make sure that a store exists. + assert(stores != 0); + + int store_idx = store_inst->sqIdx; + + DPRINTF(LSQUnit, "Executing store PC %#x [sn:%lli]\n", + store_inst->readPC(), store_inst->seqNum); + + // Check the recently completed loads to see if any match this store's + // address. If so, then we have a memory ordering violation. 
+ int load_idx = store_inst->lqIdx; + + Fault store_fault = store_inst->initiateAcc(); +// Fault store_fault = store_inst->execute(); + + // Store size should now be available. Use it to get proper offset for + // addr comparisons. + int size = storeQueue[store_idx].size; + + if (size == 0) { + DPRINTF(LSQUnit,"Fault on Store PC %#x, [sn:%lli],Size = 0\n", + store_inst->readPC(),store_inst->seqNum); + + return store_fault; + } + + assert(store_fault == NoFault); + + if (!storeFaultInst) { + if (store_fault != NoFault) { + panic("Fault in a store instruction!"); + storeFaultInst = store_inst; + } else if (store_inst->isNonSpeculative()) { + // Nonspeculative accesses (namely store conditionals) + // need to set themselves as able to writeback if we + // haven't had a fault by here. + storeQueue[store_idx].canWB = true; + + ++storesToWB; + } + } + + if (!memDepViolator) { + while (load_idx != loadTail) { + // Actually should only check loads that have actually executed + // Might be safe because effAddr is set to InvalAddr when the + // dyn inst is created. + + // Must actually check all addrs in the proper size range + // Which is more correct than needs to be. What if for now we just + // assume all loads are quad-word loads, and do the addr based + // on that. + // @todo: Fix this, magic number being used here + if ((loadQueue[load_idx]->effAddr >> 8) == + (store_inst->effAddr >> 8)) { + // A load incorrectly passed this store. Squash and refetch. + // For now return a fault to show that it was unsuccessful. + memDepViolator = loadQueue[load_idx]; + + return genMachineCheckFault(); + } + + incrLdIdx(load_idx); + } + + // If we've reached this point, there was no violation. 
+ memDepViolator = NULL; + } + + return store_fault; +} + +template +void +LSQUnit::commitLoad() +{ + assert(loadQueue[loadHead]); + + DPRINTF(LSQUnit, "Committing head load instruction, PC %#x\n", + loadQueue[loadHead]->readPC()); + + + loadQueue[loadHead] = NULL; + + incrLdIdx(loadHead); + + --loads; +} + +template +void +LSQUnit::commitLoad(InstSeqNum &inst) +{ + // Hopefully I don't use this function too much + panic("Don't use this function!"); + + int i = loadHead; + while (1) { + if (i == loadTail) { + assert(0 && "Load not in the queue!"); + } else if (loadQueue[i]->seqNum == inst) { + break; + } + + ++i; + if (i >= LQEntries) { + i = 0; + } + } + + loadQueue[i]->removeInLSQ(); + loadQueue[i] = NULL; + --loads; +} + +template +void +LSQUnit::commitLoads(InstSeqNum &youngest_inst) +{ + assert(loads == 0 || loadQueue[loadHead]); + + while (loads != 0 && loadQueue[loadHead]->seqNum <= youngest_inst) { + commitLoad(); + } +} + +template +void +LSQUnit::commitStores(InstSeqNum &youngest_inst) +{ + assert(stores == 0 || storeQueue[storeHead].inst); + + int store_idx = storeHead; + + while (store_idx != storeTail) { + assert(storeQueue[store_idx].inst); + if (!storeQueue[store_idx].canWB) { + if (storeQueue[store_idx].inst->seqNum > youngest_inst) { + break; + } + DPRINTF(LSQUnit, "Marking store as able to write back, PC " + "%#x [sn:%lli]\n", + storeQueue[store_idx].inst->readPC(), + storeQueue[store_idx].inst->seqNum); + + storeQueue[store_idx].canWB = true; + +// --stores; + ++storesToWB; + } + + incrStIdx(store_idx); + } +} + +template +void +LSQUnit::writebackStores() +{ + while (storesToWB > 0 && + storeWBIdx != storeTail && + storeQueue[storeWBIdx].inst && + storeQueue[storeWBIdx].canWB && + usedPorts < cachePorts) { + + if (storeQueue[storeWBIdx].size == 0) { + completeStore(storeWBIdx); + + incrStIdx(storeWBIdx); + + continue; + } + + if (dcacheInterface && dcacheInterface->isBlocked()) { + DPRINTF(LSQUnit, "Unable to write back any more stores, cache" + 
" is blocked!\n"); + break; + } + + ++usedPorts; + + if (storeQueue[storeWBIdx].inst->isDataPrefetch()) { + incrStIdx(storeWBIdx); + + continue; + } + + assert(storeQueue[storeWBIdx].req); + assert(!storeQueue[storeWBIdx].committed); + + MemReqPtr req = storeQueue[storeWBIdx].req; + storeQueue[storeWBIdx].committed = true; + +// Fault fault = cpu->translateDataWriteReq(req); + req->cmd = Write; + req->completionEvent = NULL; + req->time = curTick; + assert(!req->data); + req->data = new uint8_t[64]; + memcpy(req->data, (uint8_t *)&storeQueue[storeWBIdx].data, req->size); + + DPRINTF(LSQUnit, "D-Cache: Writing back store idx:%i PC:%#x " + "to Addr:%#x, data:%#x [sn:%lli]\n", + storeWBIdx,storeQueue[storeWBIdx].inst->readPC(), + req->paddr, *(req->data), + storeQueue[storeWBIdx].inst->seqNum); + +// if (fault != NoFault) { + //What should we do if there is a fault??? + //for now panic +// panic("Page Table Fault!!!!!\n"); +// } + switch(storeQueue[storeWBIdx].size) { + case 1: + cpu->write(req, (uint8_t &)storeQueue[storeWBIdx].data); + break; + case 2: + cpu->write(req, (uint16_t &)storeQueue[storeWBIdx].data); + break; + case 4: + cpu->write(req, (uint32_t &)storeQueue[storeWBIdx].data); + break; + case 8: + cpu->write(req, (uint64_t &)storeQueue[storeWBIdx].data); + break; + default: + panic("Unexpected store size!\n"); + } + + if (dcacheInterface) { + MemAccessResult result = dcacheInterface->access(req); + + if (isStalled() && + storeQueue[storeWBIdx].inst->seqNum == stallingStoreIsn) { + DPRINTF(LSQUnit, "Unstalling, stalling store [sn:%lli] " + "load idx:%i\n", + stallingStoreIsn, stallingLoadIdx); + stalled = false; + stallingStoreIsn = 0; + iewStage->replayMemInst(loadQueue[stallingLoadIdx]); + } + + if (result != MA_HIT && dcacheInterface->doEvents()) { + typename IEW::LdWritebackEvent *wb = NULL; + if (req->flags & LOCKED) { + // Stx_C does not generate a system port transaction. 
+/* + if (cpu->lockFlag && cpu->lockAddr == req->paddr) { + req->result=1; + } else { + req->result = 0; + } +*/ + wb = new typename IEW::LdWritebackEvent(storeQueue[storeWBIdx].inst, + iewStage); + } + + DPRINTF(LSQUnit,"D-Cache Write Miss!\n"); + + DPRINTF(Activity, "Active st accessing mem miss [sn:%lli]\n", + storeQueue[storeWBIdx].inst->seqNum); + + // Will stores need their own kind of writeback events? + // Do stores even need writeback events? + assert(!req->completionEvent); + req->completionEvent = new + StoreCompletionEvent(storeWBIdx, wb, this); + + lastDcacheStall = curTick; + + _status = DcacheMissStall; + + //mshrSeqNums.push_back(storeQueue[storeWBIdx].inst->seqNum); + + //DPRINTF(LSQUnit, "Added MSHR. count = %i\n",mshrSeqNums.size()); + + // Increment stat here or something + } else { + DPRINTF(LSQUnit,"D-Cache: Write Hit on idx:%i !\n", + storeWBIdx); + + DPRINTF(Activity, "Active st accessing mem hit [sn:%lli]\n", + storeQueue[storeWBIdx].inst->seqNum); + + + if (req->flags & LOCKED) { + // Stx_C does not generate a system port transaction. +/* + if (req->flags & UNCACHEABLE) { + req->result = 2; + } else { + if (cpu->lockFlag && cpu->lockAddr == req->paddr) { + req->result=1; + } else { + req->result = 0; + } + } +*/ + typename IEW::LdWritebackEvent *wb = + new typename IEW::LdWritebackEvent(storeQueue[storeWBIdx].inst, + iewStage); + wb->schedule(curTick); + } + + completeStore(storeWBIdx); + } + + incrStIdx(storeWBIdx); + } else { + panic("Must HAVE DCACHE!!!!!\n"); + } + } + + // Not sure this should set it to 0. + usedPorts = 0; + + assert(stores >= 0 && storesToWB >= 0); +} + +/*template +void +LSQUnit::removeMSHR(InstSeqNum seqNum) +{ + list::iterator mshr_it = find(mshrSeqNums.begin(), + mshrSeqNums.end(), + seqNum); + + if (mshr_it != mshrSeqNums.end()) { + mshrSeqNums.erase(mshr_it); + DPRINTF(LSQUnit, "Removing MSHR. 
count = %i\n",mshrSeqNums.size()); + } +}*/ + +template +void +LSQUnit::squash(const InstSeqNum &squashed_num) +{ + DPRINTF(LSQUnit, "Squashing until [sn:%lli]!" + "(Loads:%i Stores:%i)\n",squashed_num,loads,stores); + + int load_idx = loadTail; + decrLdIdx(load_idx); + + while (loads != 0 && loadQueue[load_idx]->seqNum > squashed_num) { + + // Clear the smart pointer to make sure it is decremented. + DPRINTF(LSQUnit,"Load Instruction PC %#x squashed, " + "[sn:%lli]\n", + loadQueue[load_idx]->readPC(), + loadQueue[load_idx]->seqNum); + + if (isStalled() && load_idx == stallingLoadIdx) { + stalled = false; + stallingStoreIsn = 0; + stallingLoadIdx = 0; + } + + loadQueue[load_idx]->squashed = true; + loadQueue[load_idx] = NULL; + --loads; + + // Inefficient! + loadTail = load_idx; + + decrLdIdx(load_idx); + } + + if (isLoadBlocked) { + if (squashed_num < blockedLoadSeqNum) { + isLoadBlocked = false; + loadBlockedHandled = false; + blockedLoadSeqNum = 0; + } + } + + int store_idx = storeTail; + decrStIdx(store_idx); + + while (stores != 0 && + storeQueue[store_idx].inst->seqNum > squashed_num) { + + if (storeQueue[store_idx].canWB) { + break; + } + + // Clear the smart pointer to make sure it is decremented. + DPRINTF(LSQUnit,"Store Instruction PC %#x squashed, " + "idx:%i [sn:%lli]\n", + storeQueue[store_idx].inst->readPC(), + store_idx, storeQueue[store_idx].inst->seqNum); + + // I don't think this can happen. It should have been cleared by the + // stalling load. + if (isStalled() && + storeQueue[store_idx].inst->seqNum == stallingStoreIsn) { + panic("Is stalled should have been cleared by stalling load!\n"); + stalled = false; + stallingStoreIsn = 0; + } + + storeQueue[store_idx].inst->squashed = true; + storeQueue[store_idx].inst = NULL; + storeQueue[store_idx].canWB = 0; + + if (storeQueue[store_idx].req) { + assert(!storeQueue[store_idx].req->completionEvent); + } + storeQueue[store_idx].req = NULL; + --stores; + + // Inefficient! 
+ storeTail = store_idx; + + decrStIdx(store_idx); + } +} + +template +void +LSQUnit::dumpInsts() +{ + cprintf("Load store queue: Dumping instructions.\n"); + cprintf("Load queue size: %i\n", loads); + cprintf("Load queue: "); + + int load_idx = loadHead; + + while (load_idx != loadTail && loadQueue[load_idx]) { + cprintf("%#x ", loadQueue[load_idx]->readPC()); + + incrLdIdx(load_idx); + } + + cprintf("Store queue size: %i\n", stores); + cprintf("Store queue: "); + + int store_idx = storeHead; + + while (store_idx != storeTail && storeQueue[store_idx].inst) { + cprintf("%#x ", storeQueue[store_idx].inst->readPC()); + + incrStIdx(store_idx); + } + + cprintf("\n"); +} + +template +void +LSQUnit::completeStore(int store_idx) +{ + assert(storeQueue[store_idx].inst); + storeQueue[store_idx].completed = true; + --storesToWB; + // A bit conservative because a store completion may not free up entries, + // but hopefully avoids two store completions in one cycle from making + // the CPU tick twice. 
+ cpu->activityThisCycle(); + + if (store_idx == storeHead) { + do { + incrStIdx(storeHead); + + --stores; + } while (storeQueue[storeHead].completed && + storeHead != storeTail); + + iewStage->updateLSQNextCycle = true; + } + + DPRINTF(LSQUnit, "Store head idx:%i\n", storeHead); + + if (isStalled() && + storeQueue[store_idx].inst->seqNum == stallingStoreIsn) { + DPRINTF(LSQUnit, "Unstalling, stalling store [sn:%lli] " + "load idx:%i\n", + stallingStoreIsn, stallingLoadIdx); + stalled = false; + stallingStoreIsn = 0; + iewStage->replayMemInst(loadQueue[stallingLoadIdx]); + } +} + +template +inline void +LSQUnit::incrStIdx(int &store_idx) +{ + if (++store_idx >= SQEntries) + store_idx = 0; +} + +template +inline void +LSQUnit::decrStIdx(int &store_idx) +{ + if (--store_idx < 0) + store_idx += SQEntries; +} + +template +inline void +LSQUnit::incrLdIdx(int &load_idx) +{ + if (++load_idx >= LQEntries) + load_idx = 0; +} + +template +inline void +LSQUnit::decrLdIdx(int &load_idx) +{ + if (--load_idx < 0) + load_idx += LQEntries; +} diff --git a/cpu/o3/mem_dep_unit.cc b/cpu/o3/mem_dep_unit.cc index 9c1e7f9d8..ccdd1a515 100644 --- a/cpu/o3/mem_dep_unit.cc +++ b/cpu/o3/mem_dep_unit.cc @@ -34,3 +34,13 @@ // Force instantation of memory dependency unit using store sets and // AlphaSimpleImpl. template class MemDepUnit; + +template <> +int +MemDepUnit::MemDepEntry::memdep_count = 0; +template <> +int +MemDepUnit::MemDepEntry::memdep_insert = 0; +template <> +int +MemDepUnit::MemDepEntry::memdep_erase = 0; diff --git a/cpu/o3/mem_dep_unit.hh b/cpu/o3/mem_dep_unit.hh index ca63577a1..32ce9f768 100644 --- a/cpu/o3/mem_dep_unit.hh +++ b/cpu/o3/mem_dep_unit.hh @@ -26,15 +26,29 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __CPU_O3_CPU_MEM_DEP_UNIT_HH__ -#define __CPU_O3_CPU_MEM_DEP_UNIT_HH__ +#ifndef __CPU_O3_MEM_DEP_UNIT_HH__ +#define __CPU_O3_MEM_DEP_UNIT_HH__ -#include +#include #include +#include "base/hashmap.hh" +#include "base/refcnt.hh" #include "base/statistics.hh" #include "cpu/inst_seq.hh" +struct SNHash { + size_t operator() (const InstSeqNum &seq_num) const { + unsigned a = (unsigned)seq_num; + unsigned hash = (((a >> 14) ^ ((a >> 2) & 0xffff))) & 0x7FFFFFFF; + + return hash; + } +}; + +template +class InstructionQueue; + /** * Memory dependency unit class. This holds the memory dependence predictor. * As memory operations are issued to the IQ, they are also issued to this @@ -52,101 +66,162 @@ class MemDepUnit { typedef typename Impl::Params Params; typedef typename Impl::DynInstPtr DynInstPtr; - public: - MemDepUnit(Params ¶ms); + /** Empty constructor. Must call init() prior to using in this case. */ + MemDepUnit() {} + /** Constructs a MemDepUnit with given parameters. */ + MemDepUnit(Params *params); + + /** Frees up any memory allocated. */ + ~MemDepUnit(); + + /** Returns the name of the memory dependence unit. */ + std::string name() const; + + /** Initializes the unit with parameters and a thread id. */ + void init(Params *params, int tid); + + /** Registers statistics. */ void regStats(); + /** Sets the pointer to the IQ. */ + void setIQ(InstructionQueue *iq_ptr); + + /** Inserts a memory instruction. */ void insert(DynInstPtr &inst); + /** Inserts a non-speculative memory instruction. */ void insertNonSpec(DynInstPtr &inst); - // Will want to make this operation relatively fast. Right now it - // is somewhat slow. - DynInstPtr &top(); - - void pop(); + /** Inserts a barrier instruction. */ + void insertBarrier(DynInstPtr &barr_inst); + /** Indicate that an instruction has its registers ready. */ void regsReady(DynInstPtr &inst); + /** Indicate that a non-speculative instruction is ready. 
*/ void nonSpecInstReady(DynInstPtr &inst); - void issue(DynInstPtr &inst); + /** Reschedules an instruction to be re-executed. */ + void reschedule(DynInstPtr &inst); + /** Replays all instructions that have been rescheduled by moving them to + * the ready list. + */ + void replay(DynInstPtr &inst); + + /** Completes a memory instruction. */ + void completed(DynInstPtr &inst); + + /** Completes a barrier instruction. */ + void completeBarrier(DynInstPtr &inst); + + /** Wakes any dependents of a memory instruction. */ void wakeDependents(DynInstPtr &inst); - void squash(const InstSeqNum &squashed_num); + /** Squashes all instructions up until a given sequence number for a + * specific thread. + */ + void squash(const InstSeqNum &squashed_num, unsigned tid); + /** Indicates an ordering violation between a store and a younger load. */ void violation(DynInstPtr &store_inst, DynInstPtr &violating_load); - inline bool empty() - { return readyInsts.empty(); } + /** Issues the given instruction */ + void issue(DynInstPtr &inst); + + /** Debugging function to dump the lists of instructions. */ + void dumpLists(); private: - typedef typename std::set::iterator sn_it_t; - typedef typename std::map::iterator dyn_it_t; + typedef typename std::list::iterator ListIt; - // Forward declarations so that the following two typedefs work. - class Dependency; - class ltDependency; + class MemDepEntry; - typedef typename std::set::iterator dep_it_t; - typedef typename std::map >::iterator - sd_it_t; + typedef RefCountingPtr MemDepEntryPtr; - struct Dependency { - Dependency(const InstSeqNum &_seqNum) - : seqNum(_seqNum), regsReady(0), memDepReady(0) - { } + /** Memory dependence entries that track memory operations, marking + * when the instruction is ready to execute and what instructions depend + * upon it. + */ + class MemDepEntry : public RefCounted { + public: + /** Constructs a memory dependence entry. 
*/ + MemDepEntry(DynInstPtr &new_inst) + : inst(new_inst), regsReady(false), memDepReady(false), + completed(false), squashed(false) + { + ++memdep_count; - Dependency(const InstSeqNum &_seqNum, bool _regsReady, - bool _memDepReady) - : seqNum(_seqNum), regsReady(_regsReady), - memDepReady(_memDepReady) - { } + DPRINTF(MemDepUnit, "Memory dependency entry created. " + "memdep_count=%i\n", memdep_count); + } - InstSeqNum seqNum; - mutable bool regsReady; - mutable bool memDepReady; - mutable sd_it_t storeDep; + /** Frees any pointers. */ + ~MemDepEntry() + { + for (int i = 0; i < dependInsts.size(); ++i) { + dependInsts[i] = NULL; + } + + --memdep_count; + + DPRINTF(MemDepUnit, "Memory dependency entry deleted. " + "memdep_count=%i\n", memdep_count); + } + + /** Returns the name of the memory dependence entry. */ + std::string name() const { return "memdepentry"; } + + /** The instruction being tracked. */ + DynInstPtr inst; + + /** The iterator to the instruction's location inside the list. */ + ListIt listIt; + + /** A vector of any dependent instructions. */ + std::vector dependInsts; + + /** If the registers are ready or not. */ + bool regsReady; + /** If all memory dependencies have been satisfied. */ + bool memDepReady; + /** If the instruction is completed. */ + bool completed; + /** If the instruction is squashed. */ + bool squashed; + + /** For debugging. */ + static int memdep_count; + static int memdep_insert; + static int memdep_erase; }; - struct ltDependency { - bool operator() (const Dependency &lhs, const Dependency &rhs) + struct ltMemDepEntry { + bool operator() (const MemDepEntryPtr &lhs, const MemDepEntryPtr &rhs) { - return lhs.seqNum < rhs.seqNum; + return lhs->inst->seqNum < rhs->inst->seqNum; } }; - inline void moveToReady(dep_it_t &woken_inst); + /** Finds the memory dependence entry in the hash map. 
*/ + inline MemDepEntryPtr &findInHash(const DynInstPtr &inst); - /** List of instructions that have passed through rename, yet are still - * waiting on either a memory dependence to resolve or source registers to - * become available before they can issue. - */ - std::set waitingInsts; + /** Moves an entry to the ready list. */ + inline void moveToReady(MemDepEntryPtr &ready_inst_entry); - /** List of instructions that have all their predicted memory dependences - * resolved and their source registers ready. - */ - std::set readyInsts; + typedef m5::hash_map MemDepHash; - // Change this to hold a vector of iterators, which will point to the - // entry of the waiting instructions. - /** List of stores' sequence numbers, each of which has a vector of - * iterators. The iterators point to the appropriate node within - * waitingInsts that has the depenendent instruction. - */ - std::map > storeDependents; + typedef typename MemDepHash::iterator MemDepHashIt; - // For now will implement this as a map...hash table might not be too - // bad, or could move to something that mimics the current dependency - // graph. - std::map memInsts; + /** A hash map of all memory dependence entries. */ + MemDepHash memDepHash; - // Iterator pointer to the top instruction which has is ready. - // Is set by the top() call. - dyn_it_t topInst; + /** A list of all instructions in the memory dependence unit. */ + std::list instList[Impl::MaxThreads]; + + /** A list of all instructions that are going to be replayed. */ + std::list instsToReplay; /** The memory dependence predictor. It is accessed upon new * instructions being added to the IQ, and responds by telling @@ -155,10 +230,25 @@ class MemDepUnit { */ MemDepPred depPred; + bool loadBarrier; + InstSeqNum loadBarrierSN; + bool storeBarrier; + InstSeqNum storeBarrierSN; + + /** Pointer to the IQ. */ + InstructionQueue *iqPtr; + + /** The thread id of this memory dependence unit. */ + int id; + + /** Stat for number of inserted loads. 
*/ Stats::Scalar<> insertedLoads; + /** Stat for number of inserted stores. */ Stats::Scalar<> insertedStores; + /** Stat for number of conflicting loads that had to wait for a store. */ Stats::Scalar<> conflictingLoads; + /** Stat for number of conflicting stores that had to wait for a store. */ Stats::Scalar<> conflictingStores; }; -#endif // __CPU_O3_CPU_MEM_DEP_UNIT_HH__ +#endif // __CPU_O3_MEM_DEP_UNIT_HH__ diff --git a/cpu/o3/mem_dep_unit_impl.hh b/cpu/o3/mem_dep_unit_impl.hh index 296db4c4e..771a0505e 100644 --- a/cpu/o3/mem_dep_unit_impl.hh +++ b/cpu/o3/mem_dep_unit_impl.hh @@ -28,13 +28,56 @@ #include +#include "cpu/o3/inst_queue.hh" #include "cpu/o3/mem_dep_unit.hh" template -MemDepUnit::MemDepUnit(Params ¶ms) - : depPred(params.SSITSize, params.LFSTSize) +MemDepUnit::MemDepUnit(Params *params) + : depPred(params->SSITSize, params->LFSTSize), loadBarrier(false), + loadBarrierSN(0), storeBarrier(false), storeBarrierSN(0), iqPtr(NULL) { - DPRINTF(MemDepUnit, "MemDepUnit: Creating MemDepUnit object.\n"); + DPRINTF(MemDepUnit, "Creating MemDepUnit object.\n"); +} + +template +MemDepUnit::~MemDepUnit() +{ + for (int tid=0; tid < Impl::MaxThreads; tid++) { + + ListIt inst_list_it = instList[tid].begin(); + + MemDepHashIt hash_it; + + while (!instList[tid].empty()) { + hash_it = memDepHash.find((*inst_list_it)->seqNum); + + assert(hash_it != memDepHash.end()); + + memDepHash.erase(hash_it); + + instList[tid].erase(inst_list_it++); + } + } + + assert(MemDepEntry::memdep_count == 0); +} + +template +std::string +MemDepUnit::name() const +{ + return "memdepunit"; +} + +template +void +MemDepUnit::init(Params *params, int tid) +{ + DPRINTF(MemDepUnit, "Creating MemDepUnit %i object.\n",tid); + + id = tid; + + depPred.init(params->SSITSize, params->LFSTSize); } template @@ -58,58 +101,79 @@ MemDepUnit::regStats() .desc("Number of conflicting stores."); } +template +void +MemDepUnit::setIQ(InstructionQueue *iq_ptr) +{ + iqPtr = iq_ptr; +} + template void 
MemDepUnit::insert(DynInstPtr &inst) { - InstSeqNum inst_seq_num = inst->seqNum; + unsigned tid = inst->threadNumber; - Dependency unresolved_dependencies(inst_seq_num); + MemDepEntryPtr inst_entry = new MemDepEntry(inst); - InstSeqNum producing_store = depPred.checkInst(inst->readPC()); + // Add the MemDepEntry to the hash. + memDepHash.insert( + std::pair(inst->seqNum, inst_entry)); + MemDepEntry::memdep_insert++; - if (producing_store == 0 || - storeDependents.find(producing_store) == storeDependents.end()) { + // Add the instruction to the instruction list. + instList[tid].push_back(inst); - DPRINTF(MemDepUnit, "MemDepUnit: No dependency for inst PC " - "%#x.\n", inst->readPC()); + inst_entry->listIt = --(instList[tid].end()); - unresolved_dependencies.storeDep = storeDependents.end(); + // Check the dependence predictor for any producing stores. + InstSeqNum producing_store; + if (inst->isLoad() && loadBarrier) { + producing_store = loadBarrierSN; + } else if (inst->isStore() && storeBarrier) { + producing_store = storeBarrierSN; + } else { + producing_store = depPred.checkInst(inst->readPC()); + } + + MemDepEntryPtr store_entry = NULL; + + // If there is a producing store, try to find the entry. + if (producing_store != 0) { + MemDepHashIt hash_it = memDepHash.find(producing_store); + + if (hash_it != memDepHash.end()) { + store_entry = (*hash_it).second; + } + } + + // If no store entry, then instruction can issue as soon as the registers + // are ready. 
+ if (!store_entry) { + DPRINTF(MemDepUnit, "No dependency for inst PC " + "%#x [sn:%lli].\n", inst->readPC(), inst->seqNum); + + inst_entry->memDepReady = true; if (inst->readyToIssue()) { - readyInsts.insert(inst_seq_num); - } else { - unresolved_dependencies.memDepReady = true; + inst_entry->regsReady = true; - waitingInsts.insert(unresolved_dependencies); + moveToReady(inst_entry); } } else { - DPRINTF(MemDepUnit, "MemDepUnit: Adding to dependency list; " - "inst PC %#x is dependent on seq num %i.\n", + // Otherwise make the instruction dependent on the store. + DPRINTF(MemDepUnit, "Adding to dependency list; " + "inst PC %#x is dependent on [sn:%lli].\n", inst->readPC(), producing_store); if (inst->readyToIssue()) { - unresolved_dependencies.regsReady = true; + inst_entry->regsReady = true; } - // Find the store that this instruction is dependent on. - sd_it_t store_loc = storeDependents.find(producing_store); - - assert(store_loc != storeDependents.end()); - - // Record the location of the store that this instruction is - // dependent on. - unresolved_dependencies.storeDep = store_loc; - - // If it's not already ready, then add it to the renamed - // list and the dependencies. - dep_it_t inst_loc = - (waitingInsts.insert(unresolved_dependencies)).first; - // Add this instruction to the list of dependents. - (*store_loc).second.push_back(inst_loc); + store_entry->dependInsts.push_back(inst_entry); - assert(!(*store_loc).second.empty()); +// inst_entry->producingStore = store_entry; if (inst->isLoad()) { ++conflictingLoads; @@ -119,127 +183,105 @@ MemDepUnit::insert(DynInstPtr &inst) } if (inst->isStore()) { - DPRINTF(MemDepUnit, "MemDepUnit: Inserting store PC %#x.\n", - inst->readPC()); + DPRINTF(MemDepUnit, "Inserting store PC %#x [sn:%lli].\n", + inst->readPC(), inst->seqNum); - depPred.insertStore(inst->readPC(), inst_seq_num); - - // Make sure this store isn't already in this list. 
- assert(storeDependents.find(inst_seq_num) == storeDependents.end()); - - // Put a dependency entry in at the store's sequence number. - // Uh, not sure how this works...I want to create an entry but - // I don't have anything to put into the value yet. - storeDependents[inst_seq_num]; - - assert(storeDependents.size() != 0); + depPred.insertStore(inst->readPC(), inst->seqNum, inst->threadNumber); ++insertedStores; - } else if (inst->isLoad()) { ++insertedLoads; } else { - panic("MemDepUnit: Unknown type! (most likely a barrier)."); + panic("Unknown type! (most likely a barrier)."); } - - memInsts[inst_seq_num] = inst; } template void MemDepUnit::insertNonSpec(DynInstPtr &inst) { - InstSeqNum inst_seq_num = inst->seqNum; + unsigned tid = inst->threadNumber; - Dependency non_spec_inst(inst_seq_num); + MemDepEntryPtr inst_entry = new MemDepEntry(inst); - non_spec_inst.storeDep = storeDependents.end(); + // Insert the MemDepEntry into the hash. + memDepHash.insert( + std::pair(inst->seqNum, inst_entry)); + MemDepEntry::memdep_insert++; - waitingInsts.insert(non_spec_inst); + // Add the instruction to the list. + instList[tid].push_back(inst); + + inst_entry->listIt = --(instList[tid].end()); // Might want to turn this part into an inline function or something. // It's shared between both insert functions. if (inst->isStore()) { - DPRINTF(MemDepUnit, "MemDepUnit: Inserting store PC %#x.\n", - inst->readPC()); + DPRINTF(MemDepUnit, "Inserting store PC %#x [sn:%lli].\n", + inst->readPC(), inst->seqNum); - depPred.insertStore(inst->readPC(), inst_seq_num); - - // Make sure this store isn't already in this list. - assert(storeDependents.find(inst_seq_num) == storeDependents.end()); - - // Put a dependency entry in at the store's sequence number. - // Uh, not sure how this works...I want to create an entry but - // I don't have anything to put into the value yet. 
- storeDependents[inst_seq_num]; - - assert(storeDependents.size() != 0); + depPred.insertStore(inst->readPC(), inst->seqNum, inst->threadNumber); ++insertedStores; - } else if (inst->isLoad()) { ++insertedLoads; } else { - panic("MemDepUnit: Unknown type! (most likely a barrier)."); + panic("Unknown type! (most likely a barrier)."); } - - memInsts[inst_seq_num] = inst; -} - -template -typename Impl::DynInstPtr & -MemDepUnit::top() -{ - topInst = memInsts.find( (*readyInsts.begin()) ); - - DPRINTF(MemDepUnit, "MemDepUnit: Top instruction is PC %#x.\n", - (*topInst).second->readPC()); - - return (*topInst).second; } template void -MemDepUnit::pop() +MemDepUnit::insertBarrier(DynInstPtr &barr_inst) { - DPRINTF(MemDepUnit, "MemDepUnit: Removing instruction PC %#x.\n", - (*topInst).second->readPC()); + InstSeqNum barr_sn = barr_inst->seqNum; + if (barr_inst->isMemBarrier()) { + loadBarrier = true; + loadBarrierSN = barr_sn; + storeBarrier = true; + storeBarrierSN = barr_sn; + DPRINTF(MemDepUnit, "Inserted a memory barrier\n"); + } else if (barr_inst->isWriteBarrier()) { + storeBarrier = true; + storeBarrierSN = barr_sn; + DPRINTF(MemDepUnit, "Inserted a write barrier\n"); + } - wakeDependents((*topInst).second); + unsigned tid = barr_inst->threadNumber; - issue((*topInst).second); + MemDepEntryPtr inst_entry = new MemDepEntry(barr_inst); - memInsts.erase(topInst); + // Add the MemDepEntry to the hash. + memDepHash.insert( + std::pair(barr_sn, inst_entry)); + MemDepEntry::memdep_insert++; - topInst = memInsts.end(); + // Add the instruction to the instruction list. 
+ instList[tid].push_back(barr_inst); + + inst_entry->listIt = --(instList[tid].end()); } template void MemDepUnit::regsReady(DynInstPtr &inst) { - DPRINTF(MemDepUnit, "MemDepUnit: Marking registers as ready for " - "instruction PC %#x.\n", - inst->readPC()); + DPRINTF(MemDepUnit, "Marking registers as ready for " + "instruction PC %#x [sn:%lli].\n", + inst->readPC(), inst->seqNum); - InstSeqNum inst_seq_num = inst->seqNum; + MemDepEntryPtr inst_entry = findInHash(inst); - Dependency inst_to_find(inst_seq_num); + inst_entry->regsReady = true; - dep_it_t waiting_inst = waitingInsts.find(inst_to_find); - - assert(waiting_inst != waitingInsts.end()); - - if ((*waiting_inst).memDepReady) { - DPRINTF(MemDepUnit, "MemDepUnit: Instruction has its memory " + if (inst_entry->memDepReady) { + DPRINTF(MemDepUnit, "Instruction has its memory " "dependencies resolved, adding it to the ready list.\n"); - moveToReady(waiting_inst); + moveToReady(inst_entry); } else { - DPRINTF(MemDepUnit, "MemDepUnit: Instruction still waiting on " + DPRINTF(MemDepUnit, "Instruction still waiting on " "memory dependency.\n"); - - (*waiting_inst).regsReady = true; } } @@ -247,149 +289,182 @@ template void MemDepUnit::nonSpecInstReady(DynInstPtr &inst) { - DPRINTF(MemDepUnit, "MemDepUnit: Marking non speculative " - "instruction PC %#x as ready.\n", - inst->readPC()); + DPRINTF(MemDepUnit, "Marking non speculative " + "instruction PC %#x as ready [sn:%lli].\n", + inst->readPC(), inst->seqNum); - InstSeqNum inst_seq_num = inst->seqNum; + MemDepEntryPtr inst_entry = findInHash(inst); - Dependency inst_to_find(inst_seq_num); - - dep_it_t waiting_inst = waitingInsts.find(inst_to_find); - - assert(waiting_inst != waitingInsts.end()); - - moveToReady(waiting_inst); + moveToReady(inst_entry); } template void -MemDepUnit::issue(DynInstPtr &inst) +MemDepUnit::reschedule(DynInstPtr &inst) { - assert(readyInsts.find(inst->seqNum) != readyInsts.end()); + instsToReplay.push_back(inst); +} - DPRINTF(MemDepUnit, 
"MemDepUnit: Issuing instruction PC %#x.\n", - inst->readPC()); +template +void +MemDepUnit::replay(DynInstPtr &inst) +{ + DynInstPtr temp_inst; + bool found_inst = false; - // Remove the instruction from the ready list. - readyInsts.erase(inst->seqNum); + while (!instsToReplay.empty()) { + temp_inst = instsToReplay.front(); - depPred.issued(inst->readPC(), inst->seqNum, inst->isStore()); + MemDepEntryPtr inst_entry = findInHash(temp_inst); + + DPRINTF(MemDepUnit, "Replaying mem instruction PC %#x " + "[sn:%lli].\n", + temp_inst->readPC(), temp_inst->seqNum); + + moveToReady(inst_entry); + + if (temp_inst == inst) { + found_inst = true; + } + + instsToReplay.pop_front(); + } + + assert(found_inst); +} + +template +void +MemDepUnit::completed(DynInstPtr &inst) +{ + DPRINTF(MemDepUnit, "Completed mem instruction PC %#x " + "[sn:%lli].\n", + inst->readPC(), inst->seqNum); + + unsigned tid = inst->threadNumber; + + // Remove the instruction from the hash and the list. + MemDepHashIt hash_it = memDepHash.find(inst->seqNum); + + assert(hash_it != memDepHash.end()); + + instList[tid].erase((*hash_it).second->listIt); + +// (*hash_it).second->inst = NULL; + + (*hash_it).second = NULL; + + memDepHash.erase(hash_it); + MemDepEntry::memdep_erase++; +} + +template +void +MemDepUnit::completeBarrier(DynInstPtr &inst) +{ + wakeDependents(inst); + completed(inst); + + InstSeqNum barr_sn = inst->seqNum; + + if (inst->isMemBarrier()) { + assert(loadBarrier && storeBarrier); + if (loadBarrierSN == barr_sn) + loadBarrier = false; + if (storeBarrierSN == barr_sn) + storeBarrier = false; + } else if (inst->isWriteBarrier()) { + assert(storeBarrier); + if (storeBarrierSN == barr_sn) + storeBarrier = false; + } } template void MemDepUnit::wakeDependents(DynInstPtr &inst) { - // Only stores have dependents. - if (!inst->isStore()) { + // Only stores and barriers have dependents. 
+ if (!inst->isStore() && !inst->isMemBarrier() && !inst->isWriteBarrier()) { return; } - // Wake any dependencies. - sd_it_t sd_it = storeDependents.find(inst->seqNum); + MemDepEntryPtr inst_entry = findInHash(inst); - // If there's no entry, then return. Really there should only be - // no entry if the instruction is a load. - if (sd_it == storeDependents.end()) { - DPRINTF(MemDepUnit, "MemDepUnit: Instruction PC %#x, sequence " - "number %i has no dependents.\n", - inst->readPC(), inst->seqNum); + for (int i = 0; i < inst_entry->dependInsts.size(); ++i ) { + MemDepEntryPtr woken_inst = inst_entry->dependInsts[i]; - return; - } - - for (int i = 0; i < (*sd_it).second.size(); ++i ) { - dep_it_t woken_inst = (*sd_it).second[i]; - - DPRINTF(MemDepUnit, "MemDepUnit: Waking up a dependent inst, " - "sequence number %i.\n", - (*woken_inst).seqNum); -#if 0 - // Should we have reached instructions that are actually squashed, - // there will be no more useful instructions in this dependency - // list. Break out early. - if (waitingInsts.find(woken_inst) == waitingInsts.end()) { - DPRINTF(MemDepUnit, "MemDepUnit: Dependents on inst PC %#x " - "are squashed, starting at SN %i. 
Breaking early.\n", - inst->readPC(), woken_inst); - break; + if (!woken_inst->inst) { + // Potentially removed mem dep entries could be on this list +// inst_entry->dependInsts[i] = NULL; + continue; } -#endif - if ((*woken_inst).regsReady) { + DPRINTF(MemDepUnit, "Waking up a dependent inst, " + "[sn:%lli].\n", + woken_inst->inst->seqNum); + + if (woken_inst->regsReady && !woken_inst->squashed) { moveToReady(woken_inst); } else { - (*woken_inst).memDepReady = true; + woken_inst->memDepReady = true; } +// inst_entry->dependInsts[i] = NULL; } - storeDependents.erase(sd_it); + inst_entry->dependInsts.clear(); } template void -MemDepUnit::squash(const InstSeqNum &squashed_num) +MemDepUnit::squash(const InstSeqNum &squashed_num, + unsigned tid) { - - if (!waitingInsts.empty()) { - dep_it_t waiting_it = waitingInsts.end(); - - --waiting_it; - - // Remove entries from the renamed list as long as we haven't reached - // the end and the entries continue to be younger than the squashed. - while (!waitingInsts.empty() && - (*waiting_it).seqNum > squashed_num) - { - if (!(*waiting_it).memDepReady && - (*waiting_it).storeDep != storeDependents.end()) { - sd_it_t sd_it = (*waiting_it).storeDep; - - // Make sure the iterator that the store has pointing - // back is actually to this instruction. - assert((*sd_it).second.back() == waiting_it); - - // Now remove this from the store's list of dependent - // instructions. - (*sd_it).second.pop_back(); + if (!instsToReplay.empty()) { + ListIt replay_it = instsToReplay.begin(); + while (replay_it != instsToReplay.end()) { + if ((*replay_it)->threadNumber == tid && + (*replay_it)->seqNum > squashed_num) { + instsToReplay.erase(replay_it++); + } else { + ++replay_it; } - - waitingInsts.erase(waiting_it--); } } - if (!readyInsts.empty()) { - sn_it_t ready_it = readyInsts.end(); + ListIt squash_it = instList[tid].end(); + --squash_it; - --ready_it; + MemDepHashIt hash_it; - // Same for the ready list. 
- while (!readyInsts.empty() && - (*ready_it) > squashed_num) - { - readyInsts.erase(ready_it--); + while (!instList[tid].empty() && + (*squash_it)->seqNum > squashed_num) { + + DPRINTF(MemDepUnit, "Squashing inst [sn:%lli]\n", + (*squash_it)->seqNum); + + hash_it = memDepHash.find((*squash_it)->seqNum); + + assert(hash_it != memDepHash.end()); + + (*hash_it).second->squashed = true; +/* + for (int i = 0; i < (*hash_it).second->dependInsts.size(); ++i) { + (*hash_it).second->dependInsts[i] = NULL; } - } - if (!storeDependents.empty()) { - sd_it_t dep_it = storeDependents.end(); + (*hash_it).second->inst = NULL; +*/ + (*hash_it).second = NULL; - --dep_it; + memDepHash.erase(hash_it); + MemDepEntry::memdep_erase++; - // Same for the dependencies list. - while (!storeDependents.empty() && - (*dep_it).first > squashed_num) - { - // This store's list of dependent instructions should be empty. - assert((*dep_it).second.empty()); - - storeDependents.erase(dep_it--); - } + instList[tid].erase(squash_it--); } // Tell the dependency predictor to squash as well. - depPred.squash(squashed_num); + depPred.squash(squashed_num, tid); } template @@ -397,7 +472,7 @@ void MemDepUnit::violation(DynInstPtr &store_inst, DynInstPtr &violating_load) { - DPRINTF(MemDepUnit, "MemDepUnit: Passing violating PCs to store sets," + DPRINTF(MemDepUnit, "Passing violating PCs to store sets," " load: %#x, store: %#x\n", violating_load->readPC(), store_inst->readPC()); // Tell the memory dependence unit of the violation. @@ -405,15 +480,64 @@ MemDepUnit::violation(DynInstPtr &store_inst, } template -inline void -MemDepUnit::moveToReady(dep_it_t &woken_inst) +void +MemDepUnit::issue(DynInstPtr &inst) { - DPRINTF(MemDepUnit, "MemDepUnit: Adding instruction sequence number %i " - "to the ready list.\n", (*woken_inst).seqNum); + DPRINTF(MemDepUnit, "Issuing instruction PC %#x [sn:%lli].\n", + inst->readPC(), inst->seqNum); - // Add it to the ready list. 
- readyInsts.insert((*woken_inst).seqNum); - - // Remove it from the waiting instructions. - waitingInsts.erase(woken_inst); + depPred.issued(inst->readPC(), inst->seqNum, inst->isStore()); +} + +template +inline typename MemDepUnit::MemDepEntryPtr & +MemDepUnit::findInHash(const DynInstPtr &inst) +{ + MemDepHashIt hash_it = memDepHash.find(inst->seqNum); + + assert(hash_it != memDepHash.end()); + + return (*hash_it).second; +} + +template +inline void +MemDepUnit::moveToReady(MemDepEntryPtr &woken_inst_entry) +{ + DPRINTF(MemDepUnit, "Adding instruction [sn:%lli] " + "to the ready list.\n", woken_inst_entry->inst->seqNum); + + assert(!woken_inst_entry->squashed); + + iqPtr->addReadyMemInst(woken_inst_entry->inst); +} + + +template +void +MemDepUnit::dumpLists() +{ + for (unsigned tid=0; tid < Impl::MaxThreads; tid++) { + cprintf("Instruction list %i size: %i\n", + tid, instList[tid].size()); + + ListIt inst_list_it = instList[tid].begin(); + int num = 0; + + while (inst_list_it != instList[tid].end()) { + cprintf("Instruction:%i\nPC:%#x\n[sn:%i]\n[tid:%i]\nIssued:%i\n" + "Squashed:%i\n\n", + num, (*inst_list_it)->readPC(), + (*inst_list_it)->seqNum, + (*inst_list_it)->threadNumber, + (*inst_list_it)->isIssued(), + (*inst_list_it)->isSquashed()); + inst_list_it++; + ++num; + } + } + + cprintf("Memory dependence hash size: %i\n", memDepHash.size()); + + cprintf("Memory dependence entries: %i\n", MemDepEntry::memdep_count); } diff --git a/cpu/o3/ras.cc b/cpu/o3/ras.cc index 0a7d6ca63..5e7ef38ae 100644 --- a/cpu/o3/ras.cc +++ b/cpu/o3/ras.cc @@ -28,14 +28,17 @@ #include "cpu/o3/ras.hh" -ReturnAddrStack::ReturnAddrStack(unsigned _numEntries) - : numEntries(_numEntries), usedEntries(0), - tos(0) +void +ReturnAddrStack::init(unsigned _numEntries) { - addrStack = new Addr[numEntries]; + numEntries = _numEntries; + usedEntries = 0; + tos = 0; - for (int i = 0; i < numEntries; ++i) - addrStack[i] = 0; + addrStack.resize(numEntries); + + for (int i = 0; i < numEntries; ++i) 
+ addrStack[i] = 0; } void @@ -53,9 +56,6 @@ ReturnAddrStack::push(const Addr &return_addr) void ReturnAddrStack::pop() { - // Not sure it's possible to really track usedEntries properly. -// assert(usedEntries > 0); - if (usedEntries > 0) { --usedEntries; } diff --git a/cpu/o3/ras.hh b/cpu/o3/ras.hh index 46d98181e..5aa4fc05f 100644 --- a/cpu/o3/ras.hh +++ b/cpu/o3/ras.hh @@ -26,43 +26,68 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __CPU_O3_CPU_RAS_HH__ -#define __CPU_O3_CPU_RAS_HH__ +#ifndef __CPU_O3_RAS_HH__ +#define __CPU_O3_RAS_HH__ // For Addr type. #include "arch/isa_traits.hh" +#include +/** Return address stack class, implements a simple RAS. */ class ReturnAddrStack { public: - ReturnAddrStack(unsigned numEntries); + /** Creates a return address stack, but init() must be called prior to + * use. + */ + ReturnAddrStack() {} + /** Initializes RAS with a specified number of entries. + * @param numEntries Number of entries in the RAS. + */ + void init(unsigned numEntries); + + /** Returns the top address on the RAS. */ Addr top() { return addrStack[tos]; } + /** Returns the index of the top of the RAS. */ unsigned topIdx() { return tos; } + /** Pushes an address onto the RAS. */ void push(const Addr &return_addr); + /** Pops the top address from the RAS. */ void pop(); + /** Changes index to the top of the RAS, and replaces the top address with + * a new target. + * @param top_entry_idx The index of the RAS that will now be the top. + * @param restored_target The new target address of the new top of the RAS. + */ void restore(unsigned top_entry_idx, const Addr &restored_target); private: + /** Increments the top of stack index. */ inline void incrTos() { if (++tos == numEntries) tos = 0; } + /** Decrements the top of stack index. */ inline void decrTos() { tos = (tos == 0 ? numEntries - 1 : tos - 1); } - Addr *addrStack; + /** The RAS itself. */ + std::vector addrStack; + /** The number of entries in the RAS. 
*/ unsigned numEntries; + /** The number of used entries in the RAS. */ unsigned usedEntries; + /** The top of stack index. */ unsigned tos; }; -#endif // __CPU_O3_CPU_RAS_HH__ +#endif // __CPU_O3_RAS_HH__ diff --git a/cpu/o3/regfile.hh b/cpu/o3/regfile.hh index 1e6e10f29..78674c32c 100644 --- a/cpu/o3/regfile.hh +++ b/cpu/o3/regfile.hh @@ -26,10 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __CPU_O3_CPU_REGFILE_HH__ -#define __CPU_O3_CPU_REGFILE_HH__ - -// @todo: Destructor +#ifndef __CPU_O3_REGFILE_HH__ +#define __CPU_O3_REGFILE_HH__ #include "arch/isa_traits.hh" #include "arch/faults.hh" @@ -42,11 +40,14 @@ #endif -// This really only depends on the ISA, and not the Impl. It might be nicer -// to see if I can make it depend on nothing... -// Things that are in the ifdef FULL_SYSTEM are pretty dependent on the ISA, -// and should go in the AlphaFullCPU. +#include +/** + * Simple physical register file class. + * This really only depends on the ISA, and not the Impl. Things that are + * in the ifdef FULL_SYSTEM are pretty dependent on the ISA, and probably + * should go in the AlphaFullCPU. + */ template class PhysRegFile { @@ -55,19 +56,18 @@ class PhysRegFile typedef TheISA::FloatReg FloatReg; typedef TheISA::MiscRegFile MiscRegFile; typedef TheISA::MiscReg MiscReg; + // Note that most of the definitions of the IntReg, FloatReg, etc. exist + // within the Impl/ISA class and not within this PhysRegFile class. - //Note that most of the definitions of the IntReg, FloatReg, etc. exist - //within the Impl/ISA class and not within this PhysRegFile class. - - //Will need some way to allow stuff like swap_palshadow to access the - //correct registers. Might require code changes to swap_palshadow and - //other execution contexts. - - //Will make these registers public for now, but they probably should - //be private eventually with some accessor functions. 
+ // Will make these registers public for now, but they probably should + // be private eventually with some accessor functions. public: typedef typename Impl::FullCPU FullCPU; + /** + * Constructs a physical register file with the specified amount of + * integer and floating point registers. + */ PhysRegFile(unsigned _numPhysicalIntRegs, unsigned _numPhysicalFloatRegs); @@ -80,6 +80,7 @@ class PhysRegFile // void serialize(std::ostream &os); // void unserialize(Checkpoint *cp, const std::string §ion); + /** Reads an integer register. */ uint64_t readIntReg(PhysRegIndex reg_idx) { assert(reg_idx < numPhysicalIntRegs); @@ -89,6 +90,7 @@ class PhysRegFile return intRegFile[reg_idx]; } + /** Reads a floating point register (single precision). */ float readFloatRegSingle(PhysRegIndex reg_idx) { // Remove the base Float reg dependency. @@ -102,6 +104,7 @@ class PhysRegFile return (float)floatRegFile[reg_idx].d; } + /** Reads a floating point register (double precision). */ double readFloatRegDouble(PhysRegIndex reg_idx) { // Remove the base Float reg dependency. @@ -115,6 +118,7 @@ class PhysRegFile return floatRegFile[reg_idx].d; } + /** Reads a floating point register as an integer. */ uint64_t readFloatRegInt(PhysRegIndex reg_idx) { // Remove the base Float reg dependency. @@ -128,6 +132,7 @@ class PhysRegFile return floatRegFile[reg_idx].q; } + /** Sets an integer register to the given value. */ void setIntReg(PhysRegIndex reg_idx, uint64_t val) { assert(reg_idx < numPhysicalIntRegs); @@ -135,9 +140,11 @@ class PhysRegFile DPRINTF(IEW, "RegFile: Setting int register %i to %lli\n", int(reg_idx), val); - intRegFile[reg_idx] = val; + if (reg_idx != TheISA::ZeroReg) + intRegFile[reg_idx] = val; } + /** Sets a single precision floating point register to the given value. */ void setFloatRegSingle(PhysRegIndex reg_idx, float val) { // Remove the base Float reg dependency. 
@@ -148,9 +155,11 @@ class PhysRegFile DPRINTF(IEW, "RegFile: Setting float register %i to %8.8f\n", int(reg_idx), val); - floatRegFile[reg_idx].d = (double)val; + if (reg_idx != TheISA::ZeroReg) + floatRegFile[reg_idx].d = (double)val; } + /** Sets a double precision floating point register to the given value. */ void setFloatRegDouble(PhysRegIndex reg_idx, double val) { // Remove the base Float reg dependency. @@ -161,9 +170,11 @@ class PhysRegFile DPRINTF(IEW, "RegFile: Setting float register %i to %8.8f\n", int(reg_idx), val); - floatRegFile[reg_idx].d = val; + if (reg_idx != TheISA::ZeroReg) + floatRegFile[reg_idx].d = val; } + /** Sets a floating point register to the given integer value. */ void setFloatRegInt(PhysRegIndex reg_idx, uint64_t val) { // Remove the base Float reg dependency. @@ -174,78 +185,68 @@ class PhysRegFile DPRINTF(IEW, "RegFile: Setting float register %i to %lli\n", int(reg_idx), val); - floatRegFile[reg_idx].q = val; - } - - uint64_t readPC() - { - return pc; - } - - void setPC(uint64_t val) - { - pc = val; - } - - void setNextPC(uint64_t val) - { - npc = val; + if (reg_idx != TheISA::ZeroReg) + floatRegFile[reg_idx].q = val; } //Consider leaving this stuff and below in some implementation specific //file as opposed to the general register file. Or have a derived class. - MiscReg readMiscReg(int misc_reg) + MiscReg readMiscReg(int misc_reg, unsigned thread_id) { - // Dummy function for now. - // @todo: Fix this once proxy XC is used. - return 0; + return miscRegs[thread_id].readReg(misc_reg); } - Fault setMiscReg(int misc_reg, const MiscReg &val) + MiscReg readMiscRegWithEffect(int misc_reg, Fault &fault, + unsigned thread_id) { - // Dummy function for now. - // @todo: Fix this once proxy XC is used. 
- return NoFault; + return miscRegs[thread_id].readRegWithEffect(misc_reg, fault, + cpu->xcProxies[thread_id]); + } + + Fault setMiscReg(int misc_reg, const MiscReg &val, unsigned thread_id) + { + return miscRegs[thread_id].setReg(misc_reg, val); + } + + Fault setMiscRegWithEffect(int misc_reg, const MiscReg &val, + unsigned thread_id) + { + return miscRegs[thread_id].setRegWithEffect(misc_reg, val, + cpu->xcProxies[thread_id]); } #if FULL_SYSTEM int readIntrFlag() { return intrflag; } + /** Sets an interrupt flag. */ void setIntrFlag(int val) { intrflag = val; } #endif - // These should be private eventually, but will be public for now - // so that I can hack around the initregs issue. public: /** (signed) integer register file. */ - IntReg *intRegFile; + std::vector intRegFile; /** Floating point register file. */ - FloatReg *floatRegFile; + std::vector floatRegFile; /** Miscellaneous register file. */ - MiscRegFile miscRegs; - - /** Program counter. */ - Addr pc; - - /** Next-cycle program counter. */ - Addr npc; + MiscRegFile miscRegs[Impl::MaxThreads]; #if FULL_SYSTEM private: - // This is ISA specifc stuff; remove it eventually once ISAImpl is used -// IntReg palregs[NumIntRegs]; // PAL shadow registers int intrflag; // interrupt flag - bool pal_shadow; // using pal_shadow registers #endif private: + /** CPU pointer. */ FullCPU *cpu; public: + /** Sets the CPU pointer. */ void setCPU(FullCPU *cpu_ptr) { cpu = cpu_ptr; } + /** Number of physical integer registers. */ unsigned numPhysicalIntRegs; + /** Number of physical floating point registers. 
*/ unsigned numPhysicalFloatRegs; }; @@ -255,11 +256,11 @@ PhysRegFile::PhysRegFile(unsigned _numPhysicalIntRegs, : numPhysicalIntRegs(_numPhysicalIntRegs), numPhysicalFloatRegs(_numPhysicalFloatRegs) { - intRegFile = new IntReg[numPhysicalIntRegs]; - floatRegFile = new FloatReg[numPhysicalFloatRegs]; + intRegFile.resize(numPhysicalIntRegs); + floatRegFile.resize(numPhysicalFloatRegs); - memset(intRegFile, 0, sizeof(*intRegFile)); - memset(floatRegFile, 0, sizeof(*floatRegFile)); + //memset(intRegFile, 0, sizeof(*intRegFile)); + //memset(floatRegFile, 0, sizeof(*floatRegFile)); } -#endif // __CPU_O3_CPU_REGFILE_HH__ +#endif diff --git a/cpu/o3/rename.cc b/cpu/o3/rename.cc index 6e9ee23da..4dc3bf6b2 100644 --- a/cpu/o3/rename.cc +++ b/cpu/o3/rename.cc @@ -30,4 +30,4 @@ #include "cpu/o3/alpha_impl.hh" #include "cpu/o3/rename_impl.hh" -template class SimpleRename; +template class DefaultRename; diff --git a/cpu/o3/rename.hh b/cpu/o3/rename.hh index 07b442964..d5beccde9 100644 --- a/cpu/o3/rename.hh +++ b/cpu/o3/rename.hh @@ -26,23 +26,27 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -// Todo: -// Fix up trap and barrier handling. -// May want to have different statuses to differentiate the different stall -// conditions. - -#ifndef __CPU_O3_CPU_SIMPLE_RENAME_HH__ -#define __CPU_O3_CPU_SIMPLE_RENAME_HH__ +#ifndef __CPU_O3_RENAME_HH__ +#define __CPU_O3_RENAME_HH__ #include #include "base/statistics.hh" #include "base/timebuf.hh" -// Will need rename maps for both the int reg file and fp reg file. -// Or change rename map class to handle both. (RegFile handles both.) +/** + * DefaultRename handles both single threaded and SMT rename. Its width is + * specified by the parameters; each cycle it tries to rename that many + * instructions. It holds onto the rename history of all instructions with + * destination registers, storing the arch. 
register, the new physical + * register, and the old physical register, to allow for undoing of mappings + * if squashing happens, or freeing up registers upon commit. Rename handles + * blocking if the ROB, IQ, or LSQ is going to be full. Rename also handles + * barriers, and does so by stalling on the instruction until the ROB is + * empty and there are no instructions in flight to the ROB. + */ template -class SimpleRename +class DefaultRename { public: // Typedefs from the Impl. @@ -51,25 +55,38 @@ class SimpleRename typedef typename Impl::FullCPU FullCPU; typedef typename Impl::Params Params; - typedef typename CPUPol::FetchStruct FetchStruct; + // Typedefs from the CPUPol typedef typename CPUPol::DecodeStruct DecodeStruct; typedef typename CPUPol::RenameStruct RenameStruct; typedef typename CPUPol::TimeStruct TimeStruct; - - // Typedefs from the CPUPol typedef typename CPUPol::FreeList FreeList; typedef typename CPUPol::RenameMap RenameMap; + // These are used only for initialization. + typedef typename CPUPol::IEW IEW; + typedef typename CPUPol::Commit Commit; // Typedefs from the ISA. typedef TheISA::RegIndex RegIndex; + // A deque is used to queue the instructions. Barrier insts must be + // added to the front of the deque, which is the only reason for using + // a deque instead of a queue. (Most other stages use a queue) + typedef std::list InstQueue; + public: - // Rename will block if ROB becomes full or issue queue becomes full, - // or there are no free registers to rename to. - // Only case where rename squashes is if IEW squashes. - enum Status { + /** Overall rename status. Used to determine if the CPU can deschedule + * itself due to a lack of activity. + */ + enum RenameStatus { + Active, + Inactive + }; + + /** Individual thread status. */ + enum ThreadStatus { Running, Idle, + StartSquash, Squashing, Blocked, Unblocking, @@ -77,86 +94,191 @@ class SimpleRename }; private: - Status _status; + /** Rename status. 
*/ + RenameStatus _status; + + /** Per-thread status. */ + ThreadStatus renameStatus[Impl::MaxThreads]; public: - SimpleRename(Params ¶ms); + /** DefaultRename constructor. */ + DefaultRename(Params *params); + /** Returns the name of rename. */ + std::string name() const; + + /** Registers statistics. */ void regStats(); + /** Sets CPU pointer. */ void setCPU(FullCPU *cpu_ptr); + /** Sets the main backwards communication time buffer pointer. */ void setTimeBuffer(TimeBuffer *tb_ptr); + /** Sets pointer to time buffer used to communicate to the next stage. */ void setRenameQueue(TimeBuffer *rq_ptr); + /** Sets pointer to time buffer coming from decode. */ void setDecodeQueue(TimeBuffer *dq_ptr); - void setRenameMap(RenameMap *rm_ptr); + /** Sets pointer to IEW stage. Used only for initialization. */ + void setIEWStage(IEW *iew_stage) + { iew_ptr = iew_stage; } - void setFreeList(FreeList *fl_ptr); - - void dumpHistory(); - - void tick(); - - void rename(); - - void squash(); + /** Sets pointer to commit stage. Used only for initialization. */ + void setCommitStage(Commit *commit_stage) + { commit_ptr = commit_stage; } private: - void block(); + /** Pointer to IEW stage. Used only for initialization. */ + IEW *iew_ptr; - inline void unblock(); + /** Pointer to commit stage. Used only for initialization. */ + Commit *commit_ptr; - void doSquash(); + public: + /** Initializes variables for the stage. */ + void initStage(); - void removeFromHistory(InstSeqNum inst_seq_num); + /** Sets pointer to list of active threads. */ + void setActiveThreads(std::list *at_ptr); - inline void renameSrcRegs(DynInstPtr &inst); + /** Sets pointer to rename maps (per-thread structures). */ + void setRenameMap(RenameMap rm_ptr[Impl::MaxThreads]); - inline void renameDestRegs(DynInstPtr &inst); + /** Sets pointer to the free list. */ + void setFreeList(FreeList *fl_ptr); - inline int calcFreeROBEntries(); + /** Sets pointer to the scoreboard. 
*/ + void setScoreboard(Scoreboard *_scoreboard); - inline int calcFreeIQEntries(); + /** Squashes all instructions in a thread. */ + void squash(unsigned tid); - /** Holds the previous information for each rename. - * Note that often times the inst may have been deleted, so only access - * the pointer for the address and do not dereference it. + /** Ticks rename, which processes all input signals and attempts to rename + * as many instructions as possible. + */ + void tick(); + + /** Debugging function used to dump history buffer of renamings. */ + void dumpHistory(); + + private: + /** Determines what to do based on rename's current status. + * @param status_change rename() sets this variable if there was a status + * change (ie switching from blocking to unblocking). + * @param tid Thread id to rename instructions from. + */ + void rename(bool &status_change, unsigned tid); + + /** Renames instructions for the given thread. Also handles serializing + * instructions. + */ + void renameInsts(unsigned tid); + + /** Inserts unused instructions from a given thread into the skid buffer, + * to be renamed once rename unblocks. + */ + void skidInsert(unsigned tid); + + /** Separates instructions from decode into individual lists of instructions + * sorted by thread. + */ + void sortInsts(); + + /** Returns if all of the skid buffers are empty. */ + bool skidsEmpty(); + + /** Updates overall rename status based on all of the threads' statuses. */ + void updateStatus(); + + /** Switches rename to blocking, and signals back that rename has become + * blocked. + * @return Returns true if there is a status change. + */ + bool block(unsigned tid); + + /** Switches rename to unblocking if the skid buffer is empty, and signals + * back that rename has unblocked. + * @return Returns true if there is a status change. + */ + bool unblock(unsigned tid); + + /** Executes actual squash, removing squashed instructions. 
*/ + void doSquash(unsigned tid); + + /** Removes a committed instruction's rename history. */ + void removeFromHistory(InstSeqNum inst_seq_num, unsigned tid); + + /** Renames the source registers of an instruction. */ + inline void renameSrcRegs(DynInstPtr &inst, unsigned tid); + + /** Renames the destination registers of an instruction. */ + inline void renameDestRegs(DynInstPtr &inst, unsigned tid); + + /** Calculates the number of free ROB entries for a specific thread. */ + inline int calcFreeROBEntries(unsigned tid); + + /** Calculates the number of free IQ entries for a specific thread. */ + inline int calcFreeIQEntries(unsigned tid); + + /** Calculates the number of free LSQ entries for a specific thread. */ + inline int calcFreeLSQEntries(unsigned tid); + + /** Returns the number of valid instructions coming from decode. */ + unsigned validInsts(); + + /** Reads signals telling rename to block/unblock. */ + void readStallSignals(unsigned tid); + + /** Checks if any stages are telling rename to block. */ + bool checkStall(unsigned tid); + + void readFreeEntries(unsigned tid); + + bool checkSignalsAndUpdate(unsigned tid); + + /** Either serializes on the next instruction available in the InstQueue, + * or records that it must serialize on the next instruction to enter + * rename. + * @param inst_list The list of younger, unprocessed instructions for the + * thread that has the serializeAfter instruction. + * @param tid The thread id. + */ + void serializeAfter(InstQueue &inst_list, unsigned tid); + + /** Holds the information for each destination register rename. It holds + * the instruction's sequence number, the arch register, the old physical + * register for that arch. register, and the new physical register. 
*/ struct RenameHistory { RenameHistory(InstSeqNum _instSeqNum, RegIndex _archReg, PhysRegIndex _newPhysReg, PhysRegIndex _prevPhysReg) : instSeqNum(_instSeqNum), archReg(_archReg), - newPhysReg(_newPhysReg), prevPhysReg(_prevPhysReg), - placeHolder(false) - { - } - - /** Constructor used specifically for cases where a place holder - * rename history entry is being made. - */ - RenameHistory(InstSeqNum _instSeqNum) - : instSeqNum(_instSeqNum), archReg(0), newPhysReg(0), - prevPhysReg(0), placeHolder(true) + newPhysReg(_newPhysReg), prevPhysReg(_prevPhysReg) { } + /** The sequence number of the instruction that renamed. */ InstSeqNum instSeqNum; + /** The architectural register index that was renamed. */ RegIndex archReg; + /** The new physical register that the arch. register is renamed to. */ PhysRegIndex newPhysReg; + /** The old physical register that the arch. register was renamed to. */ PhysRegIndex prevPhysReg; - bool placeHolder; }; - std::list historyBuffer; + /** A per-thread list of all destination register renames, used to either + * undo rename mappings or free old physical registers. + */ + std::list historyBuffer[Impl::MaxThreads]; - /** CPU interface. */ + /** Pointer to CPU. */ FullCPU *cpu; - // Interfaces to objects outside of rename. - /** Time buffer interface. */ + /** Pointer to main time buffer used for backwards communication. */ TimeBuffer *timeBuffer; /** Wire to get IEW's output from backwards time buffer. */ @@ -166,7 +288,6 @@ class SimpleRename typename TimeBuffer::wire fromCommit; /** Wire to write infromation heading to previous stages. */ - // Might not be the best name as not only decode will read it. typename TimeBuffer::wire toDecode; /** Rename instruction queue. */ @@ -181,15 +302,71 @@ class SimpleRename /** Wire to get decode's output from decode queue. */ typename TimeBuffer::wire fromDecode; + /** Queue of all instructions coming from decode this cycle. 
*/ + InstQueue insts[Impl::MaxThreads]; + /** Skid buffer between rename and decode. */ - std::queue skidBuffer; + InstQueue skidBuffer[Impl::MaxThreads]; /** Rename map interface. */ - SimpleRenameMap *renameMap; + RenameMap *renameMap[Impl::MaxThreads]; /** Free list interface. */ FreeList *freeList; + /** Pointer to the list of active threads. */ + std::list *activeThreads; + + /** Pointer to the scoreboard. */ + Scoreboard *scoreboard; + + /** Count of instructions in progress that have been sent off to the IQ + * and ROB, but are not yet included in their occupancy counts. + */ + int instsInProgress[Impl::MaxThreads]; + + /** Variable that tracks if decode has written to the time buffer this + * cycle. Used to tell CPU if there is activity this cycle. + */ + bool wroteToTimeBuffer; + + /** Structures whose free entries impact the amount of instructions that + * can be renamed. + */ + struct FreeEntries { + unsigned iqEntries; + unsigned lsqEntries; + unsigned robEntries; + }; + + /** Per-thread tracking of the number of free entries of back-end + * structures. + */ + FreeEntries freeEntries[Impl::MaxThreads]; + + /** Records if the ROB is empty. In SMT mode the ROB may be dynamically + * partitioned between threads, so the ROB must tell rename when it is + * empty. + */ + bool emptyROB[Impl::MaxThreads]; + + /** Source of possible stalls. */ + struct Stalls { + bool iew; + bool commit; + }; + + /** Tracks which stages are telling decode to stall. */ + Stalls stalls[Impl::MaxThreads]; + + /** The barrier instruction that rename has stalled on. */ + DynInstPtr barrierInst[Impl::MaxThreads]; + + /** Records if rename needs to serialize on the next instruction for any + * thread. + */ + bool serializeOnNextInst[Impl::MaxThreads]; + /** Delay between iew and rename, in ticks. */ int iewToRenameDelay; @@ -207,27 +384,68 @@ class SimpleRename */ unsigned commitWidth; - /** The instruction that rename is currently on. 
It needs to have - * persistent state so that when a stall occurs in the middle of a - * group of instructions, it can restart at the proper instruction. + /** The index of the instruction in the time buffer to IEW that rename is + * currently using. */ - unsigned numInst; + unsigned toIEWIndex; + /** Whether or not rename needs to block this cycle. */ + bool blockThisCycle; + + /** The number of threads active in rename. */ + unsigned numThreads; + + /** The maximum skid buffer size. */ + unsigned skidBufferMax; + + /** Enum to record the source of a structure full stall. Can come from + * either ROB, IQ, LSQ, and it is priortized in that order. + */ + enum FullSource { + ROB, + IQ, + LSQ, + NONE + }; + + /** Function used to increment the stat that corresponds to the source of + * the stall. + */ + inline void incrFullStat(const FullSource &source); + + /** Stat for total number of cycles spent squashing. */ Stats::Scalar<> renameSquashCycles; + /** Stat for total number of cycles spent idle. */ Stats::Scalar<> renameIdleCycles; + /** Stat for total number of cycles spent blocking. */ Stats::Scalar<> renameBlockCycles; + /** Stat for total number of cycles spent stalling for a barrier. */ + Stats::Scalar<> renameBarrierCycles; + /** Stat for total number of cycles spent running normally. */ + Stats::Scalar<> renameRunCycles; + /** Stat for total number of cycles spent unblocking. */ Stats::Scalar<> renameUnblockCycles; + /** Stat for total number of renamed instructions. */ Stats::Scalar<> renameRenamedInsts; + /** Stat for total number of squashed instructions that rename discards. */ Stats::Scalar<> renameSquashedInsts; + /** Stat for total number of times that the ROB starts a stall in rename. */ Stats::Scalar<> renameROBFullEvents; + /** Stat for total number of times that the IQ starts a stall in rename. */ Stats::Scalar<> renameIQFullEvents; + /** Stat for total number of times that the LSQ starts a stall in rename. 
*/ + Stats::Scalar<> renameLSQFullEvents; + /** Stat for total number of times that rename runs out of free registers + * to use to rename. */ Stats::Scalar<> renameFullRegistersEvents; + /** Stat for total number of renamed destination registers. */ Stats::Scalar<> renameRenamedOperands; + /** Stat for total number of source register rename lookups. */ Stats::Scalar<> renameRenameLookups; - Stats::Scalar<> renameHBPlaceHolders; + /** Stat for total number of committed renaming mappings. */ Stats::Scalar<> renameCommittedMaps; + /** Stat for total number of mappings that were undone due to a squash. */ Stats::Scalar<> renameUndoneMaps; - Stats::Scalar<> renameValidUndoneMaps; }; -#endif // __CPU_O3_CPU_SIMPLE_RENAME_HH__ +#endif // __CPU_O3_RENAME_HH__ diff --git a/cpu/o3/rename_impl.hh b/cpu/o3/rename_impl.hh index 2068b36ab..441118ef1 100644 --- a/cpu/o3/rename_impl.hh +++ b/cpu/o3/rename_impl.hh @@ -31,21 +31,51 @@ #include "config/full_system.hh" #include "cpu/o3/rename.hh" +using namespace std; + template -SimpleRename::SimpleRename(Params ¶ms) - : iewToRenameDelay(params.iewToRenameDelay), - decodeToRenameDelay(params.decodeToRenameDelay), - commitToRenameDelay(params.commitToRenameDelay), - renameWidth(params.renameWidth), - commitWidth(params.commitWidth), - numInst(0) +DefaultRename::DefaultRename(Params *params) + : iewToRenameDelay(params->iewToRenameDelay), + decodeToRenameDelay(params->decodeToRenameDelay), + commitToRenameDelay(params->commitToRenameDelay), + renameWidth(params->renameWidth), + commitWidth(params->commitWidth), + numThreads(params->numberOfThreads) { - _status = Idle; + _status = Inactive; + + for (int i=0; i< numThreads; i++) { + renameStatus[i] = Idle; + + freeEntries[i].iqEntries = 0; + freeEntries[i].lsqEntries = 0; + freeEntries[i].robEntries = 0; + + stalls[i].iew = false; + stalls[i].commit = false; + barrierInst[i] = NULL; + + instsInProgress[i] = 0; + + emptyROB[i] = true; + + serializeOnNextInst[i] = false; + } + + // @todo: 
Make into a parameter. + skidBufferMax = (2 * (iewToRenameDelay * params->decodeWidth)) + renameWidth; +} + +template +std::string +DefaultRename::name() const +{ + return cpu->name() + ".rename"; } template void -SimpleRename::regStats() +DefaultRename::regStats() { renameSquashCycles .name(name() + ".renameSquashCycles") @@ -59,6 +89,14 @@ SimpleRename::regStats() .name(name() + ".renameBlockCycles") .desc("Number of cycles rename is blocking") .prereq(renameBlockCycles); + renameBarrierCycles + .name(name() + ".renameBarrierCycles") + .desc("Number of cycles rename is blocking due to a barrier stall") + .prereq(renameBarrierCycles); + renameRunCycles + .name(name() + ".renameRunCycles") + .desc("Number of cycles rename is running") + .prereq(renameIdleCycles); renameUnblockCycles .name(name() + ".renameUnblockCycles") .desc("Number of cycles rename is unblocking") @@ -73,12 +111,16 @@ SimpleRename::regStats() .prereq(renameSquashedInsts); renameROBFullEvents .name(name() + ".renameROBFullEvents") - .desc("Number of times rename has considered the ROB 'full'") + .desc("Number of times rename has blocked due to ROB full") .prereq(renameROBFullEvents); renameIQFullEvents .name(name() + ".renameIQFullEvents") - .desc("Number of times rename has considered the IQ 'full'") + .desc("Number of times rename has blocked due to IQ full") .prereq(renameIQFullEvents); + renameLSQFullEvents + .name(name() + ".renameLSQFullEvents") + .desc("Number of times rename has blocked due to LSQ full") + .prereq(renameLSQFullEvents); renameFullRegistersEvents .name(name() + ".renameFullRegisterEvents") .desc("Number of times there has been no free registers") @@ -91,10 +133,6 @@ SimpleRename::regStats() .name(name() + ".renameRenameLookups") .desc("Number of register rename lookups that rename has made") .prereq(renameRenameLookups); - renameHBPlaceHolders - .name(name() + ".renameHBPlaceHolders") - .desc("Number of place holders added to the history buffer") - 
.prereq(renameHBPlaceHolders); renameCommittedMaps .name(name() + ".renameCommittedMaps") .desc("Number of HB maps that are committed") @@ -103,25 +141,21 @@ SimpleRename::regStats() .name(name() + ".renameUndoneMaps") .desc("Number of HB maps that are undone due to squashing") .prereq(renameUndoneMaps); - renameValidUndoneMaps - .name(name() + ".renameValidUndoneMaps") - .desc("Number of HB maps that are undone, and are not place holders") - .prereq(renameValidUndoneMaps); } template void -SimpleRename::setCPU(FullCPU *cpu_ptr) +DefaultRename::setCPU(FullCPU *cpu_ptr) { - DPRINTF(Rename, "Rename: Setting CPU pointer.\n"); + DPRINTF(Rename, "Setting CPU pointer.\n"); cpu = cpu_ptr; } template void -SimpleRename::setTimeBuffer(TimeBuffer *tb_ptr) +DefaultRename::setTimeBuffer(TimeBuffer *tb_ptr) { - DPRINTF(Rename, "Rename: Setting time buffer pointer.\n"); + DPRINTF(Rename, "Setting time buffer pointer.\n"); timeBuffer = tb_ptr; // Setup wire to read information from time buffer, from IEW stage. @@ -136,9 +170,9 @@ SimpleRename::setTimeBuffer(TimeBuffer *tb_ptr) template void -SimpleRename::setRenameQueue(TimeBuffer *rq_ptr) +DefaultRename::setRenameQueue(TimeBuffer *rq_ptr) { - DPRINTF(Rename, "Rename: Setting rename queue pointer.\n"); + DPRINTF(Rename, "Setting rename queue pointer.\n"); renameQueue = rq_ptr; // Setup wire to write information to future stages. @@ -147,9 +181,9 @@ SimpleRename::setRenameQueue(TimeBuffer *rq_ptr) template void -SimpleRename::setDecodeQueue(TimeBuffer *dq_ptr) +DefaultRename::setDecodeQueue(TimeBuffer *dq_ptr) { - DPRINTF(Rename, "Rename: Setting decode queue pointer.\n"); + DPRINTF(Rename, "Setting decode queue pointer.\n"); decodeQueue = dq_ptr; // Setup wire to get information from decode. 
@@ -158,214 +192,670 @@ SimpleRename::setDecodeQueue(TimeBuffer *dq_ptr) template void -SimpleRename::setRenameMap(RenameMap *rm_ptr) +DefaultRename::initStage() { - DPRINTF(Rename, "Rename: Setting rename map pointer.\n"); - renameMap = rm_ptr; + for (int tid=0; tid < numThreads; tid++) { + freeEntries[tid].iqEntries = iew_ptr->instQueue.numFreeEntries(tid); + freeEntries[tid].lsqEntries = iew_ptr->ldstQueue.numFreeEntries(tid); + freeEntries[tid].robEntries = commit_ptr->numROBFreeEntries(tid); + emptyROB[tid] = true; + } + + // Clear these pointers so they are not accidentally used in + // non-initialization code. + iew_ptr = NULL; + commit_ptr = NULL; +} + +template +void +DefaultRename::setActiveThreads(list *at_ptr) +{ + DPRINTF(Rename, "Setting active threads list pointer.\n"); + activeThreads = at_ptr; +} + + +template +void +DefaultRename::setRenameMap(RenameMap rm_ptr[]) +{ + DPRINTF(Rename, "Setting rename map pointers.\n"); + + for (int i=0; i void -SimpleRename::setFreeList(FreeList *fl_ptr) +DefaultRename::setFreeList(FreeList *fl_ptr) { - DPRINTF(Rename, "Rename: Setting free list pointer.\n"); + DPRINTF(Rename, "Setting free list pointer.\n"); freeList = fl_ptr; } +template +void +DefaultRename::setScoreboard(Scoreboard *_scoreboard) +{ + DPRINTF(Rename, "Setting scoreboard pointer.\n"); + scoreboard = _scoreboard; +} + template void -SimpleRename::dumpHistory() +DefaultRename::squash(unsigned tid) { - typename list::iterator buf_it = historyBuffer.begin(); + DPRINTF(Rename, "[tid:%u]: Squashing instructions.\n",tid); - while (buf_it != historyBuffer.end()) - { - cprintf("Seq num: %i\nArch reg: %i New phys reg: %i Old phys " - "reg: %i\n", (*buf_it).instSeqNum, (int)(*buf_it).archReg, - (int)(*buf_it).newPhysReg, (int)(*buf_it).prevPhysReg); + // Clear the stall signal if rename was blocked or unblocking before. 
+ // If it still needs to block, the blocking should happen the next + // cycle and there should be space to hold everything due to the squash. + if (renameStatus[tid] == Blocked || + renameStatus[tid] == Unblocking || + renameStatus[tid] == BarrierStall) { +#if !FULL_SYSTEM + // In syscall emulation, we can have both a block and a squash due + // to a syscall in the same cycle. This would cause both signals to + // be high. This shouldn't happen in full system. + if (toDecode->renameBlock[tid]) { + toDecode->renameBlock[tid] = 0; + } else { + toDecode->renameUnblock[tid] = 1; + } +#else + toDecode->renameUnblock[tid] = 1; +#endif + barrierInst[tid] = NULL; + } - buf_it++; + // Set the status to Squashing. + renameStatus[tid] = Squashing; + + // Clear the skid buffer in case it has any data in it. + unsigned squashCount = 0; + + for (int i=0; isize; i++) { + if (fromDecode->insts[i]->threadNumber == tid) { + fromDecode->insts[i]->squashed = true; + wroteToTimeBuffer = true; + squashCount++; + } + } + + insts[tid].clear(); + + // Clear the skid buffer in case it has any data in it. + skidBuffer[tid].clear(); + + doSquash(tid); +} + +template +void +DefaultRename::tick() +{ + // Rename will need to try to rename as many instructions as it + // has bandwidth, unless it is blocked. + + wroteToTimeBuffer = false; + + blockThisCycle = false; + + bool status_change = false; + + toIEWIndex = 0; + + sortInsts(); + + list::iterator threads = (*activeThreads).begin(); + + // Check stall and squash signals. 
+ while (threads != (*activeThreads).end()) { + unsigned tid = *threads++; + + DPRINTF(Rename, "Processing [tid:%i]\n", tid); + + status_change = checkSignalsAndUpdate(tid) || status_change; + + rename(status_change, tid); + } + + if (status_change) { + updateStatus(); + } + + if (wroteToTimeBuffer) { + DPRINTF(Activity, "Activity this cycle.\n"); + cpu->activityThisCycle(); + } + + threads = (*activeThreads).begin(); + + while (threads != (*activeThreads).end()) { + unsigned tid = *threads++; + + // If we committed this cycle then doneSeqNum will be > 0 + if (fromCommit->commitInfo[tid].doneSeqNum != 0 && + !fromCommit->commitInfo[tid].squash && + renameStatus[tid] != Squashing) { + + removeFromHistory(fromCommit->commitInfo[tid].doneSeqNum, + tid); + } + } + + // @todo: make into updateProgress function + for (int tid=0; tid < numThreads; tid++) { + instsInProgress[tid] -= fromIEW->iewInfo[tid].dispatched; + + assert(instsInProgress[tid] >=0); + } + +} + +template +void +DefaultRename::rename(bool &status_change, unsigned tid) +{ + // If status is Running or idle, + // call renameInsts() + // If status is Unblocking, + // buffer any instructions coming from decode + // continue trying to empty skid buffer + // check if stall conditions have passed + + if (renameStatus[tid] == Blocked) { + ++renameBlockCycles; + } else if (renameStatus[tid] == Squashing) { + ++renameSquashCycles; + } else if (renameStatus[tid] == BarrierStall) { + ++renameBarrierCycles; + } + + if (renameStatus[tid] == Running || + renameStatus[tid] == Idle) { + DPRINTF(Rename, "[tid:%u]: Not blocked, so attempting to run " + "stage.\n", tid); + + renameInsts(tid); + } else if (renameStatus[tid] == Unblocking) { + renameInsts(tid); + + ++renameUnblockCycles; + + if (validInsts()) { + // Add the current inputs to the skid buffer so they can be + // reprocessed when this stage unblocks. 
+ skidInsert(tid); + } + + // If we switched over to blocking, then there's a potential for + // an overall status change. + status_change = unblock(tid) || status_change || blockThisCycle; } } template void -SimpleRename::block() +DefaultRename::renameInsts(unsigned tid) { - DPRINTF(Rename, "Rename: Blocking.\n"); - // Set status to Blocked. - _status = Blocked; + // Instructions can be either in the skid buffer or the queue of + // instructions coming from decode, depending on the status. + int insts_available = renameStatus[tid] == Unblocking ? + skidBuffer[tid].size() : insts[tid].size(); + + // Check the decode queue to see if instructions are available. + // If there are no available instructions to rename, then do nothing. + if (insts_available == 0) { + DPRINTF(Rename, "[tid:%u]: Nothing to do, breaking out early.\n", + tid); + // Should I change status to idle? + ++renameIdleCycles; + return; + } else if (renameStatus[tid] == Unblocking) { + ++renameUnblockCycles; + } else if (renameStatus[tid] == Running) { + ++renameRunCycles; + } + + DynInstPtr inst; + + // Will have to do a different calculation for the number of free + // entries. + int free_rob_entries = calcFreeROBEntries(tid); + int free_iq_entries = calcFreeIQEntries(tid); + int free_lsq_entries = calcFreeLSQEntries(tid); + int min_free_entries = free_rob_entries; + + FullSource source = ROB; + + if (free_iq_entries < min_free_entries) { + min_free_entries = free_iq_entries; + source = IQ; + } + + if (free_lsq_entries < min_free_entries) { + min_free_entries = free_lsq_entries; + source = LSQ; + } + + // Check if there's any space left. 
+ if (min_free_entries <= 0) { + DPRINTF(Rename, "[tid:%u]: Blocking due to no free ROB/IQ/LSQ " + "entries.\n" + "ROB has %i free entries.\n" + "IQ has %i free entries.\n" + "LSQ has %i free entries.\n", + tid, + free_rob_entries, + free_iq_entries, + free_lsq_entries); + + blockThisCycle = true; + + block(tid); + + incrFullStat(source); + + return; + } else if (min_free_entries < insts_available) { + DPRINTF(Rename, "[tid:%u]: Will have to block this cycle." + "%i insts available, but only %i insts can be " + "renamed due to ROB/IQ/LSQ limits.\n", + tid, insts_available, min_free_entries); + + insts_available = min_free_entries; + + blockThisCycle = true; + + incrFullStat(source); + } + + InstQueue &insts_to_rename = renameStatus[tid] == Unblocking ? + skidBuffer[tid] : insts[tid]; + + DPRINTF(Rename, "[tid:%u]: %i available instructions to " + "send iew.\n", tid, insts_available); + + DPRINTF(Rename, "[tid:%u]: %i insts pipelining from Rename | %i insts " + "dispatched to IQ last cycle.\n", + tid, instsInProgress[tid], fromIEW->iewInfo[tid].dispatched); + + // Handle serializing the next instruction if necessary. + if (serializeOnNextInst[tid]) { + if (emptyROB[tid] && instsInProgress[tid] == 0) { + // ROB already empty; no need to serialize. 
+ serializeOnNextInst[tid] = false; + } else if (!insts_to_rename.empty()) { + insts_to_rename.front()->setSerializeBefore(); + } + } + + int renamed_insts = 0; + + while (insts_available > 0 && toIEWIndex < renameWidth) { + DPRINTF(Rename, "[tid:%u]: Sending instructions to IEW.\n", tid); + + assert(!insts_to_rename.empty()); + + inst = insts_to_rename.front(); + + insts_to_rename.pop_front(); + + //Use skidBuffer with oldest instructions + if (renameStatus[tid] == Unblocking) { + DPRINTF(Rename,"[tid:%u]: Removing [sn:%lli] PC:%#x from rename " + "skidBuffer\n", + tid, inst->seqNum, inst->readPC()); + } + + if (inst->isSquashed()) { + DPRINTF(Rename, "[tid:%u]: instruction %i with PC %#x is " + "squashed, skipping.\n", + tid, inst->seqNum, inst->threadNumber,inst->readPC()); + + ++renameSquashedInsts; + + // Decrement how many instructions are available. + --insts_available; + + continue; + } + + DPRINTF(Rename, "[tid:%u]: Processing instruction [sn:%lli] with " + "PC %#x.\n", + tid, inst->seqNum, inst->readPC()); + + // Handle serializeAfter/serializeBefore instructions. + // serializeAfter marks the next instruction as serializeBefore. + // serializeBefore makes the instruction wait in rename until the ROB + // is empty. + if (inst->isSerializeBefore() && !inst->isSerializeHandled()) { + DPRINTF(Rename, "Serialize before instruction encountered.\n"); + + if (!inst->isTempSerializeBefore()) + inst->setSerializeHandled(); + + // Change status over to BarrierStall so that other stages know + // what this is blocked on. + renameStatus[tid] = BarrierStall; + + barrierInst[tid] = inst; + + blockThisCycle = true; + + break; + } else if (inst->isSerializeAfter() && !inst->isSerializeHandled()) { + DPRINTF(Rename, "Serialize after instruction encountered.\n"); + + inst->setSerializeHandled(); + + serializeAfter(insts_to_rename, tid); + } + + // Check here to make sure there are enough destination registers + // to rename to. Otherwise block. 
+ if (renameMap[tid]->numFreeEntries() < inst->numDestRegs()) { + DPRINTF(Rename, "Blocking due to lack of free " + "physical registers to rename to.\n"); + blockThisCycle = true; + + ++renameFullRegistersEvents; + + break; + } + + renameSrcRegs(inst, inst->threadNumber); + + renameDestRegs(inst, inst->threadNumber); + + ++renamed_insts; + + // Put instruction in rename queue. + toIEW->insts[toIEWIndex] = inst; + ++(toIEW->size); + + // Increment which instruction we're on. + ++toIEWIndex; + + ++renameRenamedInsts; + + // Decrement how many instructions are available. + --insts_available; + } + + instsInProgress[tid] += renamed_insts; + + // If we wrote to the time buffer, record this. + if (toIEWIndex) { + wroteToTimeBuffer = true; + } + + // Check if there's any instructions left that haven't yet been renamed. + // If so then block. + if (insts_available) { + blockThisCycle = true; + } + + if (blockThisCycle) { + block(tid); + toDecode->renameUnblock[tid] = false; + } +} + +template +void +DefaultRename::skidInsert(unsigned tid) +{ + DynInstPtr inst = NULL; + + while (!insts[tid].empty()) { + inst = insts[tid].front(); + + insts[tid].pop_front(); + + assert(tid == inst->threadNumber); + + DPRINTF(Rename, "[tid:%u]: Inserting [sn:%lli] PC:%#x into Rename " + "skidBuffer\n", tid, inst->seqNum, inst->readPC()); + + skidBuffer[tid].push_back(inst); + } + + if (skidBuffer[tid].size() > skidBufferMax) + panic("Skidbuffer Exceeded Max Size"); +} + +template +void +DefaultRename::sortInsts() +{ + int insts_from_decode = fromDecode->size; + + for (int i=0; i < numThreads; i++) + assert(insts[i].empty()); + + for (int i = 0; i < insts_from_decode; ++i) { + DynInstPtr inst = fromDecode->insts[i]; + insts[inst->threadNumber].push_back(inst); + } +} + +template +bool +DefaultRename::skidsEmpty() +{ + list::iterator threads = (*activeThreads).begin(); + + while (threads != (*activeThreads).end()) { + if (!skidBuffer[*threads++].empty()) + return false; + } + + return true; +} 
+ +template +void +DefaultRename::updateStatus() +{ + bool any_unblocking = false; + + list::iterator threads = (*activeThreads).begin(); + + threads = (*activeThreads).begin(); + + while (threads != (*activeThreads).end()) { + unsigned tid = *threads++; + + if (renameStatus[tid] == Unblocking) { + any_unblocking = true; + break; + } + } + + // Rename will have activity if it's unblocking. + if (any_unblocking) { + if (_status == Inactive) { + _status = Active; + + DPRINTF(Activity, "Activating stage.\n"); + + cpu->activateStage(FullCPU::RenameIdx); + } + } else { + // If it's not unblocking, then rename will not have any internal + // activity. Switch it to inactive. + if (_status == Active) { + _status = Inactive; + DPRINTF(Activity, "Deactivating stage.\n"); + + cpu->deactivateStage(FullCPU::RenameIdx); + } + } +} + +template +bool +DefaultRename::block(unsigned tid) +{ + DPRINTF(Rename, "[tid:%u]: Blocking.\n", tid); // Add the current inputs onto the skid buffer, so they can be // reprocessed when this stage unblocks. - skidBuffer.push(*fromDecode); + skidInsert(tid); - // Note that this stage only signals previous stages to stall when - // it is the cause of the stall originates at this stage. Otherwise - // the previous stages are expected to check all possible stall signals. + // Only signal backwards to block if the previous stages do not think + // rename is already blocked. + if (renameStatus[tid] != Blocked) { + if (renameStatus[tid] != Unblocking) { + toDecode->renameBlock[tid] = true; + toDecode->renameUnblock[tid] = false; + wroteToTimeBuffer = true; + } + + // Rename can not go from BarrierStall to Blocked, otherwise it would + // not know to complete the barrier stall. + if (renameStatus[tid] != BarrierStall) { + // Set status to Blocked. 
+ renameStatus[tid] = Blocked; + return true; + } + } + + return false; } template -inline void -SimpleRename::unblock() +bool +DefaultRename::unblock(unsigned tid) { - DPRINTF(Rename, "Rename: Read instructions out of skid buffer this " - "cycle.\n"); - // Remove the now processed instructions from the skid buffer. - skidBuffer.pop(); + DPRINTF(Rename, "[tid:%u]: Trying to unblock.\n", tid); - // If there's still information in the skid buffer, then - // continue to tell previous stages to stall. They will be - // able to restart once the skid buffer is empty. - if (!skidBuffer.empty()) { - toDecode->renameInfo.stall = true; - } else { - DPRINTF(Rename, "Rename: Done unblocking.\n"); - _status = Running; + // Rename is done unblocking if the skid buffer is empty. + if (skidBuffer[tid].empty() && renameStatus[tid] != BarrierStall) { + + DPRINTF(Rename, "[tid:%u]: Done unblocking.\n", tid); + + toDecode->renameUnblock[tid] = true; + wroteToTimeBuffer = true; + + renameStatus[tid] = Running; + return true; } + + return false; } template void -SimpleRename::doSquash() +DefaultRename::doSquash(unsigned tid) { - typename list::iterator hb_it = historyBuffer.begin(); + typename list::iterator hb_it = historyBuffer[tid].begin(); - InstSeqNum squashed_seq_num = fromCommit->commitInfo.doneSeqNum; + InstSeqNum squashed_seq_num = fromCommit->commitInfo[tid].doneSeqNum; -#if FULL_SYSTEM - assert(!historyBuffer.empty()); -#else +//#if FULL_SYSTEM +// assert(!historyBuffer[tid].empty()); +//#else // After a syscall squashes everything, the history buffer may be empty // but the ROB may still be squashing instructions. - if (historyBuffer.empty()) { + if (historyBuffer[tid].empty()) { return; } -#endif // FULL_SYSTEM +//#endif // FULL_SYSTEM // Go through the most recent instructions, undoing the mappings // they did and freeing up the registers. 
- while ((*hb_it).instSeqNum > squashed_seq_num) - { - assert(hb_it != historyBuffer.end()); + while (!historyBuffer[tid].empty() && + (*hb_it).instSeqNum > squashed_seq_num) { + assert(hb_it != historyBuffer[tid].end()); - DPRINTF(Rename, "Rename: Removing history entry with sequence " - "number %i.\n", (*hb_it).instSeqNum); + DPRINTF(Rename, "[tid:%u]: Removing history entry with sequence " + "number %i.\n", tid, (*hb_it).instSeqNum); - // If it's not simply a place holder, then add the registers. - if (!(*hb_it).placeHolder) { - // Tell the rename map to set the architected register to the - // previous physical register that it was renamed to. - renameMap->setEntry(hb_it->archReg, hb_it->prevPhysReg); + // Tell the rename map to set the architected register to the + // previous physical register that it was renamed to. + renameMap[tid]->setEntry(hb_it->archReg, hb_it->prevPhysReg); - // Put the renamed physical register back on the free list. - freeList->addReg(hb_it->newPhysReg); + // Put the renamed physical register back on the free list. + freeList->addReg(hb_it->newPhysReg); - ++renameValidUndoneMaps; - } - - historyBuffer.erase(hb_it++); + historyBuffer[tid].erase(hb_it++); ++renameUndoneMaps; } } -template -void -SimpleRename::squash() -{ - DPRINTF(Rename, "Rename: Squashing instructions.\n"); - // Set the status to Squashing. - _status = Squashing; - - numInst = 0; - - // Clear the skid buffer in case it has any data in it. 
- while (!skidBuffer.empty()) - { - skidBuffer.pop(); - } - - doSquash(); -} - template void -SimpleRename::removeFromHistory(InstSeqNum inst_seq_num) +DefaultRename::removeFromHistory(InstSeqNum inst_seq_num, unsigned tid) { - DPRINTF(Rename, "Rename: Removing a committed instruction from the " - "history buffer, until sequence number %lli.\n", inst_seq_num); - typename list::iterator hb_it = historyBuffer.end(); + DPRINTF(Rename, "[tid:%u]: Removing a committed instruction from the " + "history buffer %u (size=%i), until [sn:%lli].\n", + tid, tid, historyBuffer[tid].size(), inst_seq_num); + + typename list::iterator hb_it = historyBuffer[tid].end(); --hb_it; - if (hb_it->instSeqNum > inst_seq_num) { - DPRINTF(Rename, "Rename: Old sequence number encountered. Ensure " - "that a syscall happened recently.\n"); + if (historyBuffer[tid].empty()) { + DPRINTF(Rename, "[tid:%u]: History buffer is empty.\n", tid); + return; + } else if (hb_it->instSeqNum > inst_seq_num) { + DPRINTF(Rename, "[tid:%u]: Old sequence number encountered. Ensure " + "that a syscall happened recently.\n", tid); return; } - while ((*hb_it).instSeqNum != inst_seq_num) - { - // Make sure we haven't gone off the end of the list. - assert(hb_it != historyBuffer.end()); + // Commit all the renames up until (and including) the committed sequence + // number. Some or even all of the committed instructions may not have + // rename histories if they did not have destination registers that were + // renamed. + while (!historyBuffer[tid].empty() && + hb_it != historyBuffer[tid].end() && + (*hb_it).instSeqNum <= inst_seq_num) { - // In theory instructions at the end of the history buffer - // should be older than the instruction being removed, which - // means they will have a lower sequence number. Also the - // instruction being removed from the history really should - // be the last instruction in the list, as it is the instruction - // that was just committed that is being removed. 
- assert(hb_it->instSeqNum < inst_seq_num); - DPRINTF(Rename, "Rename: Freeing up older rename of reg %i, sequence" + DPRINTF(Rename, "[tid:%u]: Freeing up older rename of reg %i, sequence" " number %i.\n", - (*hb_it).prevPhysReg, (*hb_it).instSeqNum); + tid, (*hb_it).prevPhysReg, (*hb_it).instSeqNum); - if (!(*hb_it).placeHolder) { - freeList->addReg((*hb_it).prevPhysReg); - ++renameCommittedMaps; - } - - historyBuffer.erase(hb_it--); - } - - // Finally free up the previous register of the finished instruction - // itself. - if (!(*hb_it).placeHolder) { - freeList->addReg(hb_it->prevPhysReg); + freeList->addReg((*hb_it).prevPhysReg); ++renameCommittedMaps; - } - historyBuffer.erase(hb_it); + historyBuffer[tid].erase(hb_it--); + } } template inline void -SimpleRename::renameSrcRegs(DynInstPtr &inst) +DefaultRename::renameSrcRegs(DynInstPtr &inst,unsigned tid) { + assert(renameMap[tid] != 0); + unsigned num_src_regs = inst->numSrcRegs(); // Get the architectual register numbers from the source and // destination operands, and redirect them to the right register. // Will need to mark dependencies though. - for (int src_idx = 0; src_idx < num_src_regs; src_idx++) - { + for (int src_idx = 0; src_idx < num_src_regs; src_idx++) { RegIndex src_reg = inst->srcRegIdx(src_idx); // Look up the source registers to get the phys. register they've // been renamed to, and set the sources to those registers. - PhysRegIndex renamed_reg = renameMap->lookup(src_reg); + PhysRegIndex renamed_reg = renameMap[tid]->lookup(src_reg); - DPRINTF(Rename, "Rename: Looking up arch reg %i, got " - "physical reg %i.\n", (int)src_reg, (int)renamed_reg); + DPRINTF(Rename, "[tid:%u]: Looking up arch reg %i, got " + "physical reg %i.\n", tid, (int)src_reg, + (int)renamed_reg); inst->renameSrcReg(src_idx, renamed_reg); - // Either incorporate it into the info passed back, - // or make another function call to see if that register is - // ready or not. 
- if (renameMap->isReady(renamed_reg)) { - DPRINTF(Rename, "Rename: Register is ready.\n"); + // See if the register is ready or not. + if (scoreboard->getReg(renamed_reg) == true) { + DPRINTF(Rename, "[tid:%u]: Register is ready.\n", tid); inst->markSrcRegReady(src_idx); } @@ -376,379 +866,341 @@ SimpleRename::renameSrcRegs(DynInstPtr &inst) template inline void -SimpleRename::renameDestRegs(DynInstPtr &inst) +DefaultRename::renameDestRegs(DynInstPtr &inst,unsigned tid) { - typename SimpleRenameMap::RenameInfo rename_result; + typename RenameMap::RenameInfo rename_result; unsigned num_dest_regs = inst->numDestRegs(); - // If it's an instruction with no destination registers, then put - // a placeholder within the history buffer. It might be better - // to not put it in the history buffer at all (other than branches, - // which always need at least a place holder), and differentiate - // between instructions with and without destination registers - // when getting from commit the instructions that committed. - if (num_dest_regs == 0) { - RenameHistory hb_entry(inst->seqNum); + // Rename the destination registers. + for (int dest_idx = 0; dest_idx < num_dest_regs; dest_idx++) { + RegIndex dest_reg = inst->destRegIdx(dest_idx); - historyBuffer.push_front(hb_entry); + // Get the physical register that the destination will be + // renamed to. + rename_result = renameMap[tid]->rename(dest_reg); - DPRINTF(Rename, "Rename: Adding placeholder instruction to " - "history buffer, sequence number %lli.\n", - inst->seqNum); + //Mark Scoreboard entry as not ready + scoreboard->unsetReg(rename_result.first); - ++renameHBPlaceHolders; - } else { + DPRINTF(Rename, "[tid:%u]: Renaming arch reg %i to physical " + "reg %i.\n", tid, (int)dest_reg, + (int)rename_result.first); - // Rename the destination registers. 
- for (int dest_idx = 0; dest_idx < num_dest_regs; dest_idx++) - { - RegIndex dest_reg = inst->destRegIdx(dest_idx); + // Record the rename information so that a history can be kept. + RenameHistory hb_entry(inst->seqNum, dest_reg, + rename_result.first, + rename_result.second); - // Get the physical register that the destination will be - // renamed to. - rename_result = renameMap->rename(dest_reg); + historyBuffer[tid].push_front(hb_entry); - DPRINTF(Rename, "Rename: Renaming arch reg %i to physical " - "reg %i.\n", (int)dest_reg, - (int)rename_result.first); + DPRINTF(Rename, "[tid:%u]: Adding instruction to history buffer, " + "[sn:%lli].\n",tid, + (*historyBuffer[tid].begin()).instSeqNum); - // Record the rename information so that a history can be kept. - RenameHistory hb_entry(inst->seqNum, dest_reg, - rename_result.first, - rename_result.second); + // Tell the instruction to rename the appropriate destination + // register (dest_idx) to the new physical register + // (rename_result.first), and record the previous physical + // register that the same logical register was renamed to + // (rename_result.second). + inst->renameDestReg(dest_idx, + rename_result.first, + rename_result.second); - historyBuffer.push_front(hb_entry); - - DPRINTF(Rename, "Rename: Adding instruction to history buffer, " - "sequence number %lli.\n", - (*historyBuffer.begin()).instSeqNum); - - // Tell the instruction to rename the appropriate destination - // register (dest_idx) to the new physical register - // (rename_result.first), and record the previous physical - // register that the same logical register was renamed to - // (rename_result.second). 
- inst->renameDestReg(dest_idx, - rename_result.first, - rename_result.second); - - ++renameRenamedOperands; - } + ++renameRenamedOperands; } } template inline int -SimpleRename::calcFreeROBEntries() +DefaultRename::calcFreeROBEntries(unsigned tid) { - return fromCommit->commitInfo.freeROBEntries - - renameWidth * iewToRenameDelay; + int num_free = freeEntries[tid].robEntries - + (instsInProgress[tid] - fromIEW->iewInfo[tid].dispatched); + + //DPRINTF(Rename,"[tid:%i]: %i rob free\n",tid,num_free); + + return num_free; } template inline int -SimpleRename::calcFreeIQEntries() +DefaultRename::calcFreeIQEntries(unsigned tid) { - return fromIEW->iewInfo.freeIQEntries - renameWidth * iewToRenameDelay; + int num_free = freeEntries[tid].iqEntries - + (instsInProgress[tid] - fromIEW->iewInfo[tid].dispatched); + + //DPRINTF(Rename,"[tid:%i]: %i iq free\n",tid,num_free); + + return num_free; +} + +template +inline int +DefaultRename::calcFreeLSQEntries(unsigned tid) +{ + int num_free = freeEntries[tid].lsqEntries - + (instsInProgress[tid] - fromIEW->iewInfo[tid].dispatchedToLSQ); + + //DPRINTF(Rename,"[tid:%i]: %i lsq free\n",tid,num_free); + + return num_free; +} + +template +unsigned +DefaultRename::validInsts() +{ + unsigned inst_count = 0; + + for (int i=0; isize; i++) { + if (!fromDecode->insts[i]->squashed) + inst_count++; + } + + return inst_count; +} + +template +void +DefaultRename::readStallSignals(unsigned tid) +{ + if (fromIEW->iewBlock[tid]) { + stalls[tid].iew = true; + } + + if (fromIEW->iewUnblock[tid]) { + assert(stalls[tid].iew); + stalls[tid].iew = false; + } + + if (fromCommit->commitBlock[tid]) { + stalls[tid].commit = true; + } + + if (fromCommit->commitUnblock[tid]) { + assert(stalls[tid].commit); + stalls[tid].commit = false; + } +} + +template +bool +DefaultRename::checkStall(unsigned tid) +{ + bool ret_val = false; + + if (stalls[tid].iew) { + DPRINTF(Rename,"[tid:%i]: Stall from IEW stage detected.\n", tid); + ret_val = true; + } else if 
(stalls[tid].commit) { + DPRINTF(Rename,"[tid:%i]: Stall from Commit stage detected.\n", tid); + ret_val = true; + } else if (calcFreeROBEntries(tid) <= 0) { + DPRINTF(Rename,"[tid:%i]: Stall: ROB has 0 free entries.\n", tid); + ret_val = true; + } else if (calcFreeIQEntries(tid) <= 0) { + DPRINTF(Rename,"[tid:%i]: Stall: IQ has 0 free entries.\n", tid); + ret_val = true; + } else if (calcFreeLSQEntries(tid) <= 0) { + DPRINTF(Rename,"[tid:%i]: Stall: LSQ has 0 free entries.\n", tid); + ret_val = true; + } else if (renameMap[tid]->numFreeEntries() <= 0) { + DPRINTF(Rename,"[tid:%i]: Stall: RenameMap has 0 free entries.\n", tid); + ret_val = true; + } else if (renameStatus[tid] == BarrierStall && + (!emptyROB[tid] || instsInProgress[tid])) { + DPRINTF(Rename,"[tid:%i]: Stall: Barrier stall and ROB is not " + "empty.\n", + tid); + ret_val = true; + } + + return ret_val; +} + +template +void +DefaultRename::readFreeEntries(unsigned tid) +{ + bool updated = false; + if (fromIEW->iewInfo[tid].usedIQ) { + freeEntries[tid].iqEntries = + fromIEW->iewInfo[tid].freeIQEntries; + updated = true; + } + + if (fromIEW->iewInfo[tid].usedLSQ) { + freeEntries[tid].lsqEntries = + fromIEW->iewInfo[tid].freeLSQEntries; + updated = true; + } + + if (fromCommit->commitInfo[tid].usedROB) { + freeEntries[tid].robEntries = + fromCommit->commitInfo[tid].freeROBEntries; + emptyROB[tid] = fromCommit->commitInfo[tid].emptyROB; + updated = true; + } + + DPRINTF(Rename, "[tid:%i]: Free IQ: %i, Free ROB: %i, Free LSQ: %i\n", + tid, + freeEntries[tid].iqEntries, + freeEntries[tid].robEntries, + freeEntries[tid].lsqEntries); + + DPRINTF(Rename, "[tid:%i]: %i instructions not yet in ROB\n", + tid, instsInProgress[tid]); +} + +template +bool +DefaultRename::checkSignalsAndUpdate(unsigned tid) +{ + // Check if there's a squash signal, squash if there is + // Check stall signals, block if necessary. 
+ // If status was blocked + // check if stall conditions have passed + // if so then go to unblocking + // If status was Squashing + // check if squashing is not high. Switch to running this cycle. + // If status was barrier stall + // check if ROB is empty and no insts are in flight to the ROB + + readFreeEntries(tid); + readStallSignals(tid); + + if (fromCommit->commitInfo[tid].squash) { + DPRINTF(Rename, "[tid:%u]: Squashing instructions due to squash from " + "commit.\n", tid); + + squash(tid); + + return true; + } + + if (fromCommit->commitInfo[tid].robSquashing) { + DPRINTF(Rename, "[tid:%u]: ROB is still squashing.\n", tid); + + renameStatus[tid] = Squashing; + + return true; + } + + if (checkStall(tid)) { + return block(tid); + } + + if (renameStatus[tid] == Blocked) { + DPRINTF(Rename, "[tid:%u]: Done blocking, switching to unblocking.\n", + tid); + + renameStatus[tid] = Unblocking; + + unblock(tid); + + return true; + } + + if (renameStatus[tid] == Squashing) { + // Switch status to running if rename isn't being told to block or + // squash this cycle. + DPRINTF(Rename, "[tid:%u]: Done squashing, switching to running.\n", + tid); + + renameStatus[tid] = Running; + + return false; + } + + if (renameStatus[tid] == BarrierStall) { + // Stall ends once the ROB is free. + DPRINTF(Rename, "[tid:%u]: Done with barrier stall, switching to " + "unblocking.\n", tid); + + DynInstPtr barr_inst = barrierInst[tid]; + + renameStatus[tid] = Unblocking; + + unblock(tid); + + DPRINTF(Rename, "[tid:%u]: Processing instruction [%lli] with " + "PC %#x.\n", + tid, barr_inst->seqNum, barr_inst->readPC()); + + // Put instruction into queue here. + barr_inst->clearSerializeBefore(); + + if (!skidBuffer[tid].empty()) { + skidBuffer[tid].push_front(barr_inst); + } else { + insts[tid].push_front(barr_inst); + } + + DPRINTF(Rename, "[tid:%u]: Instruction must be processed by rename." 
+ " Adding to front of list.", tid); + + barrierInst[tid] = NULL; + + return true; + } + + // If we've reached this point, we have not gotten any signals that + // cause rename to change its status. Rename remains the same as before. + return false; } template void -SimpleRename::tick() +DefaultRename::serializeAfter(InstQueue &inst_list, + unsigned tid) { - // Rename will need to try to rename as many instructions as it - // has bandwidth, unless it is blocked. - - // Check if _status is BarrierStall. If so, then check if the number - // of free ROB entries is equal to the number of total ROB entries. - // Once equal then wake this stage up. Set status to unblocking maybe. - - if (_status != Blocked && _status != Squashing) { - DPRINTF(Rename, "Rename: Status is not blocked, will attempt to " - "run stage.\n"); - // Make sure that the skid buffer has something in it if the - // status is unblocking. - assert(_status == Unblocking ? !skidBuffer.empty() : 1); - - rename(); - - // If the status was unblocking, then instructions from the skid - // buffer were used. Remove those instructions and handle - // the rest of unblocking. - if (_status == Unblocking) { - ++renameUnblockCycles; - - if (fromDecode->size > 0) { - // Add the current inputs onto the skid buffer, so they can be - // reprocessed when this stage unblocks. - skidBuffer.push(*fromDecode); - } - - unblock(); - } - } else if (_status == Blocked) { - ++renameBlockCycles; - - // If stage is blocked and still receiving valid instructions, - // make sure to store them in the skid buffer. - if (fromDecode->size > 0) { - - block(); - - // Continue to tell previous stage to stall. - toDecode->renameInfo.stall = true; - } - - if (!fromIEW->iewInfo.stall && - !fromCommit->commitInfo.stall && - calcFreeROBEntries() > 0 && - calcFreeIQEntries() > 0 && - renameMap->numFreeEntries() > 0) { - - // Need to be sure to check all blocking conditions above. - // If they have cleared, then start unblocking. 
- DPRINTF(Rename, "Rename: Stall signals cleared, going to " - "unblock.\n"); - _status = Unblocking; - - // Continue to tell previous stage to block until this stage - // is done unblocking. - toDecode->renameInfo.stall = true; - } else { - // Otherwise no conditions have changed. Tell previous - // stage to continue blocking. - toDecode->renameInfo.stall = true; - } - - if (fromCommit->commitInfo.squash || - fromCommit->commitInfo.robSquashing) { - squash(); - return; - } - } else if (_status == Squashing) { - ++renameSquashCycles; - - if (fromCommit->commitInfo.squash) { - squash(); - } else if (!fromCommit->commitInfo.squash && - !fromCommit->commitInfo.robSquashing) { - - DPRINTF(Rename, "Rename: Done squashing, going to running.\n"); - _status = Running; - rename(); - } else { - doSquash(); - } - } - - // Ugly code, revamp all of the tick() functions eventually. - if (fromCommit->commitInfo.doneSeqNum != 0 && _status != Squashing) { -#if !FULL_SYSTEM - if (!fromCommit->commitInfo.squash) { - removeFromHistory(fromCommit->commitInfo.doneSeqNum); - } -#else - removeFromHistory(fromCommit->commitInfo.doneSeqNum); -#endif + if (inst_list.empty()) { + // Mark a bit to say that I must serialize on the next instruction. + serializeOnNextInst[tid] = true; + return; } + // Set the next instruction as serializing. + inst_list.front()->setSerializeBefore(); } -template +template +inline void +DefaultRename::incrFullStat(const FullSource &source) +{ + switch (source) { + case ROB: + ++renameROBFullEvents; + break; + case IQ: + ++renameIQFullEvents; + break; + case LSQ: + ++renameLSQFullEvents; + break; + default: + panic("Rename full stall stat should be incremented for a reason!"); + break; + } +} + +template void -SimpleRename::rename() +DefaultRename::dumpHistory() { - // Check if any of the stages ahead of rename are telling rename - // to squash. The squash() function will also take care of fixing up - // the rename map and the free list. 
- if (fromCommit->commitInfo.squash || - fromCommit->commitInfo.robSquashing) { - DPRINTF(Rename, "Rename: Receiving signal from Commit to squash.\n"); - squash(); - return; - } + typename list::iterator buf_it; - // Check if time buffer is telling this stage to stall. - if (fromIEW->iewInfo.stall || - fromCommit->commitInfo.stall) { - DPRINTF(Rename, "Rename: Receiving signal from IEW/Commit to " - "stall.\n"); - block(); - return; - } + for (int i = 0; i < numThreads; i++) { - // Check if the current status is squashing. If so, set its status - // to running and resume execution the next cycle. - if (_status == Squashing) { - DPRINTF(Rename, "Rename: Done squashing.\n"); - _status = Running; - return; - } + buf_it = historyBuffer[i].begin(); - // Check the decode queue to see if instructions are available. - // If there are no available instructions to rename, then do nothing. - // Or, if the stage is currently unblocking, then go ahead and run it. - if (fromDecode->size == 0 && _status != Unblocking) { - DPRINTF(Rename, "Rename: Nothing to do, breaking out early.\n"); - // Should I change status to idle? - return; - } + while (buf_it != historyBuffer[i].end()) { + cprintf("Seq num: %i\nArch reg: %i New phys reg: %i Old phys " + "reg: %i\n", (*buf_it).instSeqNum, (int)(*buf_it).archReg, + (int)(*buf_it).newPhysReg, (int)(*buf_it).prevPhysReg); - //////////////////////////////////// - // Actual rename part. - //////////////////////////////////// - - DynInstPtr inst; - - // If we're unblocking, then we may be in the middle of an instruction - // group. Subtract off numInst to get the proper number of instructions - // left. - int insts_available = _status == Unblocking ? - skidBuffer.front().size - numInst : - fromDecode->size; - - bool block_this_cycle = false; - - // Will have to do a different calculation for the number of free - // entries. 
Number of free entries recorded on this cycle - - // renameWidth * renameToDecodeDelay - int free_rob_entries = calcFreeROBEntries(); - int free_iq_entries = calcFreeIQEntries(); - int min_iq_rob = min(free_rob_entries, free_iq_entries); - - unsigned to_iew_index = 0; - - // Check if there's any space left. - if (min_iq_rob <= 0) { - DPRINTF(Rename, "Rename: Blocking due to no free ROB or IQ " - "entries.\n" - "Rename: ROB has %d free entries.\n" - "Rename: IQ has %d free entries.\n", - free_rob_entries, - free_iq_entries); - block(); - // Tell previous stage to stall. - toDecode->renameInfo.stall = true; - - if (free_rob_entries <= 0) { - ++renameROBFullEvents; - } else { - ++renameIQFullEvents; + buf_it++; } - - return; - } else if (min_iq_rob < insts_available) { - DPRINTF(Rename, "Rename: Will have to block this cycle. Only " - "%i insts can be renamed due to IQ/ROB limits.\n", - min_iq_rob); - - insts_available = min_iq_rob; - - block_this_cycle = true; - - if (free_rob_entries < free_iq_entries) { - ++renameROBFullEvents; - } else { - ++renameIQFullEvents; - } - } - - while (insts_available > 0) { - DPRINTF(Rename, "Rename: Sending instructions to iew.\n"); - - // Get the next instruction either from the skid buffer or the - // decode queue. - inst = _status == Unblocking ? skidBuffer.front().insts[numInst] : - fromDecode->insts[numInst]; - - if (inst->isSquashed()) { - DPRINTF(Rename, "Rename: instruction %i with PC %#x is " - "squashed, skipping.\n", - inst->seqNum, inst->readPC()); - - // Go to the next instruction. - ++numInst; - - ++renameSquashedInsts; - - // Decrement how many instructions are available. - --insts_available; - - continue; - } - - DPRINTF(Rename, "Rename: Processing instruction %i with PC %#x.\n", - inst->seqNum, inst->readPC()); - - // If it's a trap instruction, then it needs to wait here within - // rename until the ROB is empty. Needs a way to detect that the - // ROB is empty. Maybe an event? 
- // Would be nice if it could be avoided putting this into a - // specific stage and instead just put it into the AlphaFullCPU. - // Might not really be feasible though... - // (EXCB, TRAPB) - if (inst->isSerializing()) { - panic("Rename: Serializing instruction encountered.\n"); - DPRINTF(Rename, "Rename: Serializing instruction " - "encountered.\n"); - - // Change status over to BarrierStall so that other stages know - // what this is blocked on. - _status = BarrierStall; - - block_this_cycle = true; - - break; - } - - // Check here to make sure there are enough destination registers - // to rename to. Otherwise block. - if (renameMap->numFreeEntries() < inst->numDestRegs()) - { - DPRINTF(Rename, "Rename: Blocking due to lack of free " - "physical registers to rename to.\n"); - // Need some sort of event based on a register being freed. - - block_this_cycle = true; - - ++renameFullRegistersEvents; - - break; - } - - renameSrcRegs(inst); - - renameDestRegs(inst); - - // Put instruction in rename queue. - toIEW->insts[to_iew_index] = inst; - ++(toIEW->size); - - // Decrease the number of free ROB and IQ entries. - --free_rob_entries; - --free_iq_entries; - - // Increment which instruction we're on. - ++to_iew_index; - ++numInst; - - ++renameRenamedInsts; - - // Decrement how many instructions are available. - --insts_available; - } - - // Check if there's any instructions left that haven't yet been renamed. - // If so then block. - if (block_this_cycle) { - block(); - - toDecode->renameInfo.stall = true; - } else { - // If we had a successful rename and didn't have to exit early, then - // reset numInst so it will refer to the correct instruction on next - // run. - numInst = 0; } } diff --git a/cpu/o3/rename_map.cc b/cpu/o3/rename_map.cc index 10963f7de..8ba632e65 100644 --- a/cpu/o3/rename_map.cc +++ b/cpu/o3/rename_map.cc @@ -39,98 +39,94 @@ using namespace std; // determine if the register is a logical int, logical fp, physical int, // physical fp, etc. 
-SimpleRenameMap::SimpleRenameMap(unsigned _numLogicalIntRegs, - unsigned _numPhysicalIntRegs, - unsigned _numLogicalFloatRegs, - unsigned _numPhysicalFloatRegs, - unsigned _numMiscRegs, - RegIndex _intZeroReg, - RegIndex _floatZeroReg) - : numLogicalIntRegs(_numLogicalIntRegs), - numPhysicalIntRegs(_numPhysicalIntRegs), - numLogicalFloatRegs(_numLogicalFloatRegs), - numPhysicalFloatRegs(_numPhysicalFloatRegs), - numMiscRegs(_numMiscRegs), - intZeroReg(_intZeroReg), - floatZeroReg(_floatZeroReg) +SimpleRenameMap::~SimpleRenameMap() { - DPRINTF(Rename, "Rename: Creating rename map. Phys: %i / %i, Float: " - "%i / %i.\n", numLogicalIntRegs, numPhysicalIntRegs, + // Delete the rename maps as they were allocated with new. + //delete [] intRenameMap; + //delete [] floatRenameMap; +} + +void +SimpleRenameMap::init(unsigned _numLogicalIntRegs, + unsigned _numPhysicalIntRegs, + PhysRegIndex &ireg_idx, + + unsigned _numLogicalFloatRegs, + unsigned _numPhysicalFloatRegs, + PhysRegIndex &freg_idx, + + unsigned _numMiscRegs, + + RegIndex _intZeroReg, + RegIndex _floatZeroReg, + + int map_id, + bool bindRegs) +{ + id = map_id; + + numLogicalIntRegs = _numLogicalIntRegs; + + numLogicalFloatRegs = _numLogicalFloatRegs; + + numPhysicalIntRegs = _numPhysicalIntRegs; + + numPhysicalFloatRegs = _numPhysicalFloatRegs; + + numMiscRegs = _numMiscRegs; + + intZeroReg = _intZeroReg; + floatZeroReg = _floatZeroReg; + + DPRINTF(Rename, "Creating rename map %i. Phys: %i / %i, Float: " + "%i / %i.\n", id, numLogicalIntRegs, numPhysicalIntRegs, numLogicalFloatRegs, numPhysicalFloatRegs); numLogicalRegs = numLogicalIntRegs + numLogicalFloatRegs; numPhysicalRegs = numPhysicalIntRegs + numPhysicalFloatRegs; - //Create the rename maps, and their scoreboards. 
- intRenameMap = new RenameEntry[numLogicalIntRegs]; - floatRenameMap = new RenameEntry[numLogicalRegs]; + //Create the rename maps + intRenameMap.resize(numLogicalIntRegs); + floatRenameMap.resize(numLogicalRegs); - // Should combine this into one scoreboard. - intScoreboard.resize(numPhysicalIntRegs); - floatScoreboard.resize(numPhysicalRegs); - miscScoreboard.resize(numPhysicalRegs + numMiscRegs); + if (bindRegs) { + DPRINTF(Rename, "Binding registers into rename map %i",id); - // Initialize the entries in the integer rename map to point to the - // physical registers of the same index, and consider each register - // ready until the first rename occurs. - for (RegIndex index = 0; index < numLogicalIntRegs; ++index) - { - intRenameMap[index].physical_reg = index; - intScoreboard[index] = 1; + // Initialize the entries in the integer rename map to point to the + // physical registers of the same index + for (RegIndex index = 0; index < numLogicalIntRegs; ++index) + { + intRenameMap[index].physical_reg = ireg_idx++; + } + + // Initialize the entries in the floating point rename map to point to + // the physical registers of the same index + // Although the index refers purely to architected registers, because + // the floating reg indices come after the integer reg indices, they + // may exceed the size of a normal RegIndex (short). 
+ for (PhysRegIndex index = numLogicalIntRegs; index < numLogicalRegs; ++index) + { + floatRenameMap[index].physical_reg = freg_idx++; + } + } else { + DPRINTF(Rename, "Binding registers into rename map %i",id); + + PhysRegIndex temp_ireg = ireg_idx; + + for (RegIndex index = 0; index < numLogicalIntRegs; ++index) + { + intRenameMap[index].physical_reg = temp_ireg++; + } + + PhysRegIndex temp_freg = freg_idx; + + for (PhysRegIndex index = numLogicalIntRegs; + index < numLogicalRegs; ++index) + { + floatRenameMap[index].physical_reg = temp_freg++; + } } - - // Initialize the rest of the physical registers (the ones that don't - // directly map to a logical register) as unready. - for (PhysRegIndex index = numLogicalIntRegs; - index < numPhysicalIntRegs; - ++index) - { - intScoreboard[index] = 0; - } - - int float_reg_idx = numPhysicalIntRegs; - - // Initialize the entries in the floating point rename map to point to - // the physical registers of the same index, and consider each register - // ready until the first rename occurs. - // Although the index refers purely to architected registers, because - // the floating reg indices come after the integer reg indices, they - // may exceed the size of a normal RegIndex (short). - for (PhysRegIndex index = numLogicalIntRegs; - index < numLogicalRegs; ++index) - { - floatRenameMap[index].physical_reg = float_reg_idx++; - } - - for (PhysRegIndex index = numPhysicalIntRegs; - index < numPhysicalIntRegs + numLogicalFloatRegs; ++index) - { - floatScoreboard[index] = 1; - } - - // Initialize the rest of the physical registers (the ones that don't - // directly map to a logical register) as unready. - for (PhysRegIndex index = numPhysicalIntRegs + numLogicalFloatRegs; - index < numPhysicalRegs; - ++index) - { - floatScoreboard[index] = 0; - } - - // Initialize the entries in the misc register scoreboard to be ready. 
- for (PhysRegIndex index = numPhysicalRegs; - index < numPhysicalRegs + numMiscRegs; ++index) - { - miscScoreboard[index] = 1; - } -} - -SimpleRenameMap::~SimpleRenameMap() -{ - // Delete the rename maps as they were allocated with new. - delete [] intRenameMap; - delete [] floatRenameMap; } void @@ -167,8 +163,6 @@ SimpleRenameMap::rename(RegIndex arch_reg) assert(renamed_reg >= 0 && renamed_reg < numPhysicalIntRegs); - // Mark register as not ready. - intScoreboard[renamed_reg] = false; } else { // Otherwise return the zero register so nothing bad happens. renamed_reg = intZeroReg; @@ -192,9 +186,6 @@ SimpleRenameMap::rename(RegIndex arch_reg) assert(renamed_reg < numPhysicalRegs && renamed_reg >= numPhysicalIntRegs); - - // Mark register as not ready. - floatScoreboard[renamed_reg] = false; } else { // Otherwise return the zero register so nothing bad happens. renamed_reg = floatZeroReg; @@ -215,8 +206,6 @@ SimpleRenameMap::rename(RegIndex arch_reg) prev_reg = renamed_reg; assert(renamed_reg < numPhysicalRegs + numMiscRegs); - - miscScoreboard[renamed_reg] = false; } return RenameInfo(renamed_reg, prev_reg); @@ -244,25 +233,6 @@ SimpleRenameMap::lookup(RegIndex arch_reg) } } -bool -SimpleRenameMap::isReady(PhysRegIndex phys_reg) -{ - if (phys_reg < numPhysicalIntRegs) { - return intScoreboard[phys_reg]; - } else if (phys_reg < numPhysicalRegs) { - - // Subtract off the base FP offset. -// phys_reg = phys_reg - numPhysicalIntRegs; - - return floatScoreboard[phys_reg]; - } else { - // Subtract off the misc registers offset. -// phys_reg = phys_reg - numPhysicalRegs; - - return miscScoreboard[phys_reg]; - } -} - // In this implementation the miscellaneous registers do not actually rename, // so this function does not allow you to try to change their mappings. 
void @@ -273,14 +243,16 @@ SimpleRenameMap::setEntry(RegIndex arch_reg, PhysRegIndex renamed_reg) (int)arch_reg, renamed_reg); intRenameMap[arch_reg].physical_reg = renamed_reg; - } else { - assert(arch_reg < (numLogicalIntRegs + numLogicalFloatRegs)); + } else if (arch_reg < numLogicalIntRegs + numLogicalFloatRegs) { + DPRINTF(Rename, "Rename Map: Float register %i being set to %i.\n", (int)arch_reg - numLogicalIntRegs, renamed_reg); floatRenameMap[arch_reg].physical_reg = renamed_reg; } + + //assert(arch_reg < (numLogicalIntRegs + numLogicalFloatRegs)); } void @@ -308,30 +280,6 @@ SimpleRenameMap::squash(vector freed_regs, // Take unmap info and roll back the rename map. } -void -SimpleRenameMap::markAsReady(PhysRegIndex ready_reg) -{ - DPRINTF(Rename, "Rename map: Marking register %i as ready.\n", - (int)ready_reg); - - if (ready_reg < numPhysicalIntRegs) { - assert(ready_reg >= 0); - - intScoreboard[ready_reg] = 1; - } else if (ready_reg < numPhysicalRegs) { - - // Subtract off the base FP offset. -// ready_reg = ready_reg - numPhysicalIntRegs; - - floatScoreboard[ready_reg] = 1; - } else { - //Subtract off the misc registers offset. -// ready_reg = ready_reg - numPhysicalRegs; - - miscScoreboard[ready_reg] = 1; - } -} - int SimpleRenameMap::numFreeEntries() { diff --git a/cpu/o3/rename_map.hh b/cpu/o3/rename_map.hh index 57be4a64a..3ecbe45c3 100644 --- a/cpu/o3/rename_map.hh +++ b/cpu/o3/rename_map.hh @@ -30,8 +30,8 @@ // Have it so that there's a more meaningful name given to the variable // that marks the beginning of the FP registers. 
-#ifndef __CPU_O3_CPU_RENAME_MAP_HH__ -#define __CPU_O3_CPU_RENAME_MAP_HH__ +#ifndef __CPU_O3_RENAME_MAP_HH__ +#define __CPU_O3_RENAME_MAP_HH__ #include #include @@ -63,17 +63,27 @@ class SimpleRenameMap public: //Constructor - SimpleRenameMap(unsigned _numLogicalIntRegs, - unsigned _numPhysicalIntRegs, - unsigned _numLogicalFloatRegs, - unsigned _numPhysicalFloatRegs, - unsigned _numMiscRegs, - RegIndex _intZeroReg, - RegIndex _floatZeroReg); + SimpleRenameMap() {}; /** Destructor. */ ~SimpleRenameMap(); + void init(unsigned _numLogicalIntRegs, + unsigned _numPhysicalIntRegs, + PhysRegIndex &_int_reg_start, + + unsigned _numLogicalFloatRegs, + unsigned _numPhysicalFloatRegs, + PhysRegIndex &_float_reg_start, + + unsigned _numMiscRegs, + + RegIndex _intZeroReg, + RegIndex _floatZeroReg, + + int id, + bool bindRegs); + void setFreeList(SimpleFreeList *fl_ptr); //Tell rename map to get a free physical register for a given @@ -84,15 +94,11 @@ class SimpleRenameMap PhysRegIndex lookup(RegIndex phys_reg); - bool isReady(PhysRegIndex arch_reg); - /** * Marks the given register as ready, meaning that its value has been * calculated and written to the register file. * @param ready_reg The index of the physical register that is now ready. */ - void markAsReady(PhysRegIndex ready_reg); - void setEntry(RegIndex arch_reg, PhysRegIndex renamed_reg); void squash(std::vector freed_regs, @@ -101,6 +107,9 @@ class SimpleRenameMap int numFreeEntries(); private: + /** Rename Map ID */ + int id; + /** Number of logical integer registers. */ int numLogicalIntRegs; @@ -143,31 +152,17 @@ class SimpleRenameMap { } }; + //Change this to private + public: /** Integer rename map. */ - RenameEntry *intRenameMap; + std::vector intRenameMap; /** Floating point rename map. */ - RenameEntry *floatRenameMap; + std::vector floatRenameMap; + private: /** Free list interface. */ SimpleFreeList *freeList; - - // Might want to make all these scoreboards into one large scoreboard. 
- - /** Scoreboard of physical integer registers, saying whether or not they - * are ready. - */ - std::vector intScoreboard; - - /** Scoreboard of physical floating registers, saying whether or not they - * are ready. - */ - std::vector floatScoreboard; - - /** Scoreboard of miscellaneous registers, saying whether or not they - * are ready. - */ - std::vector miscScoreboard; }; -#endif //__CPU_O3_CPU_RENAME_MAP_HH__ +#endif //__CPU_O3_RENAME_MAP_HH__ diff --git a/cpu/o3/rob.hh b/cpu/o3/rob.hh index 1185564ad..48199915f 100644 --- a/cpu/o3/rob.hh +++ b/cpu/o3/rob.hh @@ -26,23 +26,15 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -// Todo: Probably add in support for scheduling events (more than one as -// well) on the case of the ROB being empty or full. Considering tracking -// free entries instead of insts in ROB. Differentiate between squashing -// all instructions after the instruction, and all instructions after *and* -// including that instruction. - -#ifndef __CPU_O3_CPU_ROB_HH__ -#define __CPU_O3_CPU_ROB_HH__ +#ifndef __CPU_O3_ROB_HH__ +#define __CPU_O3_ROB_HH__ +#include #include #include /** - * ROB class. Uses the instruction list that exists within the CPU to - * represent the ROB. This class doesn't contain that list, but instead - * a pointer to the CPU to get access to the list. The ROB, in this first - * implementation, is largely what drives squashing. + * ROB class. The ROB is largely what drives squashing. */ template class ROB @@ -54,16 +46,45 @@ class ROB typedef typename Impl::FullCPU FullCPU; typedef typename Impl::DynInstPtr DynInstPtr; - typedef std::pair UnmapInfo_t; - typedef typename list::iterator InstIt_t; + typedef std::pair UnmapInfo; + typedef typename std::list::iterator InstIt; + + /** Possible ROB statuses. 
*/ + enum Status { + Running, + Idle, + ROBSquashing, + DcacheMissStall, + DcacheMissComplete + }; + + /** SMT ROB Sharing Policy */ + enum ROBPolicy{ + Dynamic, + Partitioned, + Threshold + }; + + private: + /** Per-thread ROB status. */ + Status robStatus[Impl::MaxThreads]; + + /** ROB resource sharing policy for SMT mode. */ + ROBPolicy robPolicy; public: /** ROB constructor. - * @param _numEntries Number of entries in ROB. - * @param _squashWidth Number of instructions that can be squashed in a - * single cycle. + * @param _numEntries Number of entries in ROB. + * @param _squashWidth Number of instructions that can be squashed in a + * single cycle. + * @param _smtROBPolicy ROB Partitioning Scheme for SMT. + * @param _smtROBThreshold Max Resources(by %) a thread can have in the ROB. + * @param _numThreads The number of active threads. */ - ROB(unsigned _numEntries, unsigned _squashWidth); + ROB(unsigned _numEntries, unsigned _squashWidth, std::string smtROBPolicy, + unsigned _smtROBThreshold, unsigned _numThreads); + + std::string name() const; /** Function to set the CPU pointer, necessary due to which object the ROB * is created within. @@ -71,12 +92,15 @@ class ROB */ void setCPU(FullCPU *cpu_ptr); - /** Function to insert an instruction into the ROB. The parameter inst is - * not truly required, but is useful for checking correctness. Note - * that whatever calls this function must ensure that there is enough - * space within the ROB for the new instruction. + /** Sets pointer to the list of active threads. + * @param at_ptr Pointer to the list of active threads. + */ + void setActiveThreads(std::list* at_ptr); + + /** Function to insert an instruction into the ROB. Note that whatever + * calls this function must ensure that there is enough space within the + * ROB for the new instruction. * @param inst The instruction being inserted into the ROB. - * @todo Remove the parameter once correctness is ensured. 
*/ void insertInst(DynInstPtr &inst); @@ -84,40 +108,134 @@ class ROB * no guarantee as to the return value if the ROB is empty. * @retval Pointer to the DynInst that is at the head of the ROB. */ - DynInstPtr readHeadInst() { return cpu->instList.front(); } + DynInstPtr readHeadInst(); - DynInstPtr readTailInst() { return (*tail); } + /** Returns a pointer to the head instruction of a specific thread within + * the ROB. + * @return Pointer to the DynInst that is at the head of the ROB. + */ + DynInstPtr readHeadInst(unsigned tid); + /** Returns pointer to the tail instruction within the ROB. There is + * no guarantee as to the return value if the ROB is empty. + * @retval Pointer to the DynInst that is at the tail of the ROB. + */ + DynInstPtr readTailInst(); + + /** Returns a pointer to the tail instruction of a specific thread within + * the ROB. + * @return Pointer to the DynInst that is at the tail of the ROB. + */ + DynInstPtr readTailInst(unsigned tid); + + /** Retires the head instruction, removing it from the ROB. */ void retireHead(); + /** Retires the head instruction of a specific thread, removing it from the + * ROB. + */ + void retireHead(unsigned tid); + + /** Is the oldest instruction across all threads ready. */ bool isHeadReady(); + /** Is the oldest instruction across a particular thread ready. */ + bool isHeadReady(unsigned tid); + + /** Is there any commitable head instruction across all threads ready. */ + bool canCommit(); + + /** Re-adjust ROB partitioning. */ + void resetEntries(); + + /** Number of entries needed For 'num_threads' amount of threads. */ + int entryAmount(int num_threads); + + /** Returns the number of total free entries in the ROB. */ unsigned numFreeEntries(); + /** Returns the number of free entries in a specific ROB paritition. */ + unsigned numFreeEntries(unsigned tid); + + /** Returns the maximum number of entries for a specific thread. 
*/ + unsigned getMaxEntries(unsigned tid) + { return maxEntries[tid]; } + + /** Returns the number of entries being used by a specific thread. */ + unsigned getThreadEntries(unsigned tid) + { return threadEntries[tid]; } + + /** Returns if the ROB is full. */ bool isFull() { return numInstsInROB == numEntries; } + /** Returns if a specific thread's partition is full. */ + bool isFull(unsigned tid) + { return threadEntries[tid] == numEntries; } + + /** Returns if the ROB is empty. */ bool isEmpty() { return numInstsInROB == 0; } - void doSquash(); + /** Returns if a specific thread's partition is empty. */ + bool isEmpty(unsigned tid) + { return threadEntries[tid] == 0; } - void squash(InstSeqNum squash_num); + /** Executes the squash, marking squashed instructions. */ + void doSquash(unsigned tid); + /** Squashes all instructions younger than the given sequence number for + * the specific thread. + */ + void squash(InstSeqNum squash_num, unsigned tid); + + /** Updates the head instruction with the new oldest instruction. */ + void updateHead(); + + /** Updates the tail instruction with the new youngest instruction. */ + void updateTail(); + + /** Reads the PC of the oldest head instruction. */ uint64_t readHeadPC(); + /** Reads the PC of the head instruction of a specific thread. */ + uint64_t readHeadPC(unsigned tid); + + /** Reads the next PC of the oldest head instruction. */ uint64_t readHeadNextPC(); + /** Reads the next PC of the head instruction of a specific thread. */ + uint64_t readHeadNextPC(unsigned tid); + + /** Reads the sequence number of the oldest head instruction. */ InstSeqNum readHeadSeqNum(); + /** Reads the sequence number of the head instruction of a specific thread. + */ + InstSeqNum readHeadSeqNum(unsigned tid); + + /** Reads the PC of the youngest tail instruction. */ uint64_t readTailPC(); + /** Reads the PC of the tail instruction of a specific thread. 
*/ + uint64_t readTailPC(unsigned tid); + + /** Reads the sequence number of the youngest tail instruction. */ InstSeqNum readTailSeqNum(); + /** Reads the sequence number of tail instruction of a specific thread. */ + InstSeqNum readTailSeqNum(unsigned tid); + /** Checks if the ROB is still in the process of squashing instructions. * @retval Whether or not the ROB is done squashing. */ - bool isDoneSquashing() const { return doneSquashing; } + bool isDoneSquashing(unsigned tid) const + { return doneSquashing[tid]; } + + /** Checks if the ROB is still in the process of squashing instructions for + * any thread. + */ + bool isDoneSquashing(); /** This is more of a debugging function than anything. Use * numInstsInROB to get the instructions in the ROB unless you are @@ -125,23 +243,46 @@ class ROB */ int countInsts(); - private: + /** This is more of a debugging function than anything. Use + * threadEntries to get the instructions in the ROB unless you are + * double checking that variable. + */ + int countInsts(unsigned tid); + private: /** Pointer to the CPU. */ FullCPU *cpu; + /** Active Threads in CPU */ + std::list* activeThreads; + /** Number of instructions in the ROB. */ unsigned numEntries; + /** Entries Per Thread */ + unsigned threadEntries[Impl::MaxThreads]; + + /** Max Insts a Thread Can Have in the ROB */ + unsigned maxEntries[Impl::MaxThreads]; + + /** ROB List of Instructions */ + std::list instList[Impl::MaxThreads]; + /** Number of instructions that can be squashed in a single cycle. */ unsigned squashWidth; + public: /** Iterator pointing to the instruction which is the last instruction * in the ROB. This may at times be invalid (ie when the ROB is empty), * however it should never be incorrect. */ - InstIt_t tail; + InstIt tail; + /** Iterator pointing to the instruction which is the first instruction in + * in the ROB*/ + InstIt head; + + private: /** Iterator used for walking through the list of instructions when * squashing. 
Used so that there is persistent state between cycles; * when squashing, the instructions are marked as squashed but not @@ -149,16 +290,23 @@ class ROB * and after a squash. * This will always be set to cpu->instList.end() if it is invalid. */ - InstIt_t squashIt; + InstIt squashIt[Impl::MaxThreads]; + public: /** Number of instructions in the ROB. */ int numInstsInROB; + DynInstPtr dummyInst; + + private: /** The sequence number of the squashed instruction. */ InstSeqNum squashedSeqNum; /** Is the ROB done squashing. */ - bool doneSquashing; + bool doneSquashing[Impl::MaxThreads]; + + /** Number of active threads. */ + unsigned numThreads; }; -#endif //__CPU_O3_CPU_ROB_HH__ +#endif //__CPU_O3_ROB_HH__ diff --git a/cpu/o3/rob_impl.hh b/cpu/o3/rob_impl.hh index e7a5671d9..96d907cda 100644 --- a/cpu/o3/rob_impl.hh +++ b/cpu/o3/rob_impl.hh @@ -26,20 +26,74 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __CPU_O3_CPU_ROB_IMPL_HH__ -#define __CPU_O3_CPU_ROB_IMPL_HH__ - #include "config/full_system.hh" #include "cpu/o3/rob.hh" +using namespace std; + template -ROB::ROB(unsigned _numEntries, unsigned _squashWidth) +ROB::ROB(unsigned _numEntries, unsigned _squashWidth, + string _smtROBPolicy, unsigned _smtROBThreshold, + unsigned _numThreads) : numEntries(_numEntries), squashWidth(_squashWidth), numInstsInROB(0), - squashedSeqNum(0) + squashedSeqNum(0), + numThreads(_numThreads) { - doneSquashing = true; + for (int tid=0; tid < numThreads; tid++) { + doneSquashing[tid] = true; + threadEntries[tid] = 0; + } + + string policy = _smtROBPolicy; + + //Convert string to lowercase + std::transform(policy.begin(), policy.end(), policy.begin(), + (int(*)(int)) tolower); + + //Figure out rob policy + if (policy == "dynamic") { + robPolicy = Dynamic; + + //Set Max Entries to Total ROB Capacity + for (int i = 0; i < numThreads; i++) { + maxEntries[i]=numEntries; + } + + } else if (policy == "partitioned") { + robPolicy = Partitioned; + 
DPRINTF(Fetch, "ROB sharing policy set to Partitioned\n"); + + //@todo:make work if part_amt doesnt divide evenly. + int part_amt = numEntries / numThreads; + + //Divide ROB up evenly + for (int i = 0; i < numThreads; i++) { + maxEntries[i]=part_amt; + } + + } else if (policy == "threshold") { + robPolicy = Threshold; + DPRINTF(Fetch, "ROB sharing policy set to Threshold\n"); + + int threshold = _smtROBThreshold;; + + //Divide up by threshold amount + for (int i = 0; i < numThreads; i++) { + maxEntries[i]=threshold; + } + } else { + assert(0 && "Invalid ROB Sharing Policy.Options Are:{Dynamic," + "Partitioned, Threshold}"); + } +} + +template +std::string +ROB::name() const +{ + return cpu->name() + ".rob"; } template @@ -48,49 +102,74 @@ ROB::setCPU(FullCPU *cpu_ptr) { cpu = cpu_ptr; - // Set the tail to the beginning of the CPU instruction list so that - // upon the first instruction being inserted into the ROB, the tail - // iterator can simply be incremented. - tail = cpu->instList.begin(); + // Set the per-thread iterators to the end of the instruction list. + for (int i=0; i < numThreads;i++) { + squashIt[i] = instList[i].end(); + } - // Set the squash iterator to the end of the instruction list. 
- squashIt = cpu->instList.end(); + // Initialize the "universal" ROB head & tail point to invalid + // pointers + head = instList[0].end(); + tail = instList[0].end(); +} + +template +void +ROB::setActiveThreads(list *at_ptr) +{ + DPRINTF(ROB, "Setting active threads list pointer.\n"); + activeThreads = at_ptr; +} + + +template +void +ROB::resetEntries() +{ + if (robPolicy != Dynamic || numThreads > 1) { + int active_threads = (*activeThreads).size(); + + list::iterator threads = (*activeThreads).begin(); + list::iterator list_end = (*activeThreads).end(); + + while (threads != list_end) { + if (robPolicy == Partitioned) { + maxEntries[*threads++] = numEntries / active_threads; + } else if (robPolicy == Threshold && active_threads == 1) { + maxEntries[*threads++] = numEntries; + } + } + } +} + +template +int +ROB::entryAmount(int num_threads) +{ + if (robPolicy == Partitioned) { + return numEntries / num_threads; + } else { + return 0; + } } template int ROB::countInsts() { - // Start at 1; if the tail matches cpu->instList.begin(), then there is - // one inst in the ROB. - int return_val = 1; + int total=0; - // There are quite a few special cases. Do not use this function other - // than for debugging purposes. - if (cpu->instList.begin() == cpu->instList.end()) { - // In this case there are no instructions in the list. The ROB - // must be empty. - return 0; - } else if (tail == cpu->instList.end()) { - // In this case, the tail is not yet pointing to anything valid. - // The ROB must be empty. - return 0; - } + for (int i=0;i < numThreads;i++) + total += countInsts(i); - // Iterate through the ROB from the head to the tail, counting the - // entries. 
- for (InstIt_t i = cpu->instList.begin(); i != tail; ++i) - { - assert(i != cpu->instList.end()); - ++return_val; - } + return total; +} - return return_val; - - // Because the head won't be tracked properly until the ROB gets the - // first instruction, and any time that the ROB is empty and has not - // yet gotten the instruction, this function doesn't work. -// return numInstsInROB; +template +int +ROB::countInsts(unsigned tid) +{ + return instList[tid].size(); } template @@ -98,33 +177,42 @@ void ROB::insertInst(DynInstPtr &inst) { // Make sure we have the right number of instructions. - assert(numInstsInROB == countInsts()); + //assert(numInstsInROB == countInsts()); + // Make sure the instruction is valid. assert(inst); - DPRINTF(ROB, "ROB: Adding inst PC %#x to the ROB.\n", inst->readPC()); + DPRINTF(ROB, "Adding inst PC %#x to the ROB.\n", inst->readPC()); // If the ROB is full then exit. assert(numInstsInROB != numEntries); - ++numInstsInROB; + int tid = inst->threadNumber; - // Increment the tail iterator, moving it one instruction back. - // There is a special case if the ROB was empty prior to this insertion, - // in which case the tail will be pointing at instList.end(). If that - // happens, then reset the tail to the beginning of the list. - if (tail != cpu->instList.end()) { - ++tail; - } else { - tail = cpu->instList.begin(); + // Place into ROB + instList[tid].push_back(inst); + + //Set Up head iterator if this is the 1st instruction in the ROB + if (numInstsInROB == 0) { + head = instList[tid].begin(); + assert((*head) == inst); } - // Make sure the tail iterator is actually pointing at the instruction - // added. 
+ //Must Decrement for iterator to actually be valid since __.end() + //actually points to 1 after the last inst + tail = instList[tid].end(); + tail--; + + // Mark as set in ROB + inst->setInROB(); + + // Increment ROB count + ++numInstsInROB; + ++threadEntries[tid]; + assert((*tail) == inst); - DPRINTF(ROB, "ROB: Now has %d instructions.\n", numInstsInROB); - + DPRINTF(ROB, "[tid:%i] Now has %d instructions.\n", tid, threadEntries[tid]); } // Whatever calls this function needs to ensure that it properly frees up @@ -133,31 +221,55 @@ template void ROB::retireHead() { - assert(numInstsInROB == countInsts()); + //assert(numInstsInROB == countInsts()); + assert(numInstsInROB > 0); + + // Get the head ROB instruction's TID. + int tid = (*head)->threadNumber; + + retireHead(tid); + + if (numInstsInROB == 0) { + tail = instList[tid].end(); + } +} + +template +void +ROB::retireHead(unsigned tid) +{ + //assert(numInstsInROB == countInsts()); assert(numInstsInROB > 0); // Get the head ROB instruction. - DynInstPtr head_inst = cpu->instList.front(); + InstIt head_it = instList[tid].begin(); + + DynInstPtr head_inst = (*head_it); // Make certain this can retire. assert(head_inst->readyToCommit()); - DPRINTF(ROB, "ROB: Retiring head instruction of the ROB, " - "instruction PC %#x, seq num %i\n", head_inst->readPC(), + DPRINTF(ROB, "[tid:%u]: Retiring head instruction, " + "instruction PC %#x,[sn:%lli]\n", tid, head_inst->readPC(), head_inst->seqNum); // Keep track of how many instructions are in the ROB. --numInstsInROB; + --threadEntries[tid]; + + //Mark DynInstFlags + head_inst->removeInROB(); + head_inst->setCommitted(); + + instList[tid].erase(head_it); + + //Update "Global" Head of ROB + updateHead(); - // Tell CPU to remove the instruction from the list of instructions. // A special case is needed if the instruction being retired is the // only instruction in the ROB; otherwise the tail iterator will become // invalidated. 
cpu->removeFrontInst(head_inst); - - if (numInstsInROB == 0) { - tail = cpu->instList.end(); - } } template @@ -165,7 +277,36 @@ bool ROB::isHeadReady() { if (numInstsInROB != 0) { - return cpu->instList.front()->readyToCommit(); + return (*head)->readyToCommit(); + } + + return false; +} + +template +bool +ROB::isHeadReady(unsigned tid) +{ + if (threadEntries[tid] != 0) { + return instList[tid].front()->readyToCommit(); + } + + return false; +} + +template +bool +ROB::canCommit() +{ + //@todo: set ActiveThreads through ROB or CPU + list::iterator threads = (*activeThreads).begin(); + + while (threads != (*activeThreads).end()) { + unsigned tid = *threads++; + + if (isHeadReady(tid)) { + return true; + } } return false; @@ -175,130 +316,339 @@ template unsigned ROB::numFreeEntries() { - assert(numInstsInROB == countInsts()); + //assert(numInstsInROB == countInsts()); return numEntries - numInstsInROB; } template -void -ROB::doSquash() +unsigned +ROB::numFreeEntries(unsigned tid) { - DPRINTF(ROB, "ROB: Squashing instructions.\n"); - - assert(squashIt != cpu->instList.end()); - - for (int numSquashed = 0; - numSquashed < squashWidth && (*squashIt)->seqNum != squashedSeqNum; - ++numSquashed) - { - // Ensure that the instruction is younger. - assert((*squashIt)->seqNum > squashedSeqNum); - - DPRINTF(ROB, "ROB: Squashing instruction PC %#x, seq num %i.\n", - (*squashIt)->readPC(), (*squashIt)->seqNum); - - // Mark the instruction as squashed, and ready to commit so that - // it can drain out of the pipeline. - (*squashIt)->setSquashed(); - - (*squashIt)->setCanCommit(); - - // Special case for when squashing due to a syscall. It's possible - // that the squash happened after the head instruction was already - // committed, meaning that (*squashIt)->seqNum != squashedSeqNum - // will never be false. Normally the squash would never be able - // to go past the head of the ROB; in this case it might, so it - // must be handled otherwise it will segfault. 
-#if !FULL_SYSTEM - if (squashIt == cpu->instList.begin()) { - DPRINTF(ROB, "ROB: Reached head of instruction list while " - "squashing.\n"); - - squashIt = cpu->instList.end(); - - doneSquashing = true; - - return; - } -#endif - - // Move the tail iterator to the next instruction. - squashIt--; - } - - - // Check if ROB is done squashing. - if ((*squashIt)->seqNum == squashedSeqNum) { - DPRINTF(ROB, "ROB: Done squashing instructions.\n"); - - squashIt = cpu->instList.end(); - - doneSquashing = true; - } + return maxEntries[tid] - threadEntries[tid]; } template void -ROB::squash(InstSeqNum squash_num) +ROB::doSquash(unsigned tid) { - DPRINTF(ROB, "ROB: Starting to squash within the ROB.\n"); - doneSquashing = false; + DPRINTF(ROB, "[tid:%u]: Squashing instructions until [sn:%i].\n", + tid, squashedSeqNum); + + assert(squashIt[tid] != instList[tid].end()); + + if ((*squashIt[tid])->seqNum < squashedSeqNum) { + DPRINTF(ROB, "[tid:%u]: Done squashing instructions.\n", + tid); + + squashIt[tid] = instList[tid].end(); + + doneSquashing[tid] = true; + return; + } + + bool robTailUpdate = false; + + for (int numSquashed = 0; + numSquashed < squashWidth && + squashIt[tid] != instList[tid].end() && + (*squashIt[tid])->seqNum > squashedSeqNum; + ++numSquashed) + { + DPRINTF(ROB, "[tid:%u]: Squashing instruction PC %#x, seq num %i.\n", + (*squashIt[tid])->threadNumber, + (*squashIt[tid])->readPC(), + (*squashIt[tid])->seqNum); + + // Mark the instruction as squashed, and ready to commit so that + // it can drain out of the pipeline. 
+ (*squashIt[tid])->setSquashed(); + + (*squashIt[tid])->setCanCommit(); + + + if (squashIt[tid] == instList[tid].begin()) { + DPRINTF(ROB, "Reached head of instruction list while " + "squashing.\n"); + + squashIt[tid] = instList[tid].end(); + + doneSquashing[tid] = true; + + return; + } + + InstIt tail_thread = instList[tid].end(); + tail_thread--; + + if ((*squashIt[tid]) == (*tail_thread)) + robTailUpdate = true; + + squashIt[tid]--; + } + + + // Check if ROB is done squashing. + if ((*squashIt[tid])->seqNum <= squashedSeqNum) { + DPRINTF(ROB, "[tid:%u]: Done squashing instructions.\n", + tid); + + squashIt[tid] = instList[tid].end(); + + doneSquashing[tid] = true; + } + + if (robTailUpdate) { + updateTail(); + } +} + + +template +void +ROB::updateHead() +{ + DynInstPtr head_inst; + InstSeqNum lowest_num = 0; + bool first_valid = true; + + // @todo: set ActiveThreads through ROB or CPU + list::iterator threads = (*activeThreads).begin(); + + while (threads != (*activeThreads).end()) { + unsigned thread_num = *threads++; + + if (instList[thread_num].empty()) + continue; + + if (first_valid) { + head = instList[thread_num].begin(); + lowest_num = (*head)->seqNum; + first_valid = false; + continue; + } + + InstIt head_thread = instList[thread_num].begin(); + + DynInstPtr head_inst = (*head_thread); + + assert(head_inst != 0); + + if (head_inst->seqNum < lowest_num) { + head = head_thread; + lowest_num = head_inst->seqNum; + } + } + + if (first_valid) { + head = instList[0].end(); + } + +} + +template +void +ROB::updateTail() +{ + tail = instList[0].end(); + bool first_valid = true; + + list::iterator threads = (*activeThreads).begin(); + + while (threads != (*activeThreads).end()) { + unsigned tid = *threads++; + + if (instList[tid].empty()) { + continue; + } + + // If this is the first valid then assign w/out + // comparison + if (first_valid) { + tail = instList[tid].end(); + tail--; + first_valid = false; + continue; + } + + // Assign new tail if this thread's 
tail is younger + // than our current "tail high" + InstIt tail_thread = instList[tid].end(); + tail_thread--; + + if ((*tail_thread)->seqNum > (*tail)->seqNum) { + tail = tail_thread; + } + } +} + + +template +void +ROB::squash(InstSeqNum squash_num,unsigned tid) +{ + if (isEmpty()) { + DPRINTF(ROB, "Does not need to squash due to being empty " + "[sn:%i]\n", + squash_num); + + return; + } + + DPRINTF(ROB, "Starting to squash within the ROB.\n"); + + robStatus[tid] = ROBSquashing; + + doneSquashing[tid] = false; squashedSeqNum = squash_num; - assert(tail != cpu->instList.end()); + if (!instList[tid].empty()) { + InstIt tail_thread = instList[tid].end(); + tail_thread--; - squashIt = tail; + squashIt[tid] = tail_thread; - doSquash(); + doSquash(tid); + } +} + +template +typename Impl::DynInstPtr +ROB::readHeadInst() +{ + if (numInstsInROB != 0) { + assert((*head)->isInROB()==true); + return *head; + } else { + return dummyInst; + } +} + +template +typename Impl::DynInstPtr +ROB::readHeadInst(unsigned tid) +{ + if (threadEntries[tid] != 0) { + InstIt head_thread = instList[tid].begin(); + + assert((*head_thread)->isInROB()==true); + + return *head_thread; + } else { + return dummyInst; + } } template uint64_t ROB::readHeadPC() { - assert(numInstsInROB == countInsts()); + //assert(numInstsInROB == countInsts()); - DynInstPtr head_inst = cpu->instList.front(); + DynInstPtr head_inst = *head; return head_inst->readPC(); } +template +uint64_t +ROB::readHeadPC(unsigned tid) +{ + //assert(numInstsInROB == countInsts()); + InstIt head_thread = instList[tid].begin(); + + return (*head_thread)->readPC(); +} + + template uint64_t ROB::readHeadNextPC() { - assert(numInstsInROB == countInsts()); + //assert(numInstsInROB == countInsts()); - DynInstPtr head_inst = cpu->instList.front(); + DynInstPtr head_inst = *head; return head_inst->readNextPC(); } +template +uint64_t +ROB::readHeadNextPC(unsigned tid) +{ + //assert(numInstsInROB == countInsts()); + InstIt head_thread = 
instList[tid].begin(); + + return (*head_thread)->readNextPC(); +} + + template InstSeqNum ROB::readHeadSeqNum() { - // Return the last sequence number that has not been squashed. Other - // stages can use it to squash any instructions younger than the current - // tail. - DynInstPtr head_inst = cpu->instList.front(); + //assert(numInstsInROB == countInsts()); + DynInstPtr head_inst = *head; return head_inst->seqNum; } +template +InstSeqNum +ROB::readHeadSeqNum(unsigned tid) +{ + InstIt head_thread = instList[tid].begin(); + + return ((*head_thread)->seqNum); +} + +template +typename Impl::DynInstPtr +ROB::readTailInst() +{ + //assert(numInstsInROB == countInsts()); + //assert(tail != instList[0].end()); + + return (*tail); +} + +template +typename Impl::DynInstPtr +ROB::readTailInst(unsigned tid) +{ + //assert(tail_thread[tid] != instList[tid].end()); + + InstIt tail_thread = instList[tid].end(); + tail_thread--; + + return *tail_thread; +} + + template uint64_t ROB::readTailPC() { - assert(numInstsInROB == countInsts()); + //assert(numInstsInROB == countInsts()); - assert(tail != cpu->instList.end()); + //assert(tail != instList[0].end()); return (*tail)->readPC(); } +template +uint64_t +ROB::readTailPC(unsigned tid) +{ + //assert(tail_thread[tid] != instList[tid].end()); + + InstIt tail_thread = instList[tid].end(); + tail_thread--; + + return (*tail_thread)->readPC(); +} + template InstSeqNum ROB::readTailSeqNum() @@ -309,4 +659,18 @@ ROB::readTailSeqNum() return (*tail)->seqNum; } -#endif // __CPU_O3_CPU_ROB_IMPL_HH__ +template +InstSeqNum +ROB::readTailSeqNum(unsigned tid) +{ + // Return the last sequence number that has not been squashed. Other + // stages can use it to squash any instructions younger than the current + // tail. 
+ // assert(tail_thread[tid] != instList[tid].end()); + + InstIt tail_thread = instList[tid].end(); + tail_thread--; + + return (*tail_thread)->seqNum; +} + diff --git a/cpu/o3/sat_counter.cc b/cpu/o3/sat_counter.cc index d20fff650..a6e131483 100644 --- a/cpu/o3/sat_counter.cc +++ b/cpu/o3/sat_counter.cc @@ -44,7 +44,7 @@ SatCounter::SatCounter(unsigned bits, unsigned initial_val) { // Check to make sure initial value doesn't exceed the max counter value. if (initial_val > maxVal) { - panic("BP: Initial counter value exceeds max size."); + fatal("BP: Initial counter value exceeds max size."); } } @@ -57,7 +57,7 @@ SatCounter::setBits(unsigned bits) void SatCounter::increment() { - if(counter < maxVal) { + if (counter < maxVal) { ++counter; } } @@ -65,7 +65,7 @@ SatCounter::increment() void SatCounter::decrement() { - if(counter > 0) { + if (counter > 0) { --counter; } } diff --git a/cpu/o3/sat_counter.hh b/cpu/o3/sat_counter.hh index b7cfe6423..952f1f86d 100644 --- a/cpu/o3/sat_counter.hh +++ b/cpu/o3/sat_counter.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __CPU_O3_CPU_SAT_COUNTER_HH__ -#define __CPU_O3_CPU_SAT_COUNTER_HH__ +#ifndef __CPU_O3_SAT_COUNTER_HH__ +#define __CPU_O3_SAT_COUNTER_HH__ #include "sim/host.hh" @@ -78,13 +78,11 @@ class SatCounter * Read the counter's value. */ const uint8_t read() const - { - return counter; - } + { return counter; } private: uint8_t maxVal; uint8_t counter; }; -#endif // __CPU_O3_CPU_SAT_COUNTER_HH__ +#endif // __CPU_O3_SAT_COUNTER_HH__ diff --git a/cpu/o3/scoreboard.cc b/cpu/o3/scoreboard.cc new file mode 100644 index 000000000..87b0aee94 --- /dev/null +++ b/cpu/o3/scoreboard.cc @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2004-2005 The Regents of The University of Michigan + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "cpu/o3/scoreboard.hh" + +Scoreboard::Scoreboard(unsigned activeThreads, + unsigned _numLogicalIntRegs, + unsigned _numPhysicalIntRegs, + unsigned _numLogicalFloatRegs, + unsigned _numPhysicalFloatRegs, + unsigned _numMiscRegs, + unsigned _zeroRegIdx) + : numLogicalIntRegs(_numLogicalIntRegs), + numPhysicalIntRegs(_numPhysicalIntRegs), + numLogicalFloatRegs(_numLogicalFloatRegs), + numPhysicalFloatRegs(_numPhysicalFloatRegs), + numMiscRegs(_numMiscRegs), + zeroRegIdx(_zeroRegIdx) +{ + //Get Register Sizes + numLogicalRegs = numLogicalIntRegs + numLogicalFloatRegs; + numPhysicalRegs = numPhysicalIntRegs + numPhysicalFloatRegs; + + //Resize scoreboard appropriately + regScoreBoard.resize(numPhysicalRegs + (numMiscRegs * activeThreads)); + + //Initialize values + for (int i=0; i < numLogicalIntRegs * activeThreads; i++) { + regScoreBoard[i] = 1; + } + + for (int i= numPhysicalIntRegs; + i < numPhysicalIntRegs + (numLogicalFloatRegs * activeThreads); + i++) { + regScoreBoard[i] = 1; + } + + for (int i = numPhysicalRegs; + i < numPhysicalRegs + (numMiscRegs * activeThreads); + i++) { + regScoreBoard[i] = 1; + } +} + +std::string +Scoreboard::name() const +{ + return "cpu.scoreboard"; +} + +bool +Scoreboard::getReg(PhysRegIndex phys_reg) +{ + // Always ready if int or fp zero reg. + if (phys_reg == zeroRegIdx || + phys_reg == (zeroRegIdx + numPhysicalIntRegs)) { + return 1; + } + + return regScoreBoard[phys_reg]; +} + +void +Scoreboard::setReg(PhysRegIndex phys_reg) +{ + DPRINTF(Scoreboard, "Setting reg %i as ready\n", phys_reg); + + regScoreBoard[phys_reg] = 1; +} + +void +Scoreboard::unsetReg(PhysRegIndex ready_reg) +{ + if (ready_reg == zeroRegIdx || + ready_reg == (zeroRegIdx + numPhysicalIntRegs)) { + // Don't do anything if int or fp zero reg. 
+ } + + regScoreBoard[ready_reg] = 0; +} diff --git a/cpu/o3/scoreboard.hh b/cpu/o3/scoreboard.hh new file mode 100644 index 000000000..77f2cf157 --- /dev/null +++ b/cpu/o3/scoreboard.hh @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2004-2005 The Regents of The University of Michigan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __CPU_O3_SCOREBOARD_HH__ +#define __CPU_O3_SCOREBOARD_HH__ + +#include +#include +#include +#include "arch/alpha/isa_traits.hh" +#include "base/trace.hh" +#include "base/traceflags.hh" +#include "cpu/o3/comm.hh" + +/** + * Implements a simple scoreboard to track which registers are ready. + * This class assumes that the fp registers start, index wise, right after + * the integer registers. The misc. registers start, index wise, right after + * the fp registers. + * @todo: Fix up handling of the zero register in case the decoder does not + * automatically make insts that write the zero register into nops. + */ +class Scoreboard +{ + public: + /** Constructs a scoreboard. + * @param activeThreads The number of active threads. + * @param _numLogicalIntRegs Number of logical integer registers. + * @param _numPhysicalIntRegs Number of physical integer registers. + * @param _numLogicalFloatRegs Number of logical fp registers. + * @param _numPhysicalFloatRegs Number of physical fp registers. + * @param _numMiscRegs Number of miscellaneous registers. + * @param _zeroRegIdx Index of the zero register. + */ + Scoreboard(unsigned activeThreads, + unsigned _numLogicalIntRegs, + unsigned _numPhysicalIntRegs, + unsigned _numLogicalFloatRegs, + unsigned _numPhysicalFloatRegs, + unsigned _numMiscRegs, + unsigned _zeroRegIdx); + + /** Destructor. */ + ~Scoreboard() {} + + /** Returns the name of the scoreboard. */ + std::string name() const; + + /** Checks if the register is ready. */ + bool getReg(PhysRegIndex ready_reg); + + /** Sets the register as ready. */ + void setReg(PhysRegIndex phys_reg); + + /** Sets the register as not ready. */ + void unsetReg(PhysRegIndex ready_reg); + + private: + /** Scoreboard of physical integer registers, saying whether or not they + * are ready. + */ + std::vector regScoreBoard; + + /** Number of logical integer registers. */ + int numLogicalIntRegs; + + /** Number of physical integer registers. 
*/ + int numPhysicalIntRegs; + + /** Number of logical floating point registers. */ + int numLogicalFloatRegs; + + /** Number of physical floating point registers. */ + int numPhysicalFloatRegs; + + /** Number of miscellaneous registers. */ + int numMiscRegs; + + /** Number of logical integer + float registers. */ + int numLogicalRegs; + + /** Number of physical integer + float registers. */ + int numPhysicalRegs; + + /** The logical index of the zero register. */ + int zeroRegIdx; +}; + +#endif diff --git a/cpu/o3/store_set.cc b/cpu/o3/store_set.cc index 11023f4a8..a685646f3 100644 --- a/cpu/o3/store_set.cc +++ b/cpu/o3/store_set.cc @@ -30,43 +30,76 @@ #include "cpu/o3/store_set.hh" StoreSet::StoreSet(int _SSIT_size, int _LFST_size) - : SSIT_size(_SSIT_size), LFST_size(_LFST_size) + : SSITSize(_SSIT_size), LFSTSize(_LFST_size) { DPRINTF(StoreSet, "StoreSet: Creating store set object.\n"); DPRINTF(StoreSet, "StoreSet: SSIT size: %i, LFST size: %i.\n", - SSIT_size, LFST_size); + SSITSize, LFSTSize); - SSIT = new SSID[SSIT_size]; + SSIT.resize(SSITSize); - validSSIT.resize(SSIT_size); + validSSIT.resize(SSITSize); - for (int i = 0; i < SSIT_size; ++i) + for (int i = 0; i < SSITSize; ++i) validSSIT[i] = false; - LFST = new InstSeqNum[LFST_size]; + LFST.resize(LFSTSize); - validLFST.resize(LFST_size); + validLFST.resize(LFSTSize); - SSCounters = new int[LFST_size]; - - for (int i = 0; i < LFST_size; ++i) - { + for (int i = 0; i < LFSTSize; ++i) { validLFST[i] = false; - SSCounters[i] = 0; + LFST[i] = 0; } - index_mask = SSIT_size - 1; + indexMask = SSITSize - 1; - offset_bits = 2; + offsetBits = 2; } +StoreSet::~StoreSet() +{ +} + +void +StoreSet::init(int _SSIT_size, int _LFST_size) +{ + SSITSize = _SSIT_size; + LFSTSize = _LFST_size; + + DPRINTF(StoreSet, "StoreSet: Creating store set object.\n"); + DPRINTF(StoreSet, "StoreSet: SSIT size: %i, LFST size: %i.\n", + SSITSize, LFSTSize); + + SSIT.resize(SSITSize); + + validSSIT.resize(SSITSize); + + for (int i = 0; i < 
SSITSize; ++i) + validSSIT[i] = false; + + LFST.resize(LFSTSize); + + validLFST.resize(LFSTSize); + + for (int i = 0; i < LFSTSize; ++i) { + validLFST[i] = false; + LFST[i] = 0; + } + + indexMask = SSITSize - 1; + + offsetBits = 2; +} + + void StoreSet::violation(Addr store_PC, Addr load_PC) { int load_index = calcIndex(load_PC); int store_index = calcIndex(store_PC); - assert(load_index < SSIT_size && store_index < SSIT_size); + assert(load_index < SSITSize && store_index < SSITSize); bool valid_load_SSID = validSSIT[load_index]; bool valid_store_SSID = validSSIT[store_index]; @@ -83,10 +116,7 @@ StoreSet::violation(Addr store_PC, Addr load_PC) SSIT[store_index] = new_set; - assert(new_set < LFST_size); - - SSCounters[new_set]++; - + assert(new_set < LFSTSize); DPRINTF(StoreSet, "StoreSet: Neither load nor store had a valid " "storeset, creating a new one: %i for load %#x, store %#x\n", @@ -98,9 +128,7 @@ StoreSet::violation(Addr store_PC, Addr load_PC) SSIT[store_index] = load_SSID; - assert(load_SSID < LFST_size); - - SSCounters[load_SSID]++; + assert(load_SSID < LFSTSize); DPRINTF(StoreSet, "StoreSet: Load had a valid store set. Adding " "store to that set: %i for load %#x, store %#x\n", @@ -112,9 +140,6 @@ StoreSet::violation(Addr store_PC, Addr load_PC) SSIT[load_index] = store_SSID; - // Because we are having a load point to an already existing set, - // the size of the store set is not incremented. 
- DPRINTF(StoreSet, "StoreSet: Store had a valid store set: %i for " "load %#x, store %#x\n", store_SSID, load_PC, store_PC); @@ -122,29 +147,19 @@ StoreSet::violation(Addr store_PC, Addr load_PC) SSID load_SSID = SSIT[load_index]; SSID store_SSID = SSIT[store_index]; - assert(load_SSID < LFST_size && store_SSID < LFST_size); + assert(load_SSID < LFSTSize && store_SSID < LFSTSize); - int load_SS_size = SSCounters[load_SSID]; - int store_SS_size = SSCounters[store_SSID]; - - // If the load has the bigger store set, then assign the store - // to the same store set as the load. Otherwise vice-versa. - if (load_SS_size > store_SS_size) { + // The store set with the lower number wins + if (store_SSID > load_SSID) { SSIT[store_index] = load_SSID; - SSCounters[load_SSID]++; - SSCounters[store_SSID]--; - - DPRINTF(StoreSet, "StoreSet: Load had bigger store set: %i; " + DPRINTF(StoreSet, "StoreSet: Load had smaller store set: %i; " "for load %#x, store %#x\n", load_SSID, load_PC, store_PC); } else { SSIT[load_index] = store_SSID; - SSCounters[store_SSID]++; - SSCounters[load_SSID]--; - - DPRINTF(StoreSet, "StoreSet: Store had bigger store set: %i; " + DPRINTF(StoreSet, "StoreSet: Store had smaller store set: %i; " "for load %#x, store %#x\n", store_SSID, load_PC, store_PC); } @@ -159,13 +174,14 @@ StoreSet::insertLoad(Addr load_PC, InstSeqNum load_seq_num) } void -StoreSet::insertStore(Addr store_PC, InstSeqNum store_seq_num) +StoreSet::insertStore(Addr store_PC, InstSeqNum store_seq_num, + unsigned tid) { int index = calcIndex(store_PC); int store_SSID; - assert(index < SSIT_size); + assert(index < SSITSize); if (!validSSIT[index]) { // Do nothing if there's no valid entry. @@ -173,13 +189,15 @@ StoreSet::insertStore(Addr store_PC, InstSeqNum store_seq_num) } else { store_SSID = SSIT[index]; - assert(store_SSID < LFST_size); + assert(store_SSID < LFSTSize); // Update the last store that was fetched with the current one. 
LFST[store_SSID] = store_seq_num; validLFST[store_SSID] = 1; + storeList[store_seq_num] = store_SSID; + DPRINTF(StoreSet, "Store %#x updated the LFST, SSID: %i\n", store_PC, store_SSID); } @@ -192,7 +210,7 @@ StoreSet::checkInst(Addr PC) int inst_SSID; - assert(index < SSIT_size); + assert(index < SSITSize); if (!validSSIT[index]) { DPRINTF(StoreSet, "Inst %#x with index %i had no SSID\n", @@ -203,7 +221,7 @@ StoreSet::checkInst(Addr PC) } else { inst_SSID = SSIT[index]; - assert(inst_SSID < LFST_size); + assert(inst_SSID < LFSTSize); if (!validLFST[inst_SSID]) { @@ -232,7 +250,13 @@ StoreSet::issued(Addr issued_PC, InstSeqNum issued_seq_num, bool is_store) int store_SSID; - assert(index < SSIT_size); + assert(index < SSITSize); + + SeqNumMapIt store_list_it = storeList.find(issued_seq_num); + + if (store_list_it != storeList.end()) { + storeList.erase(store_list_it); + } // Make sure the SSIT still has a valid entry for the issued store. if (!validSSIT[index]) { @@ -241,7 +265,7 @@ StoreSet::issued(Addr issued_PC, InstSeqNum issued_seq_num, bool is_store) store_SSID = SSIT[index]; - assert(store_SSID < LFST_size); + assert(store_SSID < LFSTSize); // If the last fetched store in the store set refers to the store that // was just issued, then invalidate the entry. @@ -252,18 +276,36 @@ StoreSet::issued(Addr issued_PC, InstSeqNum issued_seq_num, bool is_store) } void -StoreSet::squash(InstSeqNum squashed_num) +StoreSet::squash(InstSeqNum squashed_num, unsigned tid) { // Not really sure how to do this well. // Generally this is small enough that it should be okay; short circuit // evaluation should take care of invalid entries. + // Maybe keep a list of valid LFST's? Really ugly either way... 
DPRINTF(StoreSet, "StoreSet: Squashing until inum %i\n", squashed_num); - for (int i = 0; i < LFST_size; ++i) { - if (validLFST[i] && LFST[i] < squashed_num) { - validLFST[i] = false; + int idx; + SeqNumMapIt store_list_it = storeList.begin(); + + //@todo:Fix to only delete from correct thread + while (!storeList.empty()) { + idx = (*store_list_it).second; + + if ((*store_list_it).first <= squashed_num) { + break; + } + + bool younger = LFST[idx] > squashed_num; + + if (validLFST[idx] && younger) { + DPRINTF(StoreSet, "Squashed [sn:%lli]\n", LFST[idx]); + validLFST[idx] = false; + + storeList.erase(store_list_it++); + } else if (!validLFST[idx] && younger) { + storeList.erase(store_list_it++); } } } @@ -271,12 +313,13 @@ StoreSet::squash(InstSeqNum squashed_num) void StoreSet::clear() { - for (int i = 0; i < SSIT_size; ++i) { + for (int i = 0; i < SSITSize; ++i) { validSSIT[i] = false; } - for (int i = 0; i < LFST_size; ++i) { + for (int i = 0; i < LFSTSize; ++i) { validLFST[i] = false; } -} + storeList.clear(); +} diff --git a/cpu/o3/store_set.hh b/cpu/o3/store_set.hh index 5a885d838..7189db3ab 100644 --- a/cpu/o3/store_set.hh +++ b/cpu/o3/store_set.hh @@ -26,61 +26,80 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __CPU_O3_CPU_STORE_SET_HH__ -#define __CPU_O3_CPU_STORE_SET_HH__ +#ifndef __CPU_O3_STORE_SET_HH__ +#define __CPU_O3_STORE_SET_HH__ +#include +#include +#include #include #include "arch/isa_traits.hh" #include "cpu/inst_seq.hh" +struct ltseqnum { + bool operator()(const InstSeqNum &lhs, const InstSeqNum &rhs) const + { + return lhs > rhs; + } +}; + class StoreSet { public: typedef unsigned SSID; public: + StoreSet() { }; + StoreSet(int SSIT_size, int LFST_size); + ~StoreSet(); + + void init(int SSIT_size, int LFST_size); + void violation(Addr store_PC, Addr load_PC); void insertLoad(Addr load_PC, InstSeqNum load_seq_num); - void insertStore(Addr store_PC, InstSeqNum store_seq_num); + void insertStore(Addr store_PC, InstSeqNum store_seq_num, + unsigned tid); InstSeqNum checkInst(Addr PC); void issued(Addr issued_PC, InstSeqNum issued_seq_num, bool is_store); - void squash(InstSeqNum squashed_num); + void squash(InstSeqNum squashed_num, unsigned tid); void clear(); private: inline int calcIndex(Addr PC) - { return (PC >> offset_bits) & index_mask; } + { return (PC >> offsetBits) & indexMask; } inline SSID calcSSID(Addr PC) - { return ((PC ^ (PC >> 10)) % LFST_size); } + { return ((PC ^ (PC >> 10)) % LFSTSize); } - SSID *SSIT; + std::vector SSIT; std::vector validSSIT; - InstSeqNum *LFST; + std::vector LFST; std::vector validLFST; - int *SSCounters; + std::map storeList; - int SSIT_size; + typedef std::map::iterator SeqNumMapIt; - int LFST_size; + int SSITSize; - int index_mask; + int LFSTSize; + + int indexMask; // HACK: Hardcoded for now. 
- int offset_bits; + int offsetBits; }; -#endif // __CPU_O3_CPU_STORE_SET_HH__ +#endif // __CPU_O3_STORE_SET_HH__ diff --git a/cpu/o3/thread_state.hh b/cpu/o3/thread_state.hh new file mode 100644 index 000000000..846f44176 --- /dev/null +++ b/cpu/o3/thread_state.hh @@ -0,0 +1,143 @@ + +#ifndef __CPU_O3_THREAD_STATE_HH__ +#define __CPU_O3_THREAD_STATE_HH__ + +#include "arch/faults.hh" +#include "arch/isa_traits.hh" +#include "cpu/exec_context.hh" +#include "cpu/thread_state.hh" + +class Event; +class Process; + +#if FULL_SYSTEM +class EndQuiesceEvent; +class FunctionProfile; +class ProfileNode; +#else +class Process; +class FunctionalMemory; +#endif + +// In the new CPU case this may be quite small...It depends on what I define +// ThreadState to be. Currently it's only the state that exists within +// ExecContext basically. Leaves the interface and manipulation up to the +// CPU. Not sure this is useful/flexible...probably can be if I can avoid +// including state here that parts of the pipeline can't modify directly, +// or at least don't let them. The only problem is for state that's needed +// per thread, per structure. I.e. rename table, memreqs. +// On the other hand, it might be nice to not have to pay the extra pointer +// lookup to get frequently used state such as a memreq (that isn't used much +// elsewhere)... + +// Maybe this ozone thread state should only really have committed state? +// I need to think about why I'm using this and what it's useful for. Clearly +// has benefits for SMT; basically serves same use as CPUExecContext. +// Makes the ExecContext proxy easier. Gives organization/central access point +// to state of a thread that can be accessed normally (i.e. not in-flight +// stuff within a OoO processor). Does this need an XC proxy within it? +template +struct O3ThreadState : public ThreadState { + typedef ExecContext::Status Status; + typedef typename Impl::FullCPU FullCPU; + + Status _status; + + // Current instruction? 
+ TheISA::MachInst inst; + private: + FullCPU *cpu; + public: + + bool inSyscall; + + bool trapPending; + +#if FULL_SYSTEM + O3ThreadState(FullCPU *_cpu, int _thread_num, FunctionalMemory *_mem) + : ThreadState(-1, _thread_num, _mem), + inSyscall(0), trapPending(0) + { } +#else + O3ThreadState(FullCPU *_cpu, int _thread_num, Process *_process, int _asid) + : ThreadState(-1, _thread_num, NULL, _process, _asid), + cpu(_cpu), inSyscall(0), trapPending(0) + { } + + O3ThreadState(FullCPU *_cpu, int _thread_num, FunctionalMemory *_mem, + int _asid) + : ThreadState(-1, _thread_num, _mem, NULL, _asid), + cpu(_cpu), inSyscall(0), trapPending(0) + { } +#endif + + ExecContext *xcProxy; + + ExecContext *getXCProxy() { return xcProxy; } + + Status status() const { return _status; } + + void setStatus(Status new_status) { _status = new_status; } + +#if !FULL_SYSTEM + + Fault dummyTranslation(MemReqPtr &req) + { +#if 0 + assert((req->vaddr >> 48 & 0xffff) == 0); +#endif + + // put the asid in the upper 16 bits of the paddr + req->paddr = req->vaddr & ~((Addr)0xffff << sizeof(Addr) * 8 - 16); + req->paddr = req->paddr | (Addr)req->asid << sizeof(Addr) * 8 - 16; + return NoFault; + } + Fault translateInstReq(MemReqPtr &req) + { + return dummyTranslation(req); + } + Fault translateDataReadReq(MemReqPtr &req) + { + return dummyTranslation(req); + } + Fault translateDataWriteReq(MemReqPtr &req) + { + return dummyTranslation(req); + } + + bool validInstAddr(Addr addr) + { return process->validInstAddr(addr); } + + bool validDataAddr(Addr addr) + { return process->validDataAddr(addr); } +#else + Fault translateInstReq(MemReqPtr &req) + { + return cpu->itb->translate(req); + } + + Fault translateDataReadReq(MemReqPtr &req) + { + return cpu->dtb->translate(req, false); + } + + Fault translateDataWriteReq(MemReqPtr &req) + { + return cpu->dtb->translate(req, true); + } +#endif + + bool misspeculating() { return false; } + + void setInst(TheISA::MachInst _inst) { inst = _inst; } + + Counter 
readFuncExeInst() { return funcExeInst; } + + void setFuncExeInst(Counter new_val) { funcExeInst = new_val; } + +#if !FULL_SYSTEM + void syscall() { process->syscall(xcProxy); } +#endif +}; + +#endif // __CPU_O3_THREAD_STATE_HH__ diff --git a/cpu/o3/tournament_pred.cc b/cpu/o3/tournament_pred.cc index 3fb580510..89da7b9f5 100644 --- a/cpu/o3/tournament_pred.cc +++ b/cpu/o3/tournament_pred.cc @@ -28,37 +28,37 @@ #include "cpu/o3/tournament_pred.hh" -TournamentBP::TournamentBP(unsigned _local_predictor_size, - unsigned _local_ctr_bits, - unsigned _local_history_table_size, - unsigned _local_history_bits, - unsigned _global_predictor_size, - unsigned _global_ctr_bits, - unsigned _global_history_bits, - unsigned _choice_predictor_size, - unsigned _choice_ctr_bits, +TournamentBP::TournamentBP(unsigned _localPredictorSize, + unsigned _localCtrBits, + unsigned _localHistoryTableSize, + unsigned _localHistoryBits, + unsigned _globalPredictorSize, + unsigned _globalCtrBits, + unsigned _globalHistoryBits, + unsigned _choicePredictorSize, + unsigned _choiceCtrBits, unsigned _instShiftAmt) - : localPredictorSize(_local_predictor_size), - localCtrBits(_local_ctr_bits), - localHistoryTableSize(_local_history_table_size), - localHistoryBits(_local_history_bits), - globalPredictorSize(_global_predictor_size), - globalCtrBits(_global_ctr_bits), - globalHistoryBits(_global_history_bits), - choicePredictorSize(_global_predictor_size), - choiceCtrBits(_choice_ctr_bits), + : localPredictorSize(_localPredictorSize), + localCtrBits(_localCtrBits), + localHistoryTableSize(_localHistoryTableSize), + localHistoryBits(_localHistoryBits), + globalPredictorSize(_globalPredictorSize), + globalCtrBits(_globalCtrBits), + globalHistoryBits(_globalHistoryBits), + choicePredictorSize(_globalPredictorSize), + choiceCtrBits(_choiceCtrBits), instShiftAmt(_instShiftAmt) { //Should do checks here to make sure sizes are correct (powers of 2) //Setup the array of counters for the local predictor - 
localCtrs = new SatCounter[localPredictorSize]; + localCtrs.resize(localPredictorSize); for (int i = 0; i < localPredictorSize; ++i) localCtrs[i].setBits(localCtrBits); //Setup the history table for the local table - localHistoryTable = new unsigned[localHistoryTableSize]; + localHistoryTable.resize(localHistoryTableSize); for (int i = 0; i < localHistoryTableSize; ++i) localHistoryTable[i] = 0; @@ -67,7 +67,7 @@ TournamentBP::TournamentBP(unsigned _local_predictor_size, localHistoryMask = (1 << localHistoryBits) - 1; //Setup the array of counters for the global predictor - globalCtrs = new SatCounter[globalPredictorSize]; + globalCtrs.resize(globalPredictorSize); for (int i = 0; i < globalPredictorSize; ++i) globalCtrs[i].setBits(globalCtrBits); @@ -78,7 +78,7 @@ TournamentBP::TournamentBP(unsigned _local_predictor_size, globalHistoryMask = (1 << globalHistoryBits) - 1; //Setup the array of counters for the choice predictor - choiceCtrs = new SatCounter[choicePredictorSize]; + choiceCtrs.resize(choicePredictorSize); for (int i = 0; i < choicePredictorSize; ++i) choiceCtrs[i].setBits(choiceCtrBits); @@ -240,8 +240,7 @@ TournamentBP::update(Addr &branch_addr, unsigned correct_gh, bool taken) globalHistory = globalHistory & globalHistoryMask; localHistoryTable[local_history_idx] |= 1; - } - else { + } else { assert(globalHistory < globalPredictorSize && local_predictor_idx < localPredictorSize); diff --git a/cpu/o3/tournament_pred.hh b/cpu/o3/tournament_pred.hh index cb93c2f67..7b600aa53 100644 --- a/cpu/o3/tournament_pred.hh +++ b/cpu/o3/tournament_pred.hh @@ -26,12 +26,13 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __CPU_O3_CPU_TOURNAMENT_PRED_HH__ -#define __CPU_O3_CPU_TOURNAMENT_PRED_HH__ +#ifndef __CPU_O3_TOURNAMENT_PRED_HH__ +#define __CPU_O3_TOURNAMENT_PRED_HH__ // For Addr type. 
#include "arch/isa_traits.hh" #include "cpu/o3/sat_counter.hh" +#include class TournamentBP { @@ -39,15 +40,15 @@ class TournamentBP /** * Default branch predictor constructor. */ - TournamentBP(unsigned local_predictor_size, - unsigned local_ctr_bits, - unsigned local_history_table_size, - unsigned local_history_bits, - unsigned global_predictor_size, - unsigned global_history_bits, - unsigned global_ctr_bits, - unsigned choice_predictor_size, - unsigned choice_ctr_bits, + TournamentBP(unsigned localPredictorSize, + unsigned localCtrBits, + unsigned localHistoryTableSize, + unsigned localHistoryBits, + unsigned globalPredictorSize, + unsigned globalHistoryBits, + unsigned globalCtrBits, + unsigned choicePredictorSize, + unsigned choiceCtrBits, unsigned instShiftAmt); /** @@ -78,7 +79,7 @@ class TournamentBP inline void updateHistoriesNotTaken(unsigned local_history_idx); /** Local counters. */ - SatCounter *localCtrs; + std::vector localCtrs; /** Size of the local predictor. */ unsigned localPredictorSize; @@ -87,7 +88,7 @@ class TournamentBP unsigned localCtrBits; /** Array of local history table entries. */ - unsigned *localHistoryTable; + std::vector localHistoryTable; /** Size of the local history table. */ unsigned localHistoryTableSize; @@ -102,7 +103,7 @@ class TournamentBP /** Array of counters that make up the global predictor. */ - SatCounter *globalCtrs; + std::vector globalCtrs; /** Size of the global predictor. */ unsigned globalPredictorSize; @@ -121,7 +122,7 @@ class TournamentBP /** Array of counters that make up the choice predictor. */ - SatCounter *choiceCtrs; + std::vector choiceCtrs; /** Size of the choice predictor (identical to the global predictor). 
*/ unsigned choicePredictorSize; @@ -140,4 +141,4 @@ class TournamentBP unsigned threshold; }; -#endif // __CPU_O3_CPU_TOURNAMENT_PRED_HH__ +#endif // __CPU_O3_TOURNAMENT_PRED_HH__ diff --git a/cpu/thread_state.hh b/cpu/thread_state.hh new file mode 100644 index 000000000..e8381b9d3 --- /dev/null +++ b/cpu/thread_state.hh @@ -0,0 +1,92 @@ + +#ifndef __CPU_THREAD_STATE_HH__ +#define __CPU_THREAD_STATE_HH__ + +#include "cpu/exec_context.hh" + +#if FULL_SYSTEM +class EndQuiesceEvent; +class FunctionProfile; +class ProfileNode; +#else +class Process; +class FunctionalMemory; +#endif + +struct ThreadState { +#if FULL_SYSTEM + ThreadState(int _cpuId, int _tid, FunctionalMemory *_mem) + : cpuId(_cpuId), tid(_tid), mem(_mem), lastActivate(0), lastSuspend(0), + profile(NULL), profileNode(NULL), profilePC(0), quiesceEvent(NULL) +#else + ThreadState(int _cpuId, int _tid, FunctionalMemory *_mem, + Process *_process, short _asid) + : cpuId(_cpuId), tid(_tid), mem(_mem), process(_process), asid(_asid) +#endif + { + funcExeInst = 0; + storeCondFailures = 0; + } + + ExecContext::Status status; + + int cpuId; + + // Index of hardware thread context on the CPU that this represents. + int tid; + + Counter numInst; + Stats::Scalar<> numInsts; + Stats::Scalar<> numMemRefs; + + // number of simulated loads + Counter numLoad; + Counter startNumLoad; + + FunctionalMemory *mem; // functional storage for process address space + +#if FULL_SYSTEM + Tick lastActivate; + Tick lastSuspend; + + FunctionProfile *profile; + ProfileNode *profileNode; + Addr profilePC; + + EndQuiesceEvent *quiesceEvent; + +#else + Process *process; + + // Address space ID. Note that this is used for TIMING cache + // simulation only; all functional memory accesses should use + // one of the FunctionalMemory pointers above. + short asid; + +#endif + + /** + * Temporary storage to pass the source address from copy_load to + * copy_store. + * @todo Remove this temporary when we have a better way to do it. 
+ */ + Addr copySrcAddr; + /** + * Temp storage for the physical source address of a copy. + * @todo Remove this temporary when we have a better way to do it. + */ + Addr copySrcPhysAddr; + + /* + * number of executed instructions, for matching with syscall trace + * points in EIO files. + */ + Counter funcExeInst; + + // + // Count failed store conditionals so we can warn of apparent + // application deadlock situations. + unsigned storeCondFailures; +}; + +#endif // __CPU_THREAD_STATE_HH__ diff --git a/python/m5/objects/FUPool.py b/python/m5/objects/FUPool.py new file mode 100644 index 000000000..5eecfd12f --- /dev/null +++ b/python/m5/objects/FUPool.py @@ -0,0 +1,8 @@ +from m5 import * +from FullCPU import OpType +from FullCPU import OpDesc +from FullCPU import FUDesc + +class FUPool(SimObject): + type = 'FUPool' + FUList = VectorParam.FUDesc("list of FU's for this pool") From 759ff4b91024835d3bf436b993b0f39e276c36fe Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Sat, 22 Apr 2006 18:45:01 -0400 Subject: [PATCH 05/50] Updates for OzoneCPU. build/SConstruct: Include Ozone CPU models. cpu/cpu_models.py: Include OzoneCPU models. 
--HG-- extra : convert_revision : 51a016c216cacd2cc613eed79653026c2edda4b3 --- build/SConstruct | 4 +- cpu/cpu_models.py | 6 + cpu/ozone/back_end.cc | 5 + cpu/ozone/back_end.hh | 509 ++++++++ cpu/ozone/back_end_impl.hh | 1853 +++++++++++++++++++++++++++ cpu/ozone/cpu.cc | 9 +- cpu/ozone/cpu.hh | 610 ++++----- cpu/ozone/cpu_builder.cc | 818 ++++++++++++ cpu/ozone/cpu_impl.hh | 1140 +++++++++++++++- cpu/ozone/dyn_inst.cc | 35 + cpu/ozone/dyn_inst.hh | 261 ++++ cpu/ozone/dyn_inst_impl.hh | 286 +++++ cpu/ozone/front_end.cc | 7 + cpu/ozone/front_end.hh | 242 ++++ cpu/ozone/front_end_impl.hh | 798 ++++++++++++ cpu/ozone/inorder_back_end.cc | 5 + cpu/ozone/inorder_back_end.hh | 417 ++++++ cpu/ozone/inorder_back_end_impl.hh | 519 ++++++++ cpu/ozone/inst_queue.cc | 36 + cpu/ozone/inst_queue.hh | 506 ++++++++ cpu/ozone/inst_queue_impl.hh | 1341 +++++++++++++++++++ cpu/ozone/lsq_unit.cc | 34 + cpu/ozone/lsq_unit.hh | 632 +++++++++ cpu/ozone/lsq_unit_impl.hh | 846 ++++++++++++ cpu/ozone/null_predictor.hh | 76 ++ cpu/ozone/ozone_impl.hh | 73 ++ cpu/ozone/rename_table.cc | 7 + cpu/ozone/rename_table.hh | 25 + cpu/ozone/rename_table_impl.hh | 23 + cpu/ozone/simple_impl.hh | 69 + cpu/ozone/simple_params.hh | 164 +++ cpu/ozone/thread_state.hh | 171 +++ python/m5/objects/OzoneCPU.py | 86 ++ python/m5/objects/SimpleOzoneCPU.py | 86 ++ 34 files changed, 11324 insertions(+), 375 deletions(-) create mode 100644 cpu/ozone/back_end.cc create mode 100644 cpu/ozone/back_end.hh create mode 100644 cpu/ozone/back_end_impl.hh create mode 100644 cpu/ozone/cpu_builder.cc create mode 100644 cpu/ozone/dyn_inst.cc create mode 100644 cpu/ozone/dyn_inst.hh create mode 100644 cpu/ozone/dyn_inst_impl.hh create mode 100644 cpu/ozone/front_end.cc create mode 100644 cpu/ozone/front_end.hh create mode 100644 cpu/ozone/front_end_impl.hh create mode 100644 cpu/ozone/inorder_back_end.cc create mode 100644 cpu/ozone/inorder_back_end.hh create mode 100644 cpu/ozone/inorder_back_end_impl.hh create mode 100644 
cpu/ozone/inst_queue.cc create mode 100644 cpu/ozone/inst_queue.hh create mode 100644 cpu/ozone/inst_queue_impl.hh create mode 100644 cpu/ozone/lsq_unit.cc create mode 100644 cpu/ozone/lsq_unit.hh create mode 100644 cpu/ozone/lsq_unit_impl.hh create mode 100644 cpu/ozone/null_predictor.hh create mode 100644 cpu/ozone/ozone_impl.hh create mode 100644 cpu/ozone/rename_table.cc create mode 100644 cpu/ozone/rename_table.hh create mode 100644 cpu/ozone/rename_table_impl.hh create mode 100644 cpu/ozone/simple_impl.hh create mode 100644 cpu/ozone/simple_params.hh create mode 100644 cpu/ozone/thread_state.hh create mode 100644 python/m5/objects/OzoneCPU.py create mode 100644 python/m5/objects/SimpleOzoneCPU.py diff --git a/build/SConstruct b/build/SConstruct index 306d3a9dc..c40f59bc2 100644 --- a/build/SConstruct +++ b/build/SConstruct @@ -222,7 +222,9 @@ env = conf.Finish() env['ALL_ISA_LIST'] = ['alpha', 'sparc', 'mips'] # Define the universe of supported CPU models -env['ALL_CPU_LIST'] = ['SimpleCPU', 'FastCPU', 'FullCPU', 'AlphaFullCPU'] +env['ALL_CPU_LIST'] = ['SimpleCPU', 'FastCPU', 'FullCPU', 'AlphaFullCPU', + 'OzoneSimpleCPU', 'OzoneCPU'] + # Sticky options get saved in the options file so they persist from # one invocation to the next (unless overridden, in which case the new diff --git a/cpu/cpu_models.py b/cpu/cpu_models.py index 675204e5b..8912673f7 100644 --- a/cpu/cpu_models.py +++ b/cpu/cpu_models.py @@ -68,4 +68,10 @@ CpuModel('FullCPU', 'full_cpu_exec.cc', CpuModel('AlphaFullCPU', 'alpha_o3_exec.cc', '#include "cpu/o3/alpha_dyn_inst.hh"', { 'CPU_exec_context': 'AlphaDynInst' }) +CpuModel('OzoneSimpleCPU', 'ozone_simple_exec.cc', + '#include "cpu/ozone/dyn_inst.hh"', + { 'CPU_exec_context': 'OzoneDynInst' }) +CpuModel('OzoneCPU', 'ozone_exec.cc', + '#include "cpu/ozone/dyn_inst.hh"', + { 'CPU_exec_context': 'OzoneDynInst' }) diff --git a/cpu/ozone/back_end.cc b/cpu/ozone/back_end.cc new file mode 100644 index 000000000..dbab5435e --- /dev/null +++ 
b/cpu/ozone/back_end.cc @@ -0,0 +1,5 @@ + +#include "cpu/ozone/back_end_impl.hh" +#include "cpu/ozone/ozone_impl.hh" + +template class BackEnd; diff --git a/cpu/ozone/back_end.hh b/cpu/ozone/back_end.hh new file mode 100644 index 000000000..0713a0143 --- /dev/null +++ b/cpu/ozone/back_end.hh @@ -0,0 +1,509 @@ + +#ifndef __CPU_OZONE_BACK_END_HH__ +#define __CPU_OZONE_BACK_END_HH__ + +#include +#include +#include + +#include "arch/faults.hh" +#include "base/timebuf.hh" +#include "cpu/inst_seq.hh" +#include "cpu/ozone/rename_table.hh" +#include "cpu/ozone/thread_state.hh" +#include "mem/functional/functional.hh" +#include "mem/mem_interface.hh" +#include "mem/mem_req.hh" +#include "sim/eventq.hh" + +class ExecContext; + +template +class OzoneThreadState; + +template +class BackEnd +{ + public: + typedef OzoneThreadState Thread; + + typedef typename Impl::Params Params; + typedef typename Impl::DynInst DynInst; + typedef typename Impl::DynInstPtr DynInstPtr; + typedef typename Impl::FullCPU FullCPU; + typedef typename Impl::FrontEnd FrontEnd; + typedef typename Impl::FullCPU::CommStruct CommStruct; + + struct SizeStruct { + int size; + }; + + typedef SizeStruct DispatchToIssue; + typedef SizeStruct IssueToExec; + typedef SizeStruct ExecToCommit; + typedef SizeStruct Writeback; + + TimeBuffer d2i; + typename TimeBuffer::wire instsToDispatch; + TimeBuffer i2e; + typename TimeBuffer::wire instsToExecute; + TimeBuffer e2c; + TimeBuffer numInstsToWB; + + TimeBuffer *comm; + typename TimeBuffer::wire toIEW; + typename TimeBuffer::wire fromCommit; + + class InstQueue { + enum queue { + NonSpec, + IQ, + ToBeScheduled, + ReadyList, + ReplayList + }; + struct pqCompare { + bool operator() (const DynInstPtr &lhs, const DynInstPtr &rhs) const + { + return lhs->seqNum > rhs->seqNum; + } + }; + public: + InstQueue(Params *params); + + std::string name() const; + + void regStats(); + + void setIssueExecQueue(TimeBuffer *i2e_queue); + + void setBE(BackEnd *_be) { be = _be; } + + void 
insert(DynInstPtr &inst); + + void scheduleReadyInsts(); + + void scheduleNonSpec(const InstSeqNum &sn); + + DynInstPtr getReadyInst(); + + void commit(const InstSeqNum &sn) {} + + void squash(const InstSeqNum &sn); + + int wakeDependents(DynInstPtr &inst); + + /** Tells memory dependence unit that a memory instruction needs to be + * rescheduled. It will re-execute once replayMemInst() is called. + */ + void rescheduleMemInst(DynInstPtr &inst); + + /** Re-executes all rescheduled memory instructions. */ + void replayMemInst(DynInstPtr &inst); + + /** Completes memory instruction. */ + void completeMemInst(DynInstPtr &inst); + + void violation(DynInstPtr &inst, DynInstPtr &violation) { } + + bool isFull() { return numInsts >= size; } + + void dumpInsts(); + + private: + bool find(queue q, typename std::list::iterator it); + BackEnd *be; + TimeBuffer *i2e; + typename TimeBuffer::wire numIssued; + typedef typename std::list InstList; + typedef typename std::list::iterator InstListIt; + typedef typename std::priority_queue, pqCompare> ReadyInstQueue; + // Not sure I need the IQ list; it just needs to be a count. + InstList iq; + InstList toBeScheduled; + InstList readyList; + InstList nonSpec; + InstList replayList; + ReadyInstQueue readyQueue; + int size; + int numInsts; + int width; + + Stats::VectorDistribution<> occ_dist; + + Stats::Vector<> inst_count; + Stats::Vector<> peak_inst_count; + Stats::Scalar<> empty_count; + Stats::Scalar<> current_count; + Stats::Scalar<> fullCount; + + Stats::Formula occ_rate; + Stats::Formula avg_residency; + Stats::Formula empty_rate; + Stats::Formula full_rate; + }; + + /** LdWriteback event for a load completion. */ + class LdWritebackEvent : public Event { + private: + /** Instruction that is writing back data to the register file. */ + DynInstPtr inst; + /** Pointer to IEW stage. */ + BackEnd *be; + + public: + /** Constructs a load writeback event. 
*/ + LdWritebackEvent(DynInstPtr &_inst, BackEnd *be); + + /** Processes writeback event. */ + virtual void process(); + /** Returns the description of the writeback event. */ + virtual const char *description(); + }; + + BackEnd(Params *params); + + std::string name() const; + + void regStats(); + + void setCPU(FullCPU *cpu_ptr) + { cpu = cpu_ptr; } + + void setFrontEnd(FrontEnd *front_end_ptr) + { frontEnd = front_end_ptr; } + + void setXC(ExecContext *xc_ptr) + { xc = xc_ptr; } + + void setThreadState(Thread *thread_ptr) + { thread = thread_ptr; } + + void setCommBuffer(TimeBuffer *_comm); + + void tick(); + void squash(); + void squashFromXC(); + bool xcSquash; + + template + Fault read(MemReqPtr &req, T &data, int load_idx); + + template + Fault write(MemReqPtr &req, T &data, int store_idx); + + Addr readCommitPC() { return commitPC; } + + Addr commitPC; + + bool robEmpty() { return instList.empty(); } + + bool isFull() { return numInsts >= numROBEntries; } + bool isBlocked() { return status == Blocked || dispatchStatus == Blocked; } + + /** Tells memory dependence unit that a memory instruction needs to be + * rescheduled. It will re-execute once replayMemInst() is called. + */ + void rescheduleMemInst(DynInstPtr &inst) + { IQ.rescheduleMemInst(inst); } + + /** Re-executes all rescheduled memory instructions. */ + void replayMemInst(DynInstPtr &inst) + { IQ.replayMemInst(inst); } + + /** Completes memory instruction. 
*/ + void completeMemInst(DynInstPtr &inst) + { IQ.completeMemInst(inst); } + + void fetchFault(Fault &fault); + + private: + void updateStructures(); + void dispatchInsts(); + void dispatchStall(); + void checkDispatchStatus(); + void scheduleReadyInsts(); + void executeInsts(); + void commitInsts(); + void addToIQ(DynInstPtr &inst); + void addToLSQ(DynInstPtr &inst); + void instToCommit(DynInstPtr &inst); + void writebackInsts(); + bool commitInst(int inst_num); + void squash(const InstSeqNum &sn); + void squashDueToBranch(DynInstPtr &inst); + void squashDueToMemBlocked(DynInstPtr &inst); + void updateExeInstStats(DynInstPtr &inst); + void updateComInstStats(DynInstPtr &inst); + + public: + FullCPU *cpu; + + FrontEnd *frontEnd; + + ExecContext *xc; + + Thread *thread; + + enum Status { + Running, + Idle, + DcacheMissStall, + DcacheMissComplete, + Blocked + }; + + Status status; + + Status dispatchStatus; + + Counter funcExeInst; + + private: +// typedef typename Impl::InstQueue InstQueue; + + InstQueue IQ; + + typedef typename Impl::LdstQueue LdstQueue; + + LdstQueue LSQ; + public: + RenameTable commitRenameTable; + + RenameTable renameTable; + private: + class DCacheCompletionEvent : public Event + { + private: + BackEnd *be; + + public: + DCacheCompletionEvent(BackEnd *_be); + + virtual void process(); + virtual const char *description(); + }; + + friend class DCacheCompletionEvent; + + DCacheCompletionEvent cacheCompletionEvent; + + MemInterface *dcacheInterface; + + MemReqPtr memReq; + + // General back end width. Used if the more specific isn't given. + int width; + + // Dispatch width. + int dispatchWidth; + int numDispatchEntries; + int dispatchSize; + + int issueWidth; + + // Writeback width + int wbWidth; + + // Commit width + int commitWidth; + + /** Index into queue of instructions being written back. */ + unsigned wbNumInst; + + /** Cycle number within the queue of instructions being written + * back. 
Used in case there are too many instructions writing + * back at the current cycle and writesbacks need to be scheduled + * for the future. See comments in instToCommit(). + */ + unsigned wbCycle; + + int numROBEntries; + int numInsts; + + private: + typedef typename std::list::iterator InstListIt; + + std::list instList; + std::list dispatch; + std::list writeback; + + int latency; + + int squashLatency; + + bool exactFullStall; + + bool fetchRedirect[Impl::MaxThreads]; + + // number of cycles stalled for D-cache misses +/* Stats::Scalar<> dcacheStallCycles; + Counter lastDcacheStall; +*/ + Stats::Vector<> rob_cap_events; + Stats::Vector<> rob_cap_inst_count; + Stats::Vector<> iq_cap_events; + Stats::Vector<> iq_cap_inst_count; + // total number of instructions executed + Stats::Vector<> exe_inst; + Stats::Vector<> exe_swp; + Stats::Vector<> exe_nop; + Stats::Vector<> exe_refs; + Stats::Vector<> exe_loads; + Stats::Vector<> exe_branches; + + Stats::Vector<> issued_ops; + + // total number of loads forwaded from LSQ stores + Stats::Vector<> lsq_forw_loads; + + // total number of loads ignored due to invalid addresses + Stats::Vector<> inv_addr_loads; + + // total number of software prefetches ignored due to invalid addresses + Stats::Vector<> inv_addr_swpfs; + // ready loads blocked due to memory disambiguation + Stats::Vector<> lsq_blocked_loads; + + Stats::Scalar<> lsqInversion; + + Stats::Vector<> n_issued_dist; + Stats::VectorDistribution<> issue_delay_dist; + + Stats::VectorDistribution<> queue_res_dist; +/* + Stats::Vector<> stat_fu_busy; + Stats::Vector2d<> stat_fuBusy; + Stats::Vector<> dist_unissued; + Stats::Vector2d<> stat_issued_inst_type; + + Stats::Formula misspec_cnt; + Stats::Formula misspec_ipc; + Stats::Formula issue_rate; + Stats::Formula issue_stores; + Stats::Formula issue_op_rate; + Stats::Formula fu_busy_rate; + Stats::Formula commit_stores; + Stats::Formula commit_ipc; + Stats::Formula commit_ipb; + Stats::Formula lsq_inv_rate; +*/ + 
Stats::Vector<> writeback_count; + Stats::Vector<> producer_inst; + Stats::Vector<> consumer_inst; + Stats::Vector<> wb_penalized; + + Stats::Formula wb_rate; + Stats::Formula wb_fanout; + Stats::Formula wb_penalized_rate; + + // total number of instructions committed + Stats::Vector<> stat_com_inst; + Stats::Vector<> stat_com_swp; + Stats::Vector<> stat_com_refs; + Stats::Vector<> stat_com_loads; + Stats::Vector<> stat_com_membars; + Stats::Vector<> stat_com_branches; + + Stats::Distribution<> n_committed_dist; + + Stats::Scalar<> commit_eligible_samples; + Stats::Vector<> commit_eligible; + + Stats::Scalar<> ROB_fcount; + Stats::Formula ROB_full_rate; + + Stats::Vector<> ROB_count; // cumulative ROB occupancy + Stats::Formula ROB_occ_rate; + Stats::VectorDistribution<> ROB_occ_dist; + public: + void dumpInsts(); +}; + +template +template +Fault +BackEnd::read(MemReqPtr &req, T &data, int load_idx) +{ +/* memReq->reset(addr, sizeof(T), flags); + + // translate to physical address + Fault fault = cpu->translateDataReadReq(memReq); + + // if we have a cache, do cache access too + if (fault == NoFault && dcacheInterface) { + memReq->cmd = Read; + memReq->completionEvent = NULL; + memReq->time = curTick; + memReq->flags &= ~INST_READ; + MemAccessResult result = dcacheInterface->access(memReq); + + // Ugly hack to get an event scheduled *only* if the access is + // a miss. We really should add first-class support for this + // at some point. + if (result != MA_HIT && dcacheInterface->doEvents()) { + // Fix this hack for keeping funcExeInst correct with loads that + // are executed twice. 
+ --funcExeInst; + + memReq->completionEvent = &cacheCompletionEvent; + lastDcacheStall = curTick; +// unscheduleTickEvent(); +// status = DcacheMissStall; + DPRINTF(OzoneCPU, "Dcache miss stall!\n"); + } else { + // do functional access + fault = thread->mem->read(memReq, data); + + } + } +*/ +/* + if (!dcacheInterface && (memReq->flags & UNCACHEABLE)) + recordEvent("Uncached Read"); +*/ + return LSQ.read(req, data, load_idx); +} + +template +template +Fault +BackEnd::write(MemReqPtr &req, T &data, int store_idx) +{ +/* + memReq->reset(addr, sizeof(T), flags); + + // translate to physical address + Fault fault = cpu->translateDataWriteReq(memReq); + + if (fault == NoFault && dcacheInterface) { + memReq->cmd = Write; + memcpy(memReq->data,(uint8_t *)&data,memReq->size); + memReq->completionEvent = NULL; + memReq->time = curTick; + memReq->flags &= ~INST_READ; + MemAccessResult result = dcacheInterface->access(memReq); + + // Ugly hack to get an event scheduled *only* if the access is + // a miss. We really should add first-class support for this + // at some point. 
+ if (result != MA_HIT && dcacheInterface->doEvents()) { + memReq->completionEvent = &cacheCompletionEvent; + lastDcacheStall = curTick; +// unscheduleTickEvent(); +// status = DcacheMissStall; + DPRINTF(OzoneCPU, "Dcache miss stall!\n"); + } + } + + if (res && (fault == NoFault)) + *res = memReq->result; + */ +/* + if (!dcacheInterface && (memReq->flags & UNCACHEABLE)) + recordEvent("Uncached Write"); +*/ + return LSQ.write(req, data, store_idx); +} + +#endif // __CPU_OZONE_BACK_END_HH__ diff --git a/cpu/ozone/back_end_impl.hh b/cpu/ozone/back_end_impl.hh new file mode 100644 index 000000000..807afaf2e --- /dev/null +++ b/cpu/ozone/back_end_impl.hh @@ -0,0 +1,1853 @@ + +#include "encumbered/cpu/full/op_class.hh" +#include "cpu/ozone/back_end.hh" + +template +BackEnd::InstQueue::InstQueue(Params *params) + : size(params->numIQEntries), numInsts(0), width(params->issueWidth) +{ +} + +template +std::string +BackEnd::InstQueue::name() const +{ + return be->name() + ".iq"; +} + +template +void +BackEnd::InstQueue::regStats() +{ + using namespace Stats; + + occ_dist + .init(1, 0, size, 2) + .name(name() + "occ_dist") + .desc("IQ Occupancy per cycle") + .flags(total | cdf) + ; + + inst_count + .init(1) + .name(name() + "cum_num_insts") + .desc("Total occupancy") + .flags(total) + ; + + peak_inst_count + .init(1) + .name(name() + "peak_occupancy") + .desc("Peak IQ occupancy") + .flags(total) + ; + + current_count + .name(name() + "current_count") + .desc("Occupancy this cycle") + ; + + empty_count + .name(name() + "empty_count") + .desc("Number of empty cycles") + ; + + fullCount + .name(name() + "full_count") + .desc("Number of full cycles") + ; + + + occ_rate + .name(name() + "occ_rate") + .desc("Average occupancy") + .flags(total) + ; + occ_rate = inst_count / be->cpu->numCycles; + + avg_residency + .name(name() + "avg_residency") + .desc("Average IQ residency") + .flags(total) + ; + avg_residency = occ_rate / be->cpu->numCycles; + + empty_rate + .name(name() + 
"empty_rate") + .desc("Fraction of cycles empty") + ; + empty_rate = 100 * empty_count / be->cpu->numCycles; + + full_rate + .name(name() + "full_rate") + .desc("Fraction of cycles full") + ; + full_rate = 100 * fullCount / be->cpu->numCycles; +} + +template +void +BackEnd::InstQueue::setIssueExecQueue(TimeBuffer *i2e_queue) +{ + i2e = i2e_queue; + numIssued = i2e->getWire(0); +} + +template +void +BackEnd::InstQueue::insert(DynInstPtr &inst) +{ + numInsts++; + inst_count[0]++; + if (!inst->isNonSpeculative()) { + if (inst->readyToIssue()) { + toBeScheduled.push_front(inst); + inst->iqIt = toBeScheduled.begin(); + inst->iqItValid = true; + } else { + iq.push_front(inst); + inst->iqIt = iq.begin(); + inst->iqItValid = true; + } + } else { + nonSpec.push_front(inst); + inst->iqIt = nonSpec.begin(); + inst->iqItValid = true; + } +} + +template +void +BackEnd::InstQueue::scheduleReadyInsts() +{ + int scheduled = numIssued->size; + InstListIt iq_it = --toBeScheduled.end(); + InstListIt iq_end_it = toBeScheduled.end(); + + while (iq_it != iq_end_it && scheduled < width) { +// if ((*iq_it)->readyToIssue()) { + DPRINTF(BE, "Instruction [sn:%lli] PC:%#x is ready\n", + (*iq_it)->seqNum, (*iq_it)->readPC()); + readyQueue.push(*iq_it); + readyList.push_front(*iq_it); + + (*iq_it)->iqIt = readyList.begin(); + + toBeScheduled.erase(iq_it--); + + ++scheduled; +// } else { +// iq_it++; +// } + } + + numIssued->size+= scheduled; +} + +template +void +BackEnd::InstQueue::scheduleNonSpec(const InstSeqNum &sn) +{ +/* + InstListIt non_spec_it = nonSpec.begin(); + InstListIt non_spec_end_it = nonSpec.end(); + + while ((*non_spec_it)->seqNum != sn) { + non_spec_it++; + assert(non_spec_it != non_spec_end_it); + } +*/ + DynInstPtr inst = nonSpec.back(); + + assert(inst->seqNum == sn); + + assert(find(NonSpec, inst->iqIt)); + nonSpec.erase(inst->iqIt); + readyList.push_front(inst); + inst->iqIt = readyList.begin(); + readyQueue.push(inst); + numIssued->size++; +} + +template +typename 
Impl::DynInstPtr +BackEnd::InstQueue::getReadyInst() +{ + assert(!readyList.empty()); + + DynInstPtr inst = readyQueue.top(); + readyQueue.pop(); + assert(find(ReadyList, inst->iqIt)); + readyList.erase(inst->iqIt); + inst->iqItValid = false; +// if (!inst->isMemRef()) + --numInsts; + return inst; +} + +template +void +BackEnd::InstQueue::squash(const InstSeqNum &sn) +{ + InstListIt iq_it = iq.begin(); + InstListIt iq_end_it = iq.end(); + + while (iq_it != iq_end_it && (*iq_it)->seqNum > sn) { + (*iq_it)->iqItValid = false; + iq.erase(iq_it++); + --numInsts; + } + + iq_it = nonSpec.begin(); + iq_end_it = nonSpec.end(); + + while (iq_it != iq_end_it && (*iq_it)->seqNum > sn) { + (*iq_it)->iqItValid = false; + nonSpec.erase(iq_it++); + --numInsts; + } + + iq_it = replayList.begin(); + iq_end_it = replayList.end(); + + while (iq_it != iq_end_it) { + if ((*iq_it)->seqNum > sn) { + (*iq_it)->iqItValid = false; + replayList.erase(iq_it++); + --numInsts; + } else { + iq_it++; + } + } + + assert(numInsts >= 0); +/* + InstListIt ready_it = readyList.begin(); + InstListIt ready_end_it = readyList.end(); + + while (ready_it != ready_end_it) { + if ((*ready_it)->seqNum > sn) { + readyList.erase(ready_it++); + } else { + ready_it++; + } + } +*/ +} + +template +int +BackEnd::InstQueue::wakeDependents(DynInstPtr &inst) +{ + assert(!inst->isSquashed()); + std::vector &dependents = inst->getDependents(); + int num_outputs = dependents.size(); + + for (int i = 0; i < num_outputs; i++) { + DynInstPtr inst = dependents[i]; + inst->markSrcRegReady(); + if (inst->readyToIssue() && inst->iqItValid) { + if (inst->isNonSpeculative()) { + assert(find(NonSpec, inst->iqIt)); + nonSpec.erase(inst->iqIt); + } else { + assert(find(IQ, inst->iqIt)); + iq.erase(inst->iqIt); + } + + toBeScheduled.push_front(inst); + inst->iqIt = toBeScheduled.begin(); + } + } + return num_outputs; +} + +template +void +BackEnd::InstQueue::rescheduleMemInst(DynInstPtr &inst) +{ + assert(!inst->iqItValid); + 
replayList.push_front(inst); + inst->iqIt = replayList.begin(); + inst->iqItValid = true; + ++numInsts; +} + +template +void +BackEnd::InstQueue::replayMemInst(DynInstPtr &inst) +{ + assert(find(ReplayList, inst->iqIt)); + InstListIt iq_it = --replayList.end(); + InstListIt iq_end_it = replayList.end(); + while (iq_it != iq_end_it) { + DynInstPtr rescheduled_inst = (*iq_it); + replayList.erase(iq_it--); + toBeScheduled.push_front(rescheduled_inst); + rescheduled_inst->iqIt = toBeScheduled.begin(); + } +} + +template +void +BackEnd::InstQueue::completeMemInst(DynInstPtr &inst) +{ + panic("Not implemented."); +} + +template +bool +BackEnd::InstQueue::find(queue q, InstListIt it) +{ + InstListIt iq_it, iq_end_it; + switch(q) { + case NonSpec: + iq_it = nonSpec.begin(); + iq_end_it = nonSpec.end(); + break; + case IQ: + iq_it = iq.begin(); + iq_end_it = iq.end(); + break; + case ToBeScheduled: + iq_it = toBeScheduled.begin(); + iq_end_it = toBeScheduled.end(); + break; + case ReadyList: + iq_it = readyList.begin(); + iq_end_it = readyList.end(); + break; + case ReplayList: + iq_it = replayList.begin(); + iq_end_it = replayList.end(); + } + + while (iq_it != it && iq_it != iq_end_it) { + iq_it++; + } + if (iq_it == it) { + return true; + } else { + return false; + } +} + +template +void +BackEnd::InstQueue::dumpInsts() +{ + cprintf("IQ size: %i\n", iq.size()); + + InstListIt inst_list_it = --iq.end(); + + int num = 0; + int valid_num = 0; + while (inst_list_it != iq.end()) + { + cprintf("Instruction:%i\n", + num); + if (!(*inst_list_it)->isSquashed()) { + if (!(*inst_list_it)->isIssued()) { + ++valid_num; + cprintf("Count:%i\n", valid_num); + } else if ((*inst_list_it)->isMemRef() && + !(*inst_list_it)->memOpDone) { + // Loads that have not been marked as executed still count + // towards the total instructions. 
+ ++valid_num; + cprintf("Count:%i\n", valid_num); + } + } + + cprintf("PC:%#x\n[sn:%lli]\n[tid:%i]\n" + "Issued:%i\nSquashed:%i\n", + (*inst_list_it)->readPC(), + (*inst_list_it)->seqNum, + (*inst_list_it)->threadNumber, + (*inst_list_it)->isIssued(), + (*inst_list_it)->isSquashed()); + + if ((*inst_list_it)->isMemRef()) { + cprintf("MemOpDone:%i\n", (*inst_list_it)->memOpDone); + } + + cprintf("\n"); + + inst_list_it--; + ++num; + } + + cprintf("nonSpec size: %i\n", nonSpec.size()); + + inst_list_it = --nonSpec.end(); + + while (inst_list_it != nonSpec.end()) + { + cprintf("Instruction:%i\n", + num); + if (!(*inst_list_it)->isSquashed()) { + if (!(*inst_list_it)->isIssued()) { + ++valid_num; + cprintf("Count:%i\n", valid_num); + } else if ((*inst_list_it)->isMemRef() && + !(*inst_list_it)->memOpDone) { + // Loads that have not been marked as executed still count + // towards the total instructions. + ++valid_num; + cprintf("Count:%i\n", valid_num); + } + } + + cprintf("PC:%#x\n[sn:%lli]\n[tid:%i]\n" + "Issued:%i\nSquashed:%i\n", + (*inst_list_it)->readPC(), + (*inst_list_it)->seqNum, + (*inst_list_it)->threadNumber, + (*inst_list_it)->isIssued(), + (*inst_list_it)->isSquashed()); + + if ((*inst_list_it)->isMemRef()) { + cprintf("MemOpDone:%i\n", (*inst_list_it)->memOpDone); + } + + cprintf("\n"); + + inst_list_it--; + ++num; + } + + cprintf("toBeScheduled size: %i\n", toBeScheduled.size()); + + inst_list_it = --toBeScheduled.end(); + + while (inst_list_it != toBeScheduled.end()) + { + cprintf("Instruction:%i\n", + num); + if (!(*inst_list_it)->isSquashed()) { + if (!(*inst_list_it)->isIssued()) { + ++valid_num; + cprintf("Count:%i\n", valid_num); + } else if ((*inst_list_it)->isMemRef() && + !(*inst_list_it)->memOpDone) { + // Loads that have not been marked as executed still count + // towards the total instructions. 
+ ++valid_num; + cprintf("Count:%i\n", valid_num); + } + } + + cprintf("PC:%#x\n[sn:%lli]\n[tid:%i]\n" + "Issued:%i\nSquashed:%i\n", + (*inst_list_it)->readPC(), + (*inst_list_it)->seqNum, + (*inst_list_it)->threadNumber, + (*inst_list_it)->isIssued(), + (*inst_list_it)->isSquashed()); + + if ((*inst_list_it)->isMemRef()) { + cprintf("MemOpDone:%i\n", (*inst_list_it)->memOpDone); + } + + cprintf("\n"); + + inst_list_it--; + ++num; + } + + cprintf("readyList size: %i\n", readyList.size()); + + inst_list_it = --readyList.end(); + + while (inst_list_it != readyList.end()) + { + cprintf("Instruction:%i\n", + num); + if (!(*inst_list_it)->isSquashed()) { + if (!(*inst_list_it)->isIssued()) { + ++valid_num; + cprintf("Count:%i\n", valid_num); + } else if ((*inst_list_it)->isMemRef() && + !(*inst_list_it)->memOpDone) { + // Loads that have not been marked as executed still count + // towards the total instructions. + ++valid_num; + cprintf("Count:%i\n", valid_num); + } + } + + cprintf("PC:%#x\n[sn:%lli]\n[tid:%i]\n" + "Issued:%i\nSquashed:%i\n", + (*inst_list_it)->readPC(), + (*inst_list_it)->seqNum, + (*inst_list_it)->threadNumber, + (*inst_list_it)->isIssued(), + (*inst_list_it)->isSquashed()); + + if ((*inst_list_it)->isMemRef()) { + cprintf("MemOpDone:%i\n", (*inst_list_it)->memOpDone); + } + + cprintf("\n"); + + inst_list_it--; + ++num; + } +} + +template +BackEnd::LdWritebackEvent::LdWritebackEvent(DynInstPtr &_inst, + BackEnd *_be) + : Event(&mainEventQueue), inst(_inst), be(_be) +{ + this->setFlags(Event::AutoDelete); +} + +template +void +BackEnd::LdWritebackEvent::process() +{ + DPRINTF(BE, "Load writeback event [sn:%lli]\n", inst->seqNum); +// DPRINTF(Activity, "Activity: Ld Writeback event [sn:%lli]\n", inst->seqNum); + + //iewStage->ldstQueue.removeMSHR(inst->threadNumber,inst->seqNum); + +// iewStage->wakeCPU(); + + if (inst->isSquashed()) { + inst = NULL; + return; + } + + if (!inst->isExecuted()) { + inst->setExecuted(); + + // Execute again to copy data 
to proper place. + inst->completeAcc(); + } + + // Need to insert instruction into queue to commit + be->instToCommit(inst); + + //wroteToTimeBuffer = true; +// iewStage->activityThisCycle(); + + inst = NULL; +} + +template +const char * +BackEnd::LdWritebackEvent::description() +{ + return "Load writeback event"; +} + + +template +BackEnd::DCacheCompletionEvent::DCacheCompletionEvent(BackEnd *_be) + : Event(&mainEventQueue, CPU_Tick_Pri), be(_be) +{ +} + +template +void +BackEnd::DCacheCompletionEvent::process() +{ +} + +template +const char * +BackEnd::DCacheCompletionEvent::description() +{ + return "Cache completion event"; +} + +template +BackEnd::BackEnd(Params *params) + : d2i(5, 5), i2e(5, 5), e2c(5, 5), numInstsToWB(5, 5), + xcSquash(false), IQ(params), + cacheCompletionEvent(this), width(params->backEndWidth), + exactFullStall(true) +{ + numROBEntries = params->numROBEntries; + numInsts = 0; + numDispatchEntries = 32; + IQ.setBE(this); + LSQ.setBE(this); + + // Setup IQ and LSQ with their parameters here. + instsToDispatch = d2i.getWire(-1); + + instsToExecute = i2e.getWire(-1); + + IQ.setIssueExecQueue(&i2e); + + dispatchWidth = params->dispatchWidth ? params->dispatchWidth : width; + issueWidth = params->issueWidth ? params->issueWidth : width; + wbWidth = params->wbWidth ? params->wbWidth : width; + commitWidth = params->commitWidth ? 
params->commitWidth : width; + + LSQ.init(params, params->LQEntries, params->SQEntries, 0); + + dispatchStatus = Running; +} + +template +std::string +BackEnd::name() const +{ + return cpu->name() + ".backend"; +} + +template +void +BackEnd::regStats() +{ + using namespace Stats; + rob_cap_events + .init(cpu->number_of_threads) + .name(name() + ".ROB:cap_events") + .desc("number of cycles where ROB cap was active") + .flags(total) + ; + + rob_cap_inst_count + .init(cpu->number_of_threads) + .name(name() + ".ROB:cap_inst") + .desc("number of instructions held up by ROB cap") + .flags(total) + ; + + iq_cap_events + .init(cpu->number_of_threads) + .name(name() +".IQ:cap_events" ) + .desc("number of cycles where IQ cap was active") + .flags(total) + ; + + iq_cap_inst_count + .init(cpu->number_of_threads) + .name(name() + ".IQ:cap_inst") + .desc("number of instructions held up by IQ cap") + .flags(total) + ; + + + exe_inst + .init(cpu->number_of_threads) + .name(name() + ".ISSUE:count") + .desc("number of insts issued") + .flags(total) + ; + + exe_swp + .init(cpu->number_of_threads) + .name(name() + ".ISSUE:swp") + .desc("number of swp insts issued") + .flags(total) + ; + + exe_nop + .init(cpu->number_of_threads) + .name(name() + ".ISSUE:nop") + .desc("number of nop insts issued") + .flags(total) + ; + + exe_refs + .init(cpu->number_of_threads) + .name(name() + ".ISSUE:refs") + .desc("number of memory reference insts issued") + .flags(total) + ; + + exe_loads + .init(cpu->number_of_threads) + .name(name() + ".ISSUE:loads") + .desc("number of load insts issued") + .flags(total) + ; + + exe_branches + .init(cpu->number_of_threads) + .name(name() + ".ISSUE:branches") + .desc("Number of branches issued") + .flags(total) + ; + + issued_ops + .init(cpu->number_of_threads) + .name(name() + ".ISSUE:op_count") + .desc("number of insts issued") + .flags(total) + ; + +/* + for (int i=0; inumber_of_threads) + .name(name() + ".LSQ:forw_loads") + .desc("number of loads forwarded via 
LSQ") + .flags(total) + ; + + inv_addr_loads + .init(cpu->number_of_threads) + .name(name() + ".ISSUE:addr_loads") + .desc("number of invalid-address loads") + .flags(total) + ; + + inv_addr_swpfs + .init(cpu->number_of_threads) + .name(name() + ".ISSUE:addr_swpfs") + .desc("number of invalid-address SW prefetches") + .flags(total) + ; + + lsq_blocked_loads + .init(cpu->number_of_threads) + .name(name() + ".LSQ:blocked_loads") + .desc("number of ready loads not issued due to memory disambiguation") + .flags(total) + ; + + lsqInversion + .name(name() + ".ISSUE:lsq_invert") + .desc("Number of times LSQ instruction issued early") + ; + + n_issued_dist + .init(issueWidth + 1) + .name(name() + ".ISSUE:issued_per_cycle") + .desc("Number of insts issued each cycle") + .flags(total | pdf | dist) + ; + issue_delay_dist + .init(Num_OpClasses,0,99,2) + .name(name() + ".ISSUE:") + .desc("cycles from operands ready to issue") + .flags(pdf | cdf) + ; + + queue_res_dist + .init(Num_OpClasses, 0, 99, 2) + .name(name() + ".IQ:residence:") + .desc("cycles from dispatch to issue") + .flags(total | pdf | cdf ) + ; + for (int i = 0; i < Num_OpClasses; ++i) { + queue_res_dist.subname(i, opClassStrings[i]); + } + + writeback_count + .init(cpu->number_of_threads) + .name(name() + ".WB:count") + .desc("cumulative count of insts written-back") + .flags(total) + ; + + producer_inst + .init(cpu->number_of_threads) + .name(name() + ".WB:producers") + .desc("num instructions producing a value") + .flags(total) + ; + + consumer_inst + .init(cpu->number_of_threads) + .name(name() + ".WB:consumers") + .desc("num instructions consuming a value") + .flags(total) + ; + + wb_penalized + .init(cpu->number_of_threads) + .name(name() + ".WB:penalized") + .desc("number of instrctions required to write to 'other' IQ") + .flags(total) + ; + + + wb_penalized_rate + .name(name() + ".WB:penalized_rate") + .desc ("fraction of instructions written-back that wrote to 'other' IQ") + .flags(total) + ; + + 
wb_penalized_rate = wb_penalized / writeback_count; + + wb_fanout + .name(name() + ".WB:fanout") + .desc("average fanout of values written-back") + .flags(total) + ; + + wb_fanout = producer_inst / consumer_inst; + + wb_rate + .name(name() + ".WB:rate") + .desc("insts written-back per cycle") + .flags(total) + ; + wb_rate = writeback_count / cpu->numCycles; + + stat_com_inst + .init(cpu->number_of_threads) + .name(name() + ".COM:count") + .desc("Number of instructions committed") + .flags(total) + ; + + stat_com_swp + .init(cpu->number_of_threads) + .name(name() + ".COM:swp_count") + .desc("Number of s/w prefetches committed") + .flags(total) + ; + + stat_com_refs + .init(cpu->number_of_threads) + .name(name() + ".COM:refs") + .desc("Number of memory references committed") + .flags(total) + ; + + stat_com_loads + .init(cpu->number_of_threads) + .name(name() + ".COM:loads") + .desc("Number of loads committed") + .flags(total) + ; + + stat_com_membars + .init(cpu->number_of_threads) + .name(name() + ".COM:membars") + .desc("Number of memory barriers committed") + .flags(total) + ; + + stat_com_branches + .init(cpu->number_of_threads) + .name(name() + ".COM:branches") + .desc("Number of branches committed") + .flags(total) + ; + n_committed_dist + .init(0,commitWidth,1) + .name(name() + ".COM:committed_per_cycle") + .desc("Number of insts commited each cycle") + .flags(pdf) + ; + + // + // Commit-Eligible instructions... + // + // -> The number of instructions eligible to commit in those + // cycles where we reached our commit BW limit (less the number + // actually committed) + // + // -> The average value is computed over ALL CYCLES... 
not just + // the BW limited cycles + // + // -> The standard deviation is computed only over cycles where + // we reached the BW limit + // + commit_eligible + .init(cpu->number_of_threads) + .name(name() + ".COM:bw_limited") + .desc("number of insts not committed due to BW limits") + .flags(total) + ; + + commit_eligible_samples + .name(name() + ".COM:bw_lim_events") + .desc("number cycles where commit BW limit reached") + ; + + ROB_fcount + .name(name() + ".ROB:full_count") + .desc("number of cycles where ROB was full") + ; + + ROB_count + .init(cpu->number_of_threads) + .name(name() + ".ROB:occupancy") + .desc(name() + ".ROB occupancy (cumulative)") + .flags(total) + ; + + ROB_full_rate + .name(name() + ".ROB:full_rate") + .desc("ROB full per cycle") + ; + ROB_full_rate = ROB_fcount / cpu->numCycles; + + ROB_occ_rate + .name(name() + ".ROB:occ_rate") + .desc("ROB occupancy rate") + .flags(total) + ; + ROB_occ_rate = ROB_count / cpu->numCycles; + + ROB_occ_dist + .init(cpu->number_of_threads,0,numROBEntries,2) + .name(name() + ".ROB:occ_dist") + .desc("ROB Occupancy per cycle") + .flags(total | cdf) + ; + + IQ.regStats(); +} + +template +void +BackEnd::setCommBuffer(TimeBuffer *_comm) +{ + comm = _comm; + toIEW = comm->getWire(0); + fromCommit = comm->getWire(-1); +} + +template +void +BackEnd::tick() +{ + DPRINTF(BE, "Ticking back end\n"); + + ROB_count[0]+= numInsts; + + wbCycle = 0; + + if (xcSquash) { + squashFromXC(); + } + + // Read in any done instruction information and update the IQ or LSQ. 
+ updateStructures(); + + if (dispatchStatus != Blocked) { + d2i.advance(); + dispatchInsts(); + } else { + checkDispatchStatus(); + } + + i2e.advance(); + scheduleReadyInsts(); + + e2c.advance(); + executeInsts(); + + numInstsToWB.advance(); + writebackInsts(); + + commitInsts(); + + assert(numInsts == instList.size()); +} + +template +void +BackEnd::updateStructures() +{ + if (fromCommit->doneSeqNum) { + IQ.commit(fromCommit->doneSeqNum); + LSQ.commitLoads(fromCommit->doneSeqNum); + LSQ.commitStores(fromCommit->doneSeqNum); + } + + if (fromCommit->nonSpecSeqNum) { + if (fromCommit->uncached) { + LSQ.executeLoad(fromCommit->lqIdx); + } else { + IQ.scheduleNonSpec( + fromCommit->nonSpecSeqNum); + } + } +} + +template +void +BackEnd::addToIQ(DynInstPtr &inst) +{ + // Do anything IQ specific here? + IQ.insert(inst); +} + +template +void +BackEnd::addToLSQ(DynInstPtr &inst) +{ + // Do anything LSQ specific here? + LSQ.insert(inst); +} + +template +void +BackEnd::dispatchInsts() +{ + DPRINTF(BE, "Trying to dispatch instructions.\n"); + + // Pull instructions out of the front end. + int disp_width = dispatchWidth ? dispatchWidth : width; + + // Could model dispatching time, but in general 1 cycle is probably + // good enough. + + if (dispatchSize < numDispatchEntries) { + for (int i = 0; i < disp_width; i++) { + // Get instructions + DynInstPtr inst = frontEnd->getInst(); + + if (!inst) { + // No more instructions to get + break; + } + + DPRINTF(BE, "Processing instruction [sn:%lli] PC:%#x\n", + inst->seqNum, inst->readPC()); + + for (int i = 0; i < inst->numDestRegs(); ++i) + renameTable[inst->destRegIdx(i)] = inst; + + // Add to queue to be dispatched. 
+ dispatch.push_back(inst); + + d2i[0].size++; + ++dispatchSize; + } + } + + assert(dispatch.size() < 64); + + for (int i = 0; i < instsToDispatch->size; ++i) { + assert(!dispatch.empty()); + // Get instruction from front of time buffer + DynInstPtr inst = dispatch.front(); + dispatch.pop_front(); + + if (inst->isSquashed()) + continue; + + --dispatchSize; + ++numInsts; + instList.push_back(inst); + + DPRINTF(BE, "Dispatching instruction [sn:%lli] PC:%#x\n", + inst->seqNum, inst->readPC()); + + addToIQ(inst); + + if (inst->isMemRef()) { + addToLSQ(inst); + } + + if (inst->isNonSpeculative()) { + inst->setCanCommit(); + } + + // Check if IQ or LSQ is full. If so we'll need to break and stop + // removing instructions. Also update the number of insts to remove + // from the queue. + if (exactFullStall) { + bool stall = false; + if (IQ.isFull()) { + DPRINTF(BE, "IQ is full!\n"); + stall = true; + } else if (LSQ.isFull()) { + DPRINTF(BE, "LSQ is full!\n"); + stall = true; + } else if (isFull()) { + DPRINTF(BE, "ROB is full!\n"); + stall = true; + ROB_fcount++; + } + if (stall) { + instsToDispatch->size-= i+1; + dispatchStall(); + return; + } + } + } + + // Check if IQ or LSQ is full. If so we'll need to break and stop + // removing instructions. Also update the number of insts to remove + // from the queue. Check here if we don't care about exact stall + // conditions. + + bool stall = false; + if (IQ.isFull()) { + DPRINTF(BE, "IQ is full!\n"); + stall = true; + } else if (LSQ.isFull()) { + DPRINTF(BE, "LSQ is full!\n"); + stall = true; + } else if (isFull()) { + DPRINTF(BE, "ROB is full!\n"); + stall = true; + ROB_fcount++; + } + if (stall) { + d2i.advance(); + dispatchStall(); + return; + } +} + +template +void +BackEnd::dispatchStall() +{ + dispatchStatus = Blocked; + if (!cpu->decoupledFrontEnd) { + // Tell front end to stall here through a timebuffer, or just tell + // it directly. 
+ } +} + +template +void +BackEnd::checkDispatchStatus() +{ + assert(dispatchStatus == Blocked); + if (!IQ.isFull() && !LSQ.isFull() && !isFull()) { + DPRINTF(BE, "Dispatch no longer blocked\n"); + dispatchStatus = Running; + dispatchInsts(); + } +} + +template +void +BackEnd::scheduleReadyInsts() +{ + // Tell IQ to put any ready instructions into the instruction list. + // Probably want to have a list of DynInstPtrs returned here. Then I + // can choose to either put them into a time buffer to simulate + // IQ scheduling time, or hand them directly off to the next stage. + // Do you ever want to directly hand it off to the next stage? + DPRINTF(BE, "Trying to schedule ready instructions\n"); + IQ.scheduleReadyInsts(); +} + +template +void +BackEnd::executeInsts() +{ + int insts_to_execute = instsToExecute->size; + + issued_ops[0]+= insts_to_execute; + n_issued_dist[insts_to_execute]++; + + DPRINTF(BE, "Trying to execute %i instructions\n", insts_to_execute); + + fetchRedirect[0] = false; + + while (insts_to_execute > 0) { + // Get ready instruction from the IQ (or queue coming out of IQ) + // Execute the ready instruction. + // Wakeup any dependents if it's done. + DynInstPtr inst = IQ.getReadyInst(); + + DPRINTF(BE, "Executing inst [sn:%lli] PC: %#x\n", + inst->seqNum, inst->readPC()); + + ++funcExeInst; + + // Check if the instruction is squashed; if so then skip it + // and don't count it towards the FU usage. + if (inst->isSquashed()) { + DPRINTF(BE, "Execute: Instruction was squashed.\n"); + + // Not sure how to handle this plus the method of sending # of + // instructions to use. Probably will just have to count it + // towards the bandwidth usage, but not the FU usage. + --insts_to_execute; + + // Consider this instruction executed so that commit can go + // ahead and retire the instruction. + inst->setExecuted(); + + // Not sure if I should set this here or just let commit try to + // commit any squashed instructions. I like the latter a bit more. 
+ inst->setCanCommit(); + +// ++iewExecSquashedInsts; + + continue; + } + + Fault fault = NoFault; + + // Execute instruction. + // Note that if the instruction faults, it will be handled + // at the commit stage. + if (inst->isMemRef() && + (!inst->isDataPrefetch() && !inst->isInstPrefetch())) { + DPRINTF(BE, "Execute: Initiating access for memory " + "reference.\n"); + + // Tell the LDSTQ to execute this instruction (if it is a load). + if (inst->isLoad()) { + // Loads will mark themselves as executed, and their writeback + // event adds the instruction to the queue to commit + fault = LSQ.executeLoad(inst); + +// ++iewExecLoadInsts; + } else if (inst->isStore()) { + LSQ.executeStore(inst); + +// ++iewExecStoreInsts; + + if (!(inst->req->flags & LOCKED)) { + inst->setExecuted(); + + instToCommit(inst); + } + // Store conditionals will mark themselves as executed, and + // their writeback event will add the instruction to the queue + // to commit. + } else { + panic("Unexpected memory type!\n"); + } + + } else { + inst->execute(); + +// ++iewExecutedInsts; + + inst->setExecuted(); + + instToCommit(inst); + } + + updateExeInstStats(inst); + + // Probably should have some sort of function for this. + // More general question of how to handle squashes? Have some sort of + // squash unit that controls it? Probably... + // Check if branch was correct. This check happens after the + // instruction is added to the queue because even if the branch + // is mispredicted, the branch instruction itself is still valid. + // Only handle this if there hasn't already been something that + // redirects fetch in this group of instructions. + + // This probably needs to prioritize the redirects if a different + // scheduler is used. Currently the scheduler schedules the oldest + // instruction first, so the branch resolution order will be correct. 
+ unsigned tid = inst->threadNumber; + + if (!fetchRedirect[tid]) { + + if (inst->mispredicted()) { + fetchRedirect[tid] = true; + + DPRINTF(BE, "Execute: Branch mispredict detected.\n"); + DPRINTF(BE, "Execute: Redirecting fetch to PC: %#x.\n", + inst->nextPC); + + // If incorrect, then signal the ROB that it must be squashed. + squashDueToBranch(inst); + + if (inst->predTaken()) { +// predictedTakenIncorrect++; + } else { +// predictedNotTakenIncorrect++; + } + } else if (LSQ.violation()) { + fetchRedirect[tid] = true; + + // Get the DynInst that caused the violation. Note that this + // clears the violation signal. + DynInstPtr violator; + violator = LSQ.getMemDepViolator(); + + DPRINTF(BE, "LDSTQ detected a violation. Violator PC: " + "%#x, inst PC: %#x. Addr is: %#x.\n", + violator->readPC(), inst->readPC(), inst->physEffAddr); + + // Tell the instruction queue that a violation has occured. +// IQ.violation(inst, violator); + + // Squash. +// squashDueToMemOrder(inst,tid); + squashDueToBranch(inst); + +// ++memOrderViolationEvents; + } else if (LSQ.loadBlocked()) { + fetchRedirect[tid] = true; + + DPRINTF(BE, "Load operation couldn't execute because the " + "memory system is blocked. PC: %#x [sn:%lli]\n", + inst->readPC(), inst->seqNum); + + squashDueToMemBlocked(inst); + } + } + +// instList.pop_front(); + + --insts_to_execute; + + // keep an instruction count + thread->numInst++; + thread->numInsts++; + } + + assert(insts_to_execute >= 0); +} + +template +void +BackEnd::instToCommit(DynInstPtr &inst) +{ + int wb_width = wbWidth; + // First check the time slot that this instruction will write + // to. If there are free write ports at the time, then go ahead + // and write the instruction to that time. If there are not, + // keep looking back to see where's the first time there's a + // free slot. What happens if you run out of free spaces? + // For now naively assume that all instructions take one cycle. 
+ // Otherwise would have to look into the time buffer based on the + // latency of the instruction. + + DPRINTF(BE, "Sending instructions to commit [sn:%lli] PC %#x.\n", + inst->seqNum, inst->readPC()); + + while (numInstsToWB[wbCycle].size >= wb_width) { + ++wbCycle; + + assert(wbCycle < 5); + } + + // Add finished instruction to queue to commit. + writeback.push_back(inst); + numInstsToWB[wbCycle].size++; + + if (wbCycle) + wb_penalized[0]++; +} + +template +void +BackEnd::writebackInsts() +{ + int wb_width = wbWidth; + // Using this method I'm not quite sure how to prevent an + // instruction from waking its own dependents multiple times, + // without the guarantee that commit always has enough bandwidth + // to accept all instructions being written back. This guarantee + // might not be too unrealistic. + InstListIt wb_inst_it = writeback.begin(); + InstListIt wb_end_it = writeback.end(); + int inst_num = 0; + int consumer_insts = 0; + + for (; inst_num < wb_width && + wb_inst_it != wb_end_it; inst_num++) { + DynInstPtr inst = (*wb_inst_it); + + // Some instructions will be sent to commit without having + // executed because they need commit to handle them. + // E.g. Uncached loads have not actually executed when they + // are first sent to commit. Instead commit must tell the LSQ + // when it's ready to execute the uncached load. 
+ if (!inst->isSquashed()) { + DPRINTF(BE, "Writing back instruction [sn:%lli] PC %#x.\n", + inst->seqNum, inst->readPC()); + + inst->setCanCommit(); + inst->setCompleted(); + + if (inst->isExecuted()) { + int dependents = IQ.wakeDependents(inst); + if (dependents) { + producer_inst[0]++; + consumer_insts+= dependents; + } + } + } + + writeback.erase(wb_inst_it++); + } + LSQ.writebackStores(); + consumer_inst[0]+= consumer_insts; + writeback_count[0]+= inst_num; +} + +template +bool +BackEnd::commitInst(int inst_num) +{ + // Read instruction from the head of the ROB + DynInstPtr inst = instList.front(); + + // Make sure instruction is valid + assert(inst); + + if (!inst->readyToCommit()) + return false; + + DPRINTF(BE, "Trying to commit instruction [sn:%lli] PC:%#x\n", + inst->seqNum, inst->readPC()); + + // If the instruction is not executed yet, then it is a non-speculative + // or store inst. Signal backwards that it should be executed. + if (!inst->isExecuted()) { + // Keep this number correct. We have not yet actually executed + // and committed this instruction. +// thread->funcExeInst--; + + if (inst->isNonSpeculative()) { +#if !FULL_SYSTEM + // Hack to make sure syscalls aren't executed until all stores + // write back their data. This direct communication shouldn't + // be used for anything other than this. + if (inst_num > 0 || LSQ.hasStoresToWB()) { + DPRINTF(BE, "Waiting for all stores to writeback.\n"); + return false; + } +#endif + + DPRINTF(BE, "Encountered a store or non-speculative " + "instruction at the head of the ROB, PC %#x.\n", + inst->readPC()); + + // Send back the non-speculative instruction's sequence number. + toIEW->nonSpecSeqNum = inst->seqNum; + + // Change the instruction so it won't try to commit again until + // it is executed. 
+ inst->clearCanCommit(); + +// ++commitNonSpecStalls; + + return false; + } else if (inst->isLoad()) { + DPRINTF(BE, "[sn:%lli]: Uncached load, PC %#x.\n", + inst->seqNum, inst->readPC()); + + // Send back the non-speculative instruction's sequence + // number. Maybe just tell the lsq to re-execute the load. + toIEW->nonSpecSeqNum = inst->seqNum; + toIEW->uncached = true; + toIEW->lqIdx = inst->lqIdx; + + inst->clearCanCommit(); + + return false; + } else { + panic("Trying to commit un-executed instruction " + "of unknown type!\n"); + } + } + + // Now check if it's one of the special trap or barrier or + // serializing instructions. + if (inst->isThreadSync()) + { + // Not handled for now. + panic("Barrier instructions are not handled yet.\n"); + } + + // Check if the instruction caused a fault. If so, trap. + Fault inst_fault = inst->getFault(); + + if (inst_fault != NoFault) { + if (!inst->isNop()) { +#if FULL_SYSTEM + DPRINTF(BE, "Inst [sn:%lli] PC %#x has a fault\n", + inst->seqNum, inst->readPC()); + +// assert(!thread->inSyscall); + +// thread->inSyscall = true; + + // Consider holding onto the trap and waiting until the trap event + // happens for this to be executed. + inst_fault->invoke(thread->getXCProxy()); + + // Exit state update mode to avoid accidental updating. +// thread->inSyscall = false; + +// commitStatus = TrapPending; + + // Generate trap squash event. 
+// generateTrapEvent(); + + return false; +#else // !FULL_SYSTEM + panic("fault (%d) detected @ PC %08p", inst_fault, + inst->PC); +#endif // FULL_SYSTEM + } + } + + if (inst->isControl()) { +// ++commitCommittedBranches; + } + + int freed_regs = 0; + + for (int i = 0; i < inst->numDestRegs(); ++i) { + DPRINTF(BE, "Commit rename map setting register %i to [sn:%lli]\n", + (int)inst->destRegIdx(i), inst->seqNum); + thread->renameTable[inst->destRegIdx(i)] = inst; + ++freed_regs; + } + + if (inst->traceData) { + inst->traceData->finalize(); + inst->traceData = NULL; + } + + inst->clearDependents(); + + frontEnd->addFreeRegs(freed_regs); + + instList.pop_front(); + + --numInsts; + cpu->numInst++; + thread->numInsts++; + ++thread->funcExeInst; + thread->PC = inst->readNextPC(); + updateComInstStats(inst); + + // Write the done sequence number here. + toIEW->doneSeqNum = inst->seqNum; + + return true; +} + +template +void +BackEnd::commitInsts() +{ + int commit_width = commitWidth ? commitWidth : width; + + // Not sure this should be a loop or not. + int inst_num = 0; + while (!instList.empty() && inst_num < commit_width) { + if (instList.front()->isSquashed()) { + panic("No squashed insts should still be on the list!"); + instList.front()->clearDependents(); + instList.pop_front(); + continue; + } + + if (!commitInst(inst_num++)) { + break; + } + } + n_committed_dist.sample(inst_num); +} + +template +void +BackEnd::squash(const InstSeqNum &sn) +{ + IQ.squash(sn); + LSQ.squash(sn); + + int freed_regs = 0; + InstListIt dispatch_end = dispatch.end(); + InstListIt insts_it = dispatch.end(); + insts_it--; + + while (insts_it != dispatch_end && (*insts_it)->seqNum > sn) + { + DPRINTF(BE, "Squashing instruction PC %#x, [sn:%lli].\n", + (*insts_it)->readPC(), + (*insts_it)->seqNum); + + // Mark the instruction as squashed, and ready to commit so that + // it can drain out of the pipeline. 
+ (*insts_it)->setSquashed(); + + (*insts_it)->setCanCommit(); + + for (int i = 0; i < (*insts_it)->numDestRegs(); ++i) { + renameTable[(*insts_it)->destRegIdx(i)] = + (*insts_it)->getPrevDestInst(i); + ++freed_regs; + } + + (*insts_it)->clearDependents(); + + --insts_it; + } + + insts_it = instList.end(); + insts_it--; + + while (!instList.empty() && (*insts_it)->seqNum > sn) + { + DPRINTF(BE, "Squashing instruction PC %#x, [sn:%lli].\n", + (*insts_it)->readPC(), + (*insts_it)->seqNum); + + // Mark the instruction as squashed, and ready to commit so that + // it can drain out of the pipeline. + (*insts_it)->setSquashed(); + + (*insts_it)->setCanCommit(); + + for (int i = 0; i < (*insts_it)->numDestRegs(); ++i) { + renameTable[(*insts_it)->destRegIdx(i)] = + (*insts_it)->getPrevDestInst(i); + ++freed_regs; + } + + (*insts_it)->clearDependents(); + + instList.erase(insts_it--); + --numInsts; + } + + frontEnd->addFreeRegs(freed_regs); +} + +template +void +BackEnd::squashFromXC() +{ + xcSquash = true; +} + +template +void +BackEnd::squashDueToBranch(DynInstPtr &inst) +{ + // Update the branch predictor state I guess + squash(inst->seqNum); + frontEnd->squash(inst->seqNum, inst->readNextPC(), + true, inst->mispredicted()); +} + +template +void +BackEnd::squashDueToMemBlocked(DynInstPtr &inst) +{ + DPRINTF(IEW, "Memory blocked, squashing load and younger insts, " + "PC: %#x [sn:%i].\n", inst->readPC(), inst->seqNum); + + squash(inst->seqNum - 1); + frontEnd->squash(inst->seqNum - 1, inst->readPC()); +} + +template +void +BackEnd::fetchFault(Fault &fault) +{ +} + +template +void +BackEnd::updateExeInstStats(DynInstPtr &inst) +{ + int thread_number = inst->threadNumber; + + // + // Pick off the software prefetches + // +#ifdef TARGET_ALPHA + if (inst->isDataPrefetch()) + exe_swp[thread_number]++; + else + exe_inst[thread_number]++; +#else + exe_inst[thread_number]++; +#endif + + // + // Control operations + // + if (inst->isControl()) + exe_branches[thread_number]++; + + 
// + // Memory operations + // + if (inst->isMemRef()) { + exe_refs[thread_number]++; + + if (inst->isLoad()) + exe_loads[thread_number]++; + } +} + +template +void +BackEnd::updateComInstStats(DynInstPtr &inst) +{ + unsigned thread = inst->threadNumber; + + // + // Pick off the software prefetches + // +#ifdef TARGET_ALPHA + if (inst->isDataPrefetch()) { + stat_com_swp[thread]++; + } else { + stat_com_inst[thread]++; + } +#else + stat_com_inst[thread]++; +#endif + + // + // Control Instructions + // + if (inst->isControl()) + stat_com_branches[thread]++; + + // + // Memory references + // + if (inst->isMemRef()) { + stat_com_refs[thread]++; + + if (inst->isLoad()) { + stat_com_loads[thread]++; + } + } + + if (inst->isMemBarrier()) { + stat_com_membars[thread]++; + } +} + +template +void +BackEnd::dumpInsts() +{ + int num = 0; + int valid_num = 0; + + InstListIt inst_list_it = instList.begin(); + + cprintf("Inst list size: %i\n", instList.size()); + + while (inst_list_it != instList.end()) + { + cprintf("Instruction:%i\n", + num); + if (!(*inst_list_it)->isSquashed()) { + if (!(*inst_list_it)->isIssued()) { + ++valid_num; + cprintf("Count:%i\n", valid_num); + } else if ((*inst_list_it)->isMemRef() && + !(*inst_list_it)->memOpDone) { + // Loads that have not been marked as executed still count + // towards the total instructions. 
+ ++valid_num; + cprintf("Count:%i\n", valid_num); + } + } + + cprintf("PC:%#x\n[sn:%lli]\n[tid:%i]\n" + "Issued:%i\nSquashed:%i\n", + (*inst_list_it)->readPC(), + (*inst_list_it)->seqNum, + (*inst_list_it)->threadNumber, + (*inst_list_it)->isIssued(), + (*inst_list_it)->isSquashed()); + + if ((*inst_list_it)->isMemRef()) { + cprintf("MemOpDone:%i\n", (*inst_list_it)->memOpDone); + } + + cprintf("\n"); + + inst_list_it++; + ++num; + } + + cprintf("Dispatch list size: %i\n", dispatch.size()); + + inst_list_it = dispatch.begin(); + + while (inst_list_it != dispatch.end()) + { + cprintf("Instruction:%i\n", + num); + if (!(*inst_list_it)->isSquashed()) { + if (!(*inst_list_it)->isIssued()) { + ++valid_num; + cprintf("Count:%i\n", valid_num); + } else if ((*inst_list_it)->isMemRef() && + !(*inst_list_it)->memOpDone) { + // Loads that have not been marked as executed still count + // towards the total instructions. + ++valid_num; + cprintf("Count:%i\n", valid_num); + } + } + + cprintf("PC:%#x\n[sn:%lli]\n[tid:%i]\n" + "Issued:%i\nSquashed:%i\n", + (*inst_list_it)->readPC(), + (*inst_list_it)->seqNum, + (*inst_list_it)->threadNumber, + (*inst_list_it)->isIssued(), + (*inst_list_it)->isSquashed()); + + if ((*inst_list_it)->isMemRef()) { + cprintf("MemOpDone:%i\n", (*inst_list_it)->memOpDone); + } + + cprintf("\n"); + + inst_list_it++; + ++num; + } + + cprintf("Writeback list size: %i\n", writeback.size()); + + inst_list_it = writeback.begin(); + + while (inst_list_it != writeback.end()) + { + cprintf("Instruction:%i\n", + num); + if (!(*inst_list_it)->isSquashed()) { + if (!(*inst_list_it)->isIssued()) { + ++valid_num; + cprintf("Count:%i\n", valid_num); + } else if ((*inst_list_it)->isMemRef() && + !(*inst_list_it)->memOpDone) { + // Loads that have not been marked as executed still count + // towards the total instructions. 
+ ++valid_num; + cprintf("Count:%i\n", valid_num); + } + } + + cprintf("PC:%#x\n[sn:%lli]\n[tid:%i]\n" + "Issued:%i\nSquashed:%i\n", + (*inst_list_it)->readPC(), + (*inst_list_it)->seqNum, + (*inst_list_it)->threadNumber, + (*inst_list_it)->isIssued(), + (*inst_list_it)->isSquashed()); + + if ((*inst_list_it)->isMemRef()) { + cprintf("MemOpDone:%i\n", (*inst_list_it)->memOpDone); + } + + cprintf("\n"); + + inst_list_it++; + ++num; + } +} diff --git a/cpu/ozone/cpu.cc b/cpu/ozone/cpu.cc index cbeca9d3b..d2ea0164c 100644 --- a/cpu/ozone/cpu.cc +++ b/cpu/ozone/cpu.cc @@ -26,8 +26,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "cpu/ooo_cpu/ooo_cpu_impl.hh" -#include "cpu/ooo_cpu/ooo_dyn_inst.hh" -#include "cpu/ooo_cpu/ooo_impl.hh" +#include "cpu/ozone/cpu_impl.hh" +#include "cpu/ozone/ozone_impl.hh" +#include "cpu/ozone/simple_impl.hh" -template class OoOCPU; +template class OzoneCPU; +template class OzoneCPU; diff --git a/cpu/ozone/cpu.hh b/cpu/ozone/cpu.hh index f5d84d656..200ced265 100644 --- a/cpu/ozone/cpu.hh +++ b/cpu/ozone/cpu.hh @@ -26,15 +26,19 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __CPU_OOO_CPU_OOO_CPU_HH__ -#define __CPU_OOO_CPU_OOO_CPU_HH__ +#ifndef __CPU_OZONE_CPU_HH__ +#define __CPU_OZONE_CPU_HH__ + +#include #include "base/statistics.hh" +#include "base/timebuf.hh" #include "config/full_system.hh" #include "cpu/base.hh" #include "cpu/exec_context.hh" -#include "encumbered/cpu/full/fu_pool.hh" -#include "cpu/ooo_cpu/ea_list.hh" +#include "cpu/inst_seq.hh" +#include "cpu/ozone/rename_table.hh" +#include "cpu/ozone/thread_state.hh" #include "cpu/pc_event.hh" #include "cpu/static_inst.hh" #include "mem/mem_interface.hh" @@ -42,16 +46,19 @@ // forward declarations #if FULL_SYSTEM -class Processor; +#include "arch/alpha/tlb.hh" + class AlphaITB; class AlphaDTB; class PhysicalMemory; +class MemoryController; class RemoteGDB; class GDBListener; #else +class PageTable; class Process; #endif // FULL_SYSTEM @@ -72,23 +79,180 @@ namespace Trace { */ template -class OoOCPU : public BaseCPU +class OzoneCPU : public BaseCPU { private: + typedef typename Impl::FrontEnd FrontEnd; + typedef typename Impl::BackEnd BackEnd; + typedef typename Impl::DynInst DynInst; typedef typename Impl::DynInst DynInst; typedef typename Impl::DynInstPtr DynInstPtr; + typedef TheISA::MiscReg MiscReg; + + public: + class OzoneXC : public ExecContext { + public: + OzoneCPU *cpu; + + OzoneThreadState *thread; + + BaseCPU *getCpuPtr(); + + void setCpuId(int id); + + int readCpuId() { return thread->cpuId; } + + FunctionalMemory *getMemPtr() { return thread->mem; } + +#if FULL_SYSTEM + System *getSystemPtr() { return cpu->system; } + + PhysicalMemory *getPhysMemPtr() { return cpu->physmem; } + + AlphaITB *getITBPtr() { return cpu->itb; } + + AlphaDTB * getDTBPtr() { return cpu->dtb; } +#else + Process *getProcessPtr() { return thread->process; } +#endif + + Status status() const { return thread->_status; } + + void setStatus(Status new_status); + + /// Set the status to Active. Optional delay indicates number of + /// cycles to wait before beginning execution. 
+ void activate(int delay = 1); + + /// Set the status to Suspended. + void suspend(); + + /// Set the status to Unallocated. + void deallocate(); + + /// Set the status to Halted. + void halt(); + +#if FULL_SYSTEM + void dumpFuncProfile(); +#endif + + void takeOverFrom(ExecContext *old_context); + + void regStats(const std::string &name); + + void serialize(std::ostream &os); + void unserialize(Checkpoint *cp, const std::string §ion); + +#if FULL_SYSTEM + Event *getQuiesceEvent(); + + Tick readLastActivate(); + Tick readLastSuspend(); + + void profileClear(); + void profileSample(); +#endif + + int getThreadNum(); + + // Also somewhat obnoxious. Really only used for the TLB fault. + TheISA::MachInst getInst(); + + void copyArchRegs(ExecContext *xc); + + void clearArchRegs(); + + uint64_t readIntReg(int reg_idx); + + float readFloatRegSingle(int reg_idx); + + double readFloatRegDouble(int reg_idx); + + uint64_t readFloatRegInt(int reg_idx); + + void setIntReg(int reg_idx, uint64_t val); + + void setFloatRegSingle(int reg_idx, float val); + + void setFloatRegDouble(int reg_idx, double val); + + void setFloatRegInt(int reg_idx, uint64_t val); + + uint64_t readPC() { return thread->PC; } + void setPC(Addr val); + + uint64_t readNextPC() { return thread->nextPC; } + void setNextPC(Addr val); + + public: + // ISA stuff: + MiscReg readMiscReg(int misc_reg); + + MiscReg readMiscRegWithEffect(int misc_reg, Fault &fault); + + Fault setMiscReg(int misc_reg, const MiscReg &val); + + Fault setMiscRegWithEffect(int misc_reg, const MiscReg &val); + + unsigned readStCondFailures() + { return thread->storeCondFailures; } + + void setStCondFailures(unsigned sc_failures) + { thread->storeCondFailures = sc_failures; } + +#if FULL_SYSTEM + bool inPalMode() { return cpu->inPalMode(); } +#endif + + bool misspeculating() { return false; } + +#if !FULL_SYSTEM + TheISA::IntReg getSyscallArg(int i) + { return thread->renameTable[TheISA::ArgumentReg0 + i]->readIntResult(); } + + // used to 
shift args for indirect syscall + void setSyscallArg(int i, TheISA::IntReg val) + { thread->renameTable[TheISA::ArgumentReg0 + i]->setIntResult(i); } + + void setSyscallReturn(SyscallReturn return_value) + { cpu->setSyscallReturn(return_value, thread->tid); } + + Counter readFuncExeInst() { return thread->funcExeInst; } + + void setFuncExeInst(Counter new_val) + { thread->funcExeInst = new_val; } +#endif + }; + + // execution context proxy + OzoneXC xcProxy; + + typedef OzoneThreadState ImplState; + + private: + OzoneThreadState thread; +/* + // Squash event for when the XC needs to squash all inflight instructions. + struct XCSquashEvent : public Event + { + void process(); + const char *description(); + }; +*/ public: // main simulation loop (one cycle) void tick(); + std::set snList; private: struct TickEvent : public Event { - OoOCPU *cpu; + OzoneCPU *cpu; int width; - TickEvent(OoOCPU *c, int w); + TickEvent(OzoneCPU *c, int w); void process(); const char *description(); }; @@ -122,16 +286,14 @@ class OoOCPU : public BaseCPU enum Status { Running, Idle, - IcacheMiss, - IcacheMissComplete, - DcacheMissStall, SwitchedOut }; - private: Status _status; public: + bool checkInterrupts; + void post_interrupt(int int_num, int index); void zero_fill_64(Addr addr) { @@ -142,33 +304,24 @@ class OoOCPU : public BaseCPU } }; - struct Params : public BaseCPU::Params - { - MemInterface *icache_interface; - MemInterface *dcache_interface; - int width; -#if FULL_SYSTEM - AlphaITB *itb; - AlphaDTB *dtb; - FunctionalMemory *mem; -#else - Process *process; -#endif - int issueWidth; - }; + typedef typename Impl::Params Params; - OoOCPU(Params *params); + OzoneCPU(Params *params); - virtual ~OoOCPU(); + virtual ~OzoneCPU(); void init(); - private: - void copyFromXC(); - public: - // execution context - ExecContext *xc; + BaseCPU *getCpuPtr() { return this; } + + void setCpuId(int id) { cpuId = id; } + + int readCpuId() { return cpuId; } + +// FunctionalMemory *getMemPtr() { return 
mem; } + + int cpuId; void switchOut(); void takeOverFrom(BaseCPU *oldCPU); @@ -177,6 +330,16 @@ class OoOCPU : public BaseCPU Addr dbg_vtophys(Addr addr); bool interval_stats; + + AlphaITB *itb; + AlphaDTB *dtb; + System *system; + + // the following two fields are redundant, since we can always + // look them up through the system pointer, but we'll leave them + // here for now for convenience + MemoryController *memctrl; + PhysicalMemory *physmem; #endif // L1 instruction cache @@ -185,54 +348,18 @@ class OoOCPU : public BaseCPU // L1 data cache MemInterface *dcacheInterface; - FuncUnitPool *fuPool; +#if !FULL_SYSTEM + PageTable *pTable; +#endif - // Refcounted pointer to the one memory request. - MemReqPtr cacheMemReq; - - class ICacheCompletionEvent : public Event - { - private: - OoOCPU *cpu; - - public: - ICacheCompletionEvent(OoOCPU *_cpu); - - virtual void process(); - virtual const char *description(); - }; - - // Will need to create a cache completion event upon any memory miss. - ICacheCompletionEvent iCacheCompletionEvent; - - class DCacheCompletionEvent; - - typedef typename - std::list::iterator DCacheCompEventIt; - - class DCacheCompletionEvent : public Event - { - private: - OoOCPU *cpu; - DynInstPtr inst; - DCacheCompEventIt dcceIt; - - public: - DCacheCompletionEvent(OoOCPU *_cpu, DynInstPtr &_inst, - DCacheCompEventIt &_dcceIt); - - virtual void process(); - virtual const char *description(); - }; - - friend class DCacheCompletionEvent; - - protected: - std::list dCacheCompList; - DCacheCompEventIt dcceIt; + FrontEnd *frontEnd; + BackEnd *backEnd; private: Status status() const { return _status; } + void setStatus(Status new_status) { _status = new_status; } + + // Not sure what an activate() call on the CPU's proxy XC would mean... 
virtual void activateContext(int thread_num, int delay); virtual void suspendContext(int thread_num); @@ -244,17 +371,19 @@ class OoOCPU : public BaseCPU virtual void resetStats(); // number of simulated instructions + public: Counter numInst; Counter startNumInst; - Stats::Scalar<> numInsts; +// Stats::Scalar<> numInsts; virtual Counter totalInstructions() const { return numInst - startNumInst; } + private: // number of simulated memory references - Stats::Scalar<> numMemRefs; +// Stats::Scalar<> numMemRefs; // number of simulated loads Counter numLoad; @@ -263,27 +392,15 @@ class OoOCPU : public BaseCPU // number of idle cycles Stats::Average<> notIdleFraction; Stats::Formula idleFraction; - - // number of cycles stalled for I-cache misses - Stats::Scalar<> icacheStallCycles; - Counter lastIcacheStall; - - // number of cycles stalled for D-cache misses - Stats::Scalar<> dcacheStallCycles; - Counter lastDcacheStall; - - void processICacheCompletion(); - public: virtual void serialize(std::ostream &os); virtual void unserialize(Checkpoint *cp, const std::string §ion); + #if FULL_SYSTEM bool validInstAddr(Addr addr) { return true; } bool validDataAddr(Addr addr) { return true; } - int getInstAsid() { return xc->regs.instAsid(); } - int getDataAsid() { return xc->regs.dataAsid(); } Fault translateInstReq(MemReqPtr &req) { @@ -302,13 +419,13 @@ class OoOCPU : public BaseCPU #else bool validInstAddr(Addr addr) - { return xc->validInstAddr(addr); } + { return true; } bool validDataAddr(Addr addr) - { return xc->validDataAddr(addr); } + { return true; } - int getInstAsid() { return xc->asid; } - int getDataAsid() { return xc->asid; } + int getInstAsid() { return thread.asid; } + int getDataAsid() { return thread.asid; } Fault dummyTranslation(MemReqPtr &req) { @@ -321,27 +438,38 @@ class OoOCPU : public BaseCPU req->paddr = req->paddr | (Addr)req->asid << sizeof(Addr) * 8 - 16; return NoFault; } + + /** Translates instruction requestion in syscall emulation mode. 
*/ Fault translateInstReq(MemReqPtr &req) { return dummyTranslation(req); } + + /** Translates data read request in syscall emulation mode. */ Fault translateDataReadReq(MemReqPtr &req) { return dummyTranslation(req); } + + /** Translates data write request in syscall emulation mode. */ Fault translateDataWriteReq(MemReqPtr &req) { return dummyTranslation(req); } - #endif - + /** CPU read function, forwards read to LSQ. */ template - Fault read(Addr addr, T &data, unsigned flags, DynInstPtr inst); + Fault read(MemReqPtr &req, T &data, int load_idx) + { + return backEnd->read(req, data, load_idx); + } + /** CPU write function, forwards write to LSQ. */ template - Fault write(T data, Addr addr, unsigned flags, - uint64_t *res, DynInstPtr inst); + Fault write(MemReqPtr &req, T &data, int store_idx) + { + return backEnd->write(req, data, store_idx); + } void prefetch(Addr addr, unsigned flags) { @@ -357,270 +485,38 @@ class OoOCPU : public BaseCPU Fault copy(Addr dest); - private: - bool executeInst(DynInstPtr &inst); - - void renameInst(DynInstPtr &inst); - - void addInst(DynInstPtr &inst); - - void commitHeadInst(); - - bool getOneInst(); - - Fault fetchCacheLine(); - - InstSeqNum getAndIncrementInstSeq(); - - bool ambigMemAddr; - - private: InstSeqNum globalSeqNum; - DynInstPtr renameTable[TheISA::TotalNumRegs]; - DynInstPtr commitTable[TheISA::TotalNumRegs]; - - // Might need a table of the shadow registers as well. -#if FULL_SYSTEM - DynInstPtr palShadowTable[TheISA::NumIntRegs]; -#endif - public: - // The register accessor methods provide the index of the - // instruction's operand (e.g., 0 or 1), not the architectural - // register index, to simplify the implementation of register - // renaming. We find the architectural register index by indexing - // into the instruction's own operand index table. Note that a - // raw pointer to the StaticInst is provided instead of a - // ref-counted StaticInstPtr to redice overhead. 
This is fine as - // long as these methods don't copy the pointer into any long-term - // storage (which is pretty hard to imagine they would have reason - // to do). + void squashFromXC(); - // In the OoO case these shouldn't read from the XC but rather from the - // rename table of DynInsts. Also these likely shouldn't be called very - // often, other than when adding things into the xc during say a syscall. - - uint64_t readIntReg(StaticInst *si, int idx) - { - return xc->readIntReg(si->srcRegIdx(idx)); - } - - float readFloatRegSingle(StaticInst *si, int idx) - { - int reg_idx = si->srcRegIdx(idx) - TheISA::FP_Base_DepTag; - return xc->readFloatRegSingle(reg_idx); - } - - double readFloatRegDouble(StaticInst *si, int idx) - { - int reg_idx = si->srcRegIdx(idx) - TheISA::FP_Base_DepTag; - return xc->readFloatRegDouble(reg_idx); - } - - uint64_t readFloatRegInt(StaticInst *si, int idx) - { - int reg_idx = si->srcRegIdx(idx) - TheISA::FP_Base_DepTag; - return xc->readFloatRegInt(reg_idx); - } - - void setIntReg(StaticInst *si, int idx, uint64_t val) - { - xc->setIntReg(si->destRegIdx(idx), val); - } - - void setFloatRegSingle(StaticInst *si, int idx, float val) - { - int reg_idx = si->destRegIdx(idx) - TheISA::FP_Base_DepTag; - xc->setFloatRegSingle(reg_idx, val); - } - - void setFloatRegDouble(StaticInst *si, int idx, double val) - { - int reg_idx = si->destRegIdx(idx) - TheISA::FP_Base_DepTag; - xc->setFloatRegDouble(reg_idx, val); - } - - void setFloatRegInt(StaticInst *si, int idx, uint64_t val) - { - int reg_idx = si->destRegIdx(idx) - TheISA::FP_Base_DepTag; - xc->setFloatRegInt(reg_idx, val); - } - - uint64_t readPC() { return PC; } - void setNextPC(Addr val) { nextPC = val; } - - private: - Addr PC; - Addr nextPC; - - unsigned issueWidth; - - bool fetchRedirExcp; - bool fetchRedirBranch; - - /** Mask to get a cache block's address. */ - Addr cacheBlkMask; - - unsigned cacheBlkSize; - - Addr cacheBlkPC; - - /** The cache line being fetched. 
*/ - uint8_t *cacheData; - - protected: - bool cacheBlkValid; - - private: - - // Align an address (typically a PC) to the start of an I-cache block. - // We fold in the PISA 64- to 32-bit conversion here as well. - Addr icacheBlockAlignPC(Addr addr) - { - addr = TheISA::realPCToFetchPC(addr); - return (addr & ~(cacheBlkMask)); - } - - unsigned instSize; - - // ROB tracking stuff. - DynInstPtr robHeadPtr; - DynInstPtr robTailPtr; - unsigned robSize; - unsigned robInsts; - - // List of outstanding EA instructions. - protected: - EAList eaList; - - public: - void branchToTarget(Addr val) - { - if (!fetchRedirExcp) { - fetchRedirBranch = true; - PC = val; - } - } - - // ISA stuff: - uint64_t readUniq() { return xc->readUniq(); } - void setUniq(uint64_t val) { xc->setUniq(val); } - - uint64_t readFpcr() { return xc->readFpcr(); } - void setFpcr(uint64_t val) { xc->setFpcr(val); } + // @todo: This can be a useful debug function. Implement it. + void dumpInsts() { frontEnd->dumpInsts(); } #if FULL_SYSTEM - uint64_t readIpr(int idx, Fault &fault) { return xc->readIpr(idx, fault); } - Fault setIpr(int idx, uint64_t val) { return xc->setIpr(idx, val); } - Fault hwrei() { return xc->hwrei(); } - int readIntrFlag() { return xc->readIntrFlag(); } - void setIntrFlag(int val) { xc->setIntrFlag(val); } - bool inPalMode() { return xc->inPalMode(); } - void trap(Fault fault) { fault->invoke(xc); } - bool simPalCheck(int palFunc) { return xc->simPalCheck(palFunc); } + Fault hwrei(); + int readIntrFlag() { return thread.regs.intrflag; } + void setIntrFlag(int val) { thread.regs.intrflag = val; } + bool inPalMode() { return AlphaISA::PcPAL(thread.PC); } + bool inPalMode(Addr pc) { return AlphaISA::PcPAL(pc); } + bool simPalCheck(int palFunc); #else - void syscall() { xc->syscall(); } + void syscall(); + void setSyscallReturn(SyscallReturn return_value, int tid); #endif - ExecContext *xcBase() { return xc; } + ExecContext *xcBase() { return &xcProxy; } + + bool decoupledFrontEnd; + 
struct CommStruct { + InstSeqNum doneSeqNum; + InstSeqNum nonSpecSeqNum; + bool uncached; + unsigned lqIdx; + + bool stall; + }; + TimeBuffer comm; }; - -// precise architected memory state accessor macros -template -template -Fault -OoOCPU::read(Addr addr, T &data, unsigned flags, DynInstPtr inst) -{ - MemReqPtr readReq = new MemReq(); - readReq->xc = xc; - readReq->asid = 0; - readReq->data = new uint8_t[64]; - - readReq->reset(addr, sizeof(T), flags); - - // translate to physical address - This might be an ISA impl call - Fault fault = translateDataReadReq(readReq); - - // do functional access - if (fault == NoFault) - fault = xc->mem->read(readReq, data); -#if 0 - if (traceData) { - traceData->setAddr(addr); - if (fault == NoFault) - traceData->setData(data); - } -#endif - - // if we have a cache, do cache access too - if (fault == NoFault && dcacheInterface) { - readReq->cmd = Read; - readReq->completionEvent = NULL; - readReq->time = curTick; - /*MemAccessResult result = */dcacheInterface->access(readReq); - - if (dcacheInterface->doEvents()) { - readReq->completionEvent = new DCacheCompletionEvent(this, inst, - dcceIt); - } - } - - if (!dcacheInterface && (readReq->flags & UNCACHEABLE)) - recordEvent("Uncached Read"); - - return fault; -} - -template -template -Fault -OoOCPU::write(T data, Addr addr, unsigned flags, - uint64_t *res, DynInstPtr inst) -{ - MemReqPtr writeReq = new MemReq(); - writeReq->xc = xc; - writeReq->asid = 0; - writeReq->data = new uint8_t[64]; - -#if 0 - if (traceData) { - traceData->setAddr(addr); - traceData->setData(data); - } -#endif - - writeReq->reset(addr, sizeof(T), flags); - - // translate to physical address - Fault fault = translateDataWriteReq(writeReq); - - // do functional access - if (fault == NoFault) - fault = xc->write(writeReq, data); - - if (fault == NoFault && dcacheInterface) { - writeReq->cmd = Write; - memcpy(writeReq->data,(uint8_t *)&data,writeReq->size); - writeReq->completionEvent = NULL; - writeReq->time = 
curTick; - /*MemAccessResult result = */dcacheInterface->access(writeReq); - - if (dcacheInterface->doEvents()) { - writeReq->completionEvent = new DCacheCompletionEvent(this, inst, - dcceIt); - } - } - - if (res && (fault == NoFault)) - *res = writeReq->result; - - if (!dcacheInterface && (writeReq->flags & UNCACHEABLE)) - recordEvent("Uncached Write"); - - return fault; -} - - -#endif // __CPU_OOO_CPU_OOO_CPU_HH__ +#endif // __CPU_OZONE_CPU_HH__ diff --git a/cpu/ozone/cpu_builder.cc b/cpu/ozone/cpu_builder.cc new file mode 100644 index 000000000..0146dd1bd --- /dev/null +++ b/cpu/ozone/cpu_builder.cc @@ -0,0 +1,818 @@ + +#include + +#include "cpu/inst_seq.hh" +#include "cpu/ozone/cpu.hh" +#include "cpu/ozone/ozone_impl.hh" +#include "cpu/ozone/simple_impl.hh" +#include "cpu/ozone/simple_params.hh" +#include "mem/cache/base_cache.hh" +#include "sim/builder.hh" +#include "sim/process.hh" +#include "sim/sim_object.hh" + +class DerivOzoneCPU : public OzoneCPU +{ + public: + DerivOzoneCPU(SimpleParams *p) + : OzoneCPU(p) + { } +}; + +class SimpleOzoneCPU : public OzoneCPU +{ + public: + SimpleOzoneCPU(SimpleParams *p) + : OzoneCPU(p) + { } +}; + + +//////////////////////////////////////////////////////////////////////// +// +// OzoneCPU Simulation Object +// + +BEGIN_DECLARE_SIM_OBJECT_PARAMS(DerivOzoneCPU) + + Param clock; + Param numThreads; + +#if FULL_SYSTEM +SimObjectParam system; +Param cpu_id; +SimObjectParam itb; +SimObjectParam dtb; +#else +SimObjectVectorParam workload; +//SimObjectParam page_table; +#endif // FULL_SYSTEM + +SimObjectParam mem; + +Param max_insts_any_thread; +Param max_insts_all_threads; +Param max_loads_any_thread; +Param max_loads_all_threads; + +SimObjectParam icache; +SimObjectParam dcache; + +Param cachePorts; +Param width; +Param frontEndWidth; +Param backEndWidth; +Param backEndSquashLatency; +Param backEndLatency; +Param maxInstBufferSize; +Param numPhysicalRegs; + +Param decodeToFetchDelay; +Param renameToFetchDelay; +Param 
iewToFetchDelay; +Param commitToFetchDelay; +Param fetchWidth; + +Param renameToDecodeDelay; +Param iewToDecodeDelay; +Param commitToDecodeDelay; +Param fetchToDecodeDelay; +Param decodeWidth; + +Param iewToRenameDelay; +Param commitToRenameDelay; +Param decodeToRenameDelay; +Param renameWidth; + +Param commitToIEWDelay; +Param renameToIEWDelay; +Param issueToExecuteDelay; +Param issueWidth; +Param executeWidth; +Param executeIntWidth; +Param executeFloatWidth; +Param executeBranchWidth; +Param executeMemoryWidth; + +Param iewToCommitDelay; +Param renameToROBDelay; +Param commitWidth; +Param squashWidth; + +Param localPredictorSize; +Param localCtrBits; +Param localHistoryTableSize; +Param localHistoryBits; +Param globalPredictorSize; +Param globalCtrBits; +Param globalHistoryBits; +Param choicePredictorSize; +Param choiceCtrBits; + +Param BTBEntries; +Param BTBTagSize; + +Param RASSize; + +Param LQEntries; +Param SQEntries; +Param LFSTSize; +Param SSITSize; + +Param numPhysIntRegs; +Param numPhysFloatRegs; +Param numIQEntries; +Param numROBEntries; + +Param decoupledFrontEnd; +Param dispatchWidth; +Param wbWidth; + +Param smtNumFetchingThreads; +Param smtFetchPolicy; +Param smtLSQPolicy; +Param smtLSQThreshold; +Param smtIQPolicy; +Param smtIQThreshold; +Param smtROBPolicy; +Param smtROBThreshold; +Param smtCommitPolicy; + +Param instShiftAmt; + +Param defer_registration; + +Param function_trace; +Param function_trace_start; + +END_DECLARE_SIM_OBJECT_PARAMS(DerivOzoneCPU) + +BEGIN_INIT_SIM_OBJECT_PARAMS(DerivOzoneCPU) + + INIT_PARAM(clock, "clock speed"), + INIT_PARAM(numThreads, "number of HW thread contexts"), + +#if FULL_SYSTEM + INIT_PARAM(system, "System object"), + INIT_PARAM(cpu_id, "processor ID"), + INIT_PARAM(itb, "Instruction translation buffer"), + INIT_PARAM(dtb, "Data translation buffer"), +#else + INIT_PARAM(workload, "Processes to run"), +// INIT_PARAM(page_table, "Page table"), +#endif // FULL_SYSTEM + + INIT_PARAM_DFLT(mem, "Memory", NULL), + + 
INIT_PARAM_DFLT(max_insts_any_thread, + "Terminate when any thread reaches this inst count", + 0), + INIT_PARAM_DFLT(max_insts_all_threads, + "Terminate when all threads have reached" + "this inst count", + 0), + INIT_PARAM_DFLT(max_loads_any_thread, + "Terminate when any thread reaches this load count", + 0), + INIT_PARAM_DFLT(max_loads_all_threads, + "Terminate when all threads have reached this load" + "count", + 0), + + INIT_PARAM_DFLT(icache, "L1 instruction cache", NULL), + INIT_PARAM_DFLT(dcache, "L1 data cache", NULL), + + INIT_PARAM_DFLT(cachePorts, "Cache Ports", 200), + INIT_PARAM_DFLT(width, "Width", 1), + INIT_PARAM_DFLT(frontEndWidth, "Front end width", 1), + INIT_PARAM_DFLT(backEndWidth, "Back end width", 1), + INIT_PARAM_DFLT(backEndSquashLatency, "Back end squash latency", 1), + INIT_PARAM_DFLT(backEndLatency, "Back end latency", 1), + INIT_PARAM_DFLT(maxInstBufferSize, "Maximum instruction buffer size", 16), + INIT_PARAM(numPhysicalRegs, "Number of physical registers"), + + INIT_PARAM(decodeToFetchDelay, "Decode to fetch delay"), + INIT_PARAM(renameToFetchDelay, "Rename to fetch delay"), + INIT_PARAM(iewToFetchDelay, "Issue/Execute/Writeback to fetch" + "delay"), + INIT_PARAM(commitToFetchDelay, "Commit to fetch delay"), + INIT_PARAM(fetchWidth, "Fetch width"), + INIT_PARAM(renameToDecodeDelay, "Rename to decode delay"), + INIT_PARAM(iewToDecodeDelay, "Issue/Execute/Writeback to decode" + "delay"), + INIT_PARAM(commitToDecodeDelay, "Commit to decode delay"), + INIT_PARAM(fetchToDecodeDelay, "Fetch to decode delay"), + INIT_PARAM(decodeWidth, "Decode width"), + + INIT_PARAM(iewToRenameDelay, "Issue/Execute/Writeback to rename" + "delay"), + INIT_PARAM(commitToRenameDelay, "Commit to rename delay"), + INIT_PARAM(decodeToRenameDelay, "Decode to rename delay"), + INIT_PARAM(renameWidth, "Rename width"), + + INIT_PARAM(commitToIEWDelay, "Commit to " + "Issue/Execute/Writeback delay"), + INIT_PARAM(renameToIEWDelay, "Rename to " + 
"Issue/Execute/Writeback delay"), + INIT_PARAM(issueToExecuteDelay, "Issue to execute delay (internal" + "to the IEW stage)"), + INIT_PARAM(issueWidth, "Issue width"), + INIT_PARAM(executeWidth, "Execute width"), + INIT_PARAM(executeIntWidth, "Integer execute width"), + INIT_PARAM(executeFloatWidth, "Floating point execute width"), + INIT_PARAM(executeBranchWidth, "Branch execute width"), + INIT_PARAM(executeMemoryWidth, "Memory execute width"), + + INIT_PARAM(iewToCommitDelay, "Issue/Execute/Writeback to commit " + "delay"), + INIT_PARAM(renameToROBDelay, "Rename to reorder buffer delay"), + INIT_PARAM(commitWidth, "Commit width"), + INIT_PARAM(squashWidth, "Squash width"), + + INIT_PARAM(localPredictorSize, "Size of local predictor"), + INIT_PARAM(localCtrBits, "Bits per counter"), + INIT_PARAM(localHistoryTableSize, "Size of local history table"), + INIT_PARAM(localHistoryBits, "Bits for the local history"), + INIT_PARAM(globalPredictorSize, "Size of global predictor"), + INIT_PARAM(globalCtrBits, "Bits per counter"), + INIT_PARAM(globalHistoryBits, "Bits of history"), + INIT_PARAM(choicePredictorSize, "Size of choice predictor"), + INIT_PARAM(choiceCtrBits, "Bits of choice counters"), + + INIT_PARAM(BTBEntries, "Number of BTB entries"), + INIT_PARAM(BTBTagSize, "Size of the BTB tags, in bits"), + + INIT_PARAM(RASSize, "RAS size"), + + INIT_PARAM(LQEntries, "Number of load queue entries"), + INIT_PARAM(SQEntries, "Number of store queue entries"), + INIT_PARAM(LFSTSize, "Last fetched store table size"), + INIT_PARAM(SSITSize, "Store set ID table size"), + + INIT_PARAM(numPhysIntRegs, "Number of physical integer registers"), + INIT_PARAM(numPhysFloatRegs, "Number of physical floating point " + "registers"), + INIT_PARAM(numIQEntries, "Number of instruction queue entries"), + INIT_PARAM(numROBEntries, "Number of reorder buffer entries"), + + INIT_PARAM_DFLT(decoupledFrontEnd, "Decoupled front end", true), + INIT_PARAM_DFLT(dispatchWidth, "Dispatch width", 0), + 
INIT_PARAM_DFLT(wbWidth, "Writeback width", 0), + + INIT_PARAM_DFLT(smtNumFetchingThreads, "SMT Number of Fetching Threads", 1), + INIT_PARAM_DFLT(smtFetchPolicy, "SMT Fetch Policy", "SingleThread"), + INIT_PARAM_DFLT(smtLSQPolicy, "SMT LSQ Sharing Policy", "Partitioned"), + INIT_PARAM_DFLT(smtLSQThreshold,"SMT LSQ Threshold", 100), + INIT_PARAM_DFLT(smtIQPolicy, "SMT IQ Policy", "Partitioned"), + INIT_PARAM_DFLT(smtIQThreshold, "SMT IQ Threshold", 100), + INIT_PARAM_DFLT(smtROBPolicy, "SMT ROB Sharing Policy", "Partitioned"), + INIT_PARAM_DFLT(smtROBThreshold,"SMT ROB Threshold", 100), + INIT_PARAM_DFLT(smtCommitPolicy,"SMT Commit Fetch Policy", "RoundRobin"), + + INIT_PARAM(instShiftAmt, "Number of bits to shift instructions by"), + INIT_PARAM(defer_registration, "defer system registration (for sampling)"), + + INIT_PARAM(function_trace, "Enable function trace"), + INIT_PARAM(function_trace_start, "Cycle to start function trace") + +END_INIT_SIM_OBJECT_PARAMS(DerivOzoneCPU) + +CREATE_SIM_OBJECT(DerivOzoneCPU) +{ + DerivOzoneCPU *cpu; + +#if FULL_SYSTEM + // Full-system only supports a single thread for the moment. + int actual_num_threads = 1; +#else + // In non-full-system mode, we infer the number of threads from + // the workload if it's not explicitly specified. + int actual_num_threads = + numThreads.isValid() ? 
numThreads : workload.size(); + + if (workload.size() == 0) { + fatal("Must specify at least one workload!"); + } + +#endif + + SimpleParams *params = new SimpleParams; + + params->clock = clock; + + params->name = getInstanceName(); + params->numberOfThreads = actual_num_threads; + +#if FULL_SYSTEM + params->system = system; + params->cpu_id = cpu_id; + params->itb = itb; + params->dtb = dtb; +#else + params->workload = workload; +// params->pTable = page_table; +#endif // FULL_SYSTEM + + params->mem = mem; + + params->max_insts_any_thread = max_insts_any_thread; + params->max_insts_all_threads = max_insts_all_threads; + params->max_loads_any_thread = max_loads_any_thread; + params->max_loads_all_threads = max_loads_all_threads; + + // + // Caches + // + params->icacheInterface = icache ? icache->getInterface() : NULL; + params->dcacheInterface = dcache ? dcache->getInterface() : NULL; + params->cachePorts = cachePorts; + + params->width = width; + params->frontEndWidth = frontEndWidth; + params->backEndWidth = backEndWidth; + params->backEndSquashLatency = backEndSquashLatency; + params->backEndLatency = backEndLatency; + params->maxInstBufferSize = maxInstBufferSize; + params->numPhysicalRegs = numPhysIntRegs + numPhysFloatRegs; + + params->decodeToFetchDelay = decodeToFetchDelay; + params->renameToFetchDelay = renameToFetchDelay; + params->iewToFetchDelay = iewToFetchDelay; + params->commitToFetchDelay = commitToFetchDelay; + params->fetchWidth = fetchWidth; + + params->renameToDecodeDelay = renameToDecodeDelay; + params->iewToDecodeDelay = iewToDecodeDelay; + params->commitToDecodeDelay = commitToDecodeDelay; + params->fetchToDecodeDelay = fetchToDecodeDelay; + params->decodeWidth = decodeWidth; + + params->iewToRenameDelay = iewToRenameDelay; + params->commitToRenameDelay = commitToRenameDelay; + params->decodeToRenameDelay = decodeToRenameDelay; + params->renameWidth = renameWidth; + + params->commitToIEWDelay = commitToIEWDelay; + params->renameToIEWDelay = 
renameToIEWDelay; + params->issueToExecuteDelay = issueToExecuteDelay; + params->issueWidth = issueWidth; + params->executeWidth = executeWidth; + params->executeIntWidth = executeIntWidth; + params->executeFloatWidth = executeFloatWidth; + params->executeBranchWidth = executeBranchWidth; + params->executeMemoryWidth = executeMemoryWidth; + + params->iewToCommitDelay = iewToCommitDelay; + params->renameToROBDelay = renameToROBDelay; + params->commitWidth = commitWidth; + params->squashWidth = squashWidth; + + + params->localPredictorSize = localPredictorSize; + params->localCtrBits = localCtrBits; + params->localHistoryTableSize = localHistoryTableSize; + params->localHistoryBits = localHistoryBits; + params->globalPredictorSize = globalPredictorSize; + params->globalCtrBits = globalCtrBits; + params->globalHistoryBits = globalHistoryBits; + params->choicePredictorSize = choicePredictorSize; + params->choiceCtrBits = choiceCtrBits; + + params->BTBEntries = BTBEntries; + params->BTBTagSize = BTBTagSize; + + params->RASSize = RASSize; + + params->LQEntries = LQEntries; + params->SQEntries = SQEntries; + + params->SSITSize = SSITSize; + params->LFSTSize = LFSTSize; + + params->numPhysIntRegs = numPhysIntRegs; + params->numPhysFloatRegs = numPhysFloatRegs; + params->numIQEntries = numIQEntries; + params->numROBEntries = numROBEntries; + + params->decoupledFrontEnd = decoupledFrontEnd; + params->dispatchWidth = dispatchWidth; + params->wbWidth = wbWidth; + + params->smtNumFetchingThreads = smtNumFetchingThreads; + params->smtFetchPolicy = smtFetchPolicy; + params->smtIQPolicy = smtIQPolicy; + params->smtLSQPolicy = smtLSQPolicy; + params->smtLSQThreshold = smtLSQThreshold; + params->smtROBPolicy = smtROBPolicy; + params->smtROBThreshold = smtROBThreshold; + params->smtCommitPolicy = smtCommitPolicy; + + params->instShiftAmt = 2; + + params->deferRegistration = defer_registration; + + params->functionTrace = function_trace; + params->functionTraceStart = 
function_trace_start; + + cpu = new DerivOzoneCPU(params); + + return cpu; +} + +REGISTER_SIM_OBJECT("DerivOzoneCPU", DerivOzoneCPU) + + + +//////////////////////////////////////////////////////////////////////// +// +// OzoneCPU Simulation Object +// + +BEGIN_DECLARE_SIM_OBJECT_PARAMS(SimpleOzoneCPU) + + Param clock; + Param numThreads; + +#if FULL_SYSTEM +SimObjectParam system; +Param cpu_id; +SimObjectParam itb; +SimObjectParam dtb; +#else +SimObjectVectorParam workload; +//SimObjectParam page_table; +#endif // FULL_SYSTEM + +SimObjectParam mem; + +Param max_insts_any_thread; +Param max_insts_all_threads; +Param max_loads_any_thread; +Param max_loads_all_threads; + +SimObjectParam icache; +SimObjectParam dcache; + +Param cachePorts; +Param width; +Param frontEndWidth; +Param backEndWidth; +Param backEndSquashLatency; +Param backEndLatency; +Param maxInstBufferSize; +Param numPhysicalRegs; + +Param decodeToFetchDelay; +Param renameToFetchDelay; +Param iewToFetchDelay; +Param commitToFetchDelay; +Param fetchWidth; + +Param renameToDecodeDelay; +Param iewToDecodeDelay; +Param commitToDecodeDelay; +Param fetchToDecodeDelay; +Param decodeWidth; + +Param iewToRenameDelay; +Param commitToRenameDelay; +Param decodeToRenameDelay; +Param renameWidth; + +Param commitToIEWDelay; +Param renameToIEWDelay; +Param issueToExecuteDelay; +Param issueWidth; +Param executeWidth; +Param executeIntWidth; +Param executeFloatWidth; +Param executeBranchWidth; +Param executeMemoryWidth; + +Param iewToCommitDelay; +Param renameToROBDelay; +Param commitWidth; +Param squashWidth; + +Param localPredictorSize; +Param localCtrBits; +Param localHistoryTableSize; +Param localHistoryBits; +Param globalPredictorSize; +Param globalCtrBits; +Param globalHistoryBits; +Param choicePredictorSize; +Param choiceCtrBits; + +Param BTBEntries; +Param BTBTagSize; + +Param RASSize; + +Param LQEntries; +Param SQEntries; +Param LFSTSize; +Param SSITSize; + +Param numPhysIntRegs; +Param numPhysFloatRegs; +Param 
numIQEntries; +Param numROBEntries; + +Param decoupledFrontEnd; +Param dispatchWidth; +Param wbWidth; + +Param smtNumFetchingThreads; +Param smtFetchPolicy; +Param smtLSQPolicy; +Param smtLSQThreshold; +Param smtIQPolicy; +Param smtIQThreshold; +Param smtROBPolicy; +Param smtROBThreshold; +Param smtCommitPolicy; + +Param instShiftAmt; + +Param defer_registration; + +Param function_trace; +Param function_trace_start; + +END_DECLARE_SIM_OBJECT_PARAMS(SimpleOzoneCPU) + +BEGIN_INIT_SIM_OBJECT_PARAMS(SimpleOzoneCPU) + + INIT_PARAM(clock, "clock speed"), + INIT_PARAM(numThreads, "number of HW thread contexts"), + +#if FULL_SYSTEM + INIT_PARAM(system, "System object"), + INIT_PARAM(cpu_id, "processor ID"), + INIT_PARAM(itb, "Instruction translation buffer"), + INIT_PARAM(dtb, "Data translation buffer"), +#else + INIT_PARAM(workload, "Processes to run"), +// INIT_PARAM(page_table, "Page table"), +#endif // FULL_SYSTEM + + INIT_PARAM_DFLT(mem, "Memory", NULL), + + INIT_PARAM_DFLT(max_insts_any_thread, + "Terminate when any thread reaches this inst count", + 0), + INIT_PARAM_DFLT(max_insts_all_threads, + "Terminate when all threads have reached" + "this inst count", + 0), + INIT_PARAM_DFLT(max_loads_any_thread, + "Terminate when any thread reaches this load count", + 0), + INIT_PARAM_DFLT(max_loads_all_threads, + "Terminate when all threads have reached this load" + "count", + 0), + + INIT_PARAM_DFLT(icache, "L1 instruction cache", NULL), + INIT_PARAM_DFLT(dcache, "L1 data cache", NULL), + + INIT_PARAM_DFLT(cachePorts, "Cache Ports", 200), + INIT_PARAM_DFLT(width, "Width", 1), + INIT_PARAM_DFLT(frontEndWidth, "Front end width", 1), + INIT_PARAM_DFLT(backEndWidth, "Back end width", 1), + INIT_PARAM_DFLT(backEndSquashLatency, "Back end squash latency", 1), + INIT_PARAM_DFLT(backEndLatency, "Back end latency", 1), + INIT_PARAM_DFLT(maxInstBufferSize, "Maximum instruction buffer size", 16), + INIT_PARAM(numPhysicalRegs, "Number of physical registers"), + + 
INIT_PARAM(decodeToFetchDelay, "Decode to fetch delay"), + INIT_PARAM(renameToFetchDelay, "Rename to fetch delay"), + INIT_PARAM(iewToFetchDelay, "Issue/Execute/Writeback to fetch" + "delay"), + INIT_PARAM(commitToFetchDelay, "Commit to fetch delay"), + INIT_PARAM(fetchWidth, "Fetch width"), + INIT_PARAM(renameToDecodeDelay, "Rename to decode delay"), + INIT_PARAM(iewToDecodeDelay, "Issue/Execute/Writeback to decode" + "delay"), + INIT_PARAM(commitToDecodeDelay, "Commit to decode delay"), + INIT_PARAM(fetchToDecodeDelay, "Fetch to decode delay"), + INIT_PARAM(decodeWidth, "Decode width"), + + INIT_PARAM(iewToRenameDelay, "Issue/Execute/Writeback to rename" + "delay"), + INIT_PARAM(commitToRenameDelay, "Commit to rename delay"), + INIT_PARAM(decodeToRenameDelay, "Decode to rename delay"), + INIT_PARAM(renameWidth, "Rename width"), + + INIT_PARAM(commitToIEWDelay, "Commit to " + "Issue/Execute/Writeback delay"), + INIT_PARAM(renameToIEWDelay, "Rename to " + "Issue/Execute/Writeback delay"), + INIT_PARAM(issueToExecuteDelay, "Issue to execute delay (internal" + "to the IEW stage)"), + INIT_PARAM(issueWidth, "Issue width"), + INIT_PARAM(executeWidth, "Execute width"), + INIT_PARAM(executeIntWidth, "Integer execute width"), + INIT_PARAM(executeFloatWidth, "Floating point execute width"), + INIT_PARAM(executeBranchWidth, "Branch execute width"), + INIT_PARAM(executeMemoryWidth, "Memory execute width"), + + INIT_PARAM(iewToCommitDelay, "Issue/Execute/Writeback to commit " + "delay"), + INIT_PARAM(renameToROBDelay, "Rename to reorder buffer delay"), + INIT_PARAM(commitWidth, "Commit width"), + INIT_PARAM(squashWidth, "Squash width"), + + INIT_PARAM(localPredictorSize, "Size of local predictor"), + INIT_PARAM(localCtrBits, "Bits per counter"), + INIT_PARAM(localHistoryTableSize, "Size of local history table"), + INIT_PARAM(localHistoryBits, "Bits for the local history"), + INIT_PARAM(globalPredictorSize, "Size of global predictor"), + INIT_PARAM(globalCtrBits, "Bits per 
counter"), + INIT_PARAM(globalHistoryBits, "Bits of history"), + INIT_PARAM(choicePredictorSize, "Size of choice predictor"), + INIT_PARAM(choiceCtrBits, "Bits of choice counters"), + + INIT_PARAM(BTBEntries, "Number of BTB entries"), + INIT_PARAM(BTBTagSize, "Size of the BTB tags, in bits"), + + INIT_PARAM(RASSize, "RAS size"), + + INIT_PARAM(LQEntries, "Number of load queue entries"), + INIT_PARAM(SQEntries, "Number of store queue entries"), + INIT_PARAM(LFSTSize, "Last fetched store table size"), + INIT_PARAM(SSITSize, "Store set ID table size"), + + INIT_PARAM(numPhysIntRegs, "Number of physical integer registers"), + INIT_PARAM(numPhysFloatRegs, "Number of physical floating point " + "registers"), + INIT_PARAM(numIQEntries, "Number of instruction queue entries"), + INIT_PARAM(numROBEntries, "Number of reorder buffer entries"), + + INIT_PARAM_DFLT(decoupledFrontEnd, "Decoupled front end", true), + INIT_PARAM_DFLT(dispatchWidth, "Dispatch width", 0), + INIT_PARAM_DFLT(wbWidth, "Writeback width", 0), + + INIT_PARAM_DFLT(smtNumFetchingThreads, "SMT Number of Fetching Threads", 1), + INIT_PARAM_DFLT(smtFetchPolicy, "SMT Fetch Policy", "SingleThread"), + INIT_PARAM_DFLT(smtLSQPolicy, "SMT LSQ Sharing Policy", "Partitioned"), + INIT_PARAM_DFLT(smtLSQThreshold,"SMT LSQ Threshold", 100), + INIT_PARAM_DFLT(smtIQPolicy, "SMT IQ Policy", "Partitioned"), + INIT_PARAM_DFLT(smtIQThreshold, "SMT IQ Threshold", 100), + INIT_PARAM_DFLT(smtROBPolicy, "SMT ROB Sharing Policy", "Partitioned"), + INIT_PARAM_DFLT(smtROBThreshold,"SMT ROB Threshold", 100), + INIT_PARAM_DFLT(smtCommitPolicy,"SMT Commit Fetch Policy", "RoundRobin"), + + INIT_PARAM(instShiftAmt, "Number of bits to shift instructions by"), + INIT_PARAM(defer_registration, "defer system registration (for sampling)"), + + INIT_PARAM(function_trace, "Enable function trace"), + INIT_PARAM(function_trace_start, "Cycle to start function trace") + +END_INIT_SIM_OBJECT_PARAMS(SimpleOzoneCPU) + +CREATE_SIM_OBJECT(SimpleOzoneCPU) 
+{ + SimpleOzoneCPU *cpu; + +#if FULL_SYSTEM + // Full-system only supports a single thread for the moment. + int actual_num_threads = 1; +#else + // In non-full-system mode, we infer the number of threads from + // the workload if it's not explicitly specified. + int actual_num_threads = + numThreads.isValid() ? numThreads : workload.size(); + + if (workload.size() == 0) { + fatal("Must specify at least one workload!"); + } + +#endif + + SimpleParams *params = new SimpleParams; + + params->clock = clock; + + params->name = getInstanceName(); + params->numberOfThreads = actual_num_threads; + +#if FULL_SYSTEM + params->system = system; + params->cpu_id = cpu_id; + params->itb = itb; + params->dtb = dtb; +#else + params->workload = workload; +// params->pTable = page_table; +#endif // FULL_SYSTEM + + params->mem = mem; + + params->max_insts_any_thread = max_insts_any_thread; + params->max_insts_all_threads = max_insts_all_threads; + params->max_loads_any_thread = max_loads_any_thread; + params->max_loads_all_threads = max_loads_all_threads; + + // + // Caches + // + params->icacheInterface = icache ? icache->getInterface() : NULL; + params->dcacheInterface = dcache ? 
dcache->getInterface() : NULL; + params->cachePorts = cachePorts; + + params->width = width; + params->frontEndWidth = frontEndWidth; + params->backEndWidth = backEndWidth; + params->backEndSquashLatency = backEndSquashLatency; + params->backEndLatency = backEndLatency; + params->maxInstBufferSize = maxInstBufferSize; + params->numPhysicalRegs = numPhysIntRegs + numPhysFloatRegs; + + params->decodeToFetchDelay = decodeToFetchDelay; + params->renameToFetchDelay = renameToFetchDelay; + params->iewToFetchDelay = iewToFetchDelay; + params->commitToFetchDelay = commitToFetchDelay; + params->fetchWidth = fetchWidth; + + params->renameToDecodeDelay = renameToDecodeDelay; + params->iewToDecodeDelay = iewToDecodeDelay; + params->commitToDecodeDelay = commitToDecodeDelay; + params->fetchToDecodeDelay = fetchToDecodeDelay; + params->decodeWidth = decodeWidth; + + params->iewToRenameDelay = iewToRenameDelay; + params->commitToRenameDelay = commitToRenameDelay; + params->decodeToRenameDelay = decodeToRenameDelay; + params->renameWidth = renameWidth; + + params->commitToIEWDelay = commitToIEWDelay; + params->renameToIEWDelay = renameToIEWDelay; + params->issueToExecuteDelay = issueToExecuteDelay; + params->issueWidth = issueWidth; + params->executeWidth = executeWidth; + params->executeIntWidth = executeIntWidth; + params->executeFloatWidth = executeFloatWidth; + params->executeBranchWidth = executeBranchWidth; + params->executeMemoryWidth = executeMemoryWidth; + + params->iewToCommitDelay = iewToCommitDelay; + params->renameToROBDelay = renameToROBDelay; + params->commitWidth = commitWidth; + params->squashWidth = squashWidth; + + + params->localPredictorSize = localPredictorSize; + params->localCtrBits = localCtrBits; + params->localHistoryTableSize = localHistoryTableSize; + params->localHistoryBits = localHistoryBits; + params->globalPredictorSize = globalPredictorSize; + params->globalCtrBits = globalCtrBits; + params->globalHistoryBits = globalHistoryBits; + 
params->choicePredictorSize = choicePredictorSize; + params->choiceCtrBits = choiceCtrBits; + + params->BTBEntries = BTBEntries; + params->BTBTagSize = BTBTagSize; + + params->RASSize = RASSize; + + params->LQEntries = LQEntries; + params->SQEntries = SQEntries; + + params->SSITSize = SSITSize; + params->LFSTSize = LFSTSize; + + params->numPhysIntRegs = numPhysIntRegs; + params->numPhysFloatRegs = numPhysFloatRegs; + params->numIQEntries = numIQEntries; + params->numROBEntries = numROBEntries; + + params->decoupledFrontEnd = decoupledFrontEnd; + params->dispatchWidth = dispatchWidth; + params->wbWidth = wbWidth; + + params->smtNumFetchingThreads = smtNumFetchingThreads; + params->smtFetchPolicy = smtFetchPolicy; + params->smtIQPolicy = smtIQPolicy; + params->smtLSQPolicy = smtLSQPolicy; + params->smtLSQThreshold = smtLSQThreshold; + params->smtROBPolicy = smtROBPolicy; + params->smtROBThreshold = smtROBThreshold; + params->smtCommitPolicy = smtCommitPolicy; + + params->instShiftAmt = 2; + + params->deferRegistration = defer_registration; + + params->functionTrace = function_trace; + params->functionTraceStart = function_trace_start; + + cpu = new SimpleOzoneCPU(params); + + return cpu; +} + +REGISTER_SIM_OBJECT("SimpleOzoneCPU", SimpleOzoneCPU) + diff --git a/cpu/ozone/cpu_impl.hh b/cpu/ozone/cpu_impl.hh index e7ed3cfe0..36ec30b2c 100644 --- a/cpu/ozone/cpu_impl.hh +++ b/cpu/ozone/cpu_impl.hh @@ -26,23 +26,1137 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __CPU_OOO_CPU_OOO_IMPL_HH__ -#define __CPU_OOO_CPU_OOO_IMPL_HH__ +#include +#include -#include "arch/isa_traits.hh" +#include "arch/isa_traits.hh" // For MachInst +#include "base/trace.hh" +#include "config/full_system.hh" +#include "cpu/base.hh" +#include "cpu/exec_context.hh" +#include "cpu/exetrace.hh" +#include "cpu/ozone/cpu.hh" +#include "cpu/quiesce_event.hh" +#include "cpu/static_inst.hh" +#include "mem/base_mem.hh" +#include "mem/mem_interface.hh" +#include "sim/sim_object.hh" +#include "sim/stats.hh" + +#if FULL_SYSTEM +#include "arch/faults.hh" +#include "arch/alpha/osfpal.hh" +#include "arch/alpha/tlb.hh" +#include "arch/vtophys.hh" +#include "base/callback.hh" +#include "base/remote_gdb.hh" +#include "cpu/profile.hh" +#include "kern/kernel_stats.hh" +#include "mem/functional/memory_control.hh" +#include "mem/functional/physical.hh" +#include "sim/faults.hh" +#include "sim/sim_events.hh" +#include "sim/sim_exit.hh" +#include "sim/system.hh" +#else // !FULL_SYSTEM +#include "mem/functional/functional.hh" +#include "sim/process.hh" +#endif // FULL_SYSTEM + +using namespace TheISA; template -class OoOCPU; +template +void +OzoneCPU::trace_data(T data) { + if (traceData) { + traceData->setData(data); + } +} template -class OoODynInst; +OzoneCPU::TickEvent::TickEvent(OzoneCPU *c, int w) + : Event(&mainEventQueue, CPU_Tick_Pri), cpu(c), width(w) +{ +} -struct OoOImpl { - typedef AlphaISA ISA; - typedef OoOCPU OoOCPU; - typedef OoOCPU FullCPU; - typedef OoODynInst DynInst; - typedef RefCountingPtr DynInstPtr; -}; +template +void +OzoneCPU::TickEvent::process() +{ + cpu->tick(); +} -#endif // __CPU_OOO_CPU_OOO_IMPL_HH__ +template +const char * +OzoneCPU::TickEvent::description() +{ + return "OzoneCPU tick event"; +} +/* +template +OzoneCPU::ICacheCompletionEvent::ICacheCompletionEvent(OzoneCPU *_cpu) + : Event(&mainEventQueue), + cpu(_cpu) +{ +} + +template +void +OzoneCPU::ICacheCompletionEvent::process() +{ + cpu->processICacheCompletion(); +} + 
+template +const char * +OzoneCPU::ICacheCompletionEvent::description() +{ + return "OzoneCPU I-cache completion event"; +} + +template +OzoneCPU::DCacheCompletionEvent:: +DCacheCompletionEvent(OzoneCPU *_cpu, + DynInstPtr &_inst, + DCacheCompEventIt &_dcceIt) + : Event(&mainEventQueue), + cpu(_cpu), + inst(_inst), + dcceIt(_dcceIt) +{ + this->setFlags(Event::AutoDelete); +} + +template +void +OzoneCPU::DCacheCompletionEvent::process() +{ + inst->setCompleted(); + + // Maybe remove the EA from the list of addrs? + cpu->eaList.clearAddr(inst->seqNum, inst->getEA()); + cpu->dCacheCompList.erase(this->dcceIt); +} + +template +const char * +OzoneCPU::DCacheCompletionEvent::description() +{ + return "OzoneCPU D-cache completion event"; +} +*/ +template +OzoneCPU::OzoneCPU(Params *p) +#if FULL_SYSTEM + : BaseCPU(p), thread(this, 0, p->mem), tickEvent(this, p->width), +#else + : BaseCPU(p), thread(this, 0, p->workload[0], 0), tickEvent(this, p->width), +#endif + comm(5, 5) +{ + frontEnd = new FrontEnd(p); + backEnd = new BackEnd(p); + + _status = Idle; + thread.xcProxy = &xcProxy; + + thread.inSyscall = false; + + xcProxy.cpu = this; + xcProxy.thread = &thread; + + thread.setStatus(ExecContext::Suspended); +#if FULL_SYSTEM +// xc = new ExecContext(this, 0, p->system, p->itb, p->dtb, p->mem); + + /***** All thread state stuff *****/ + thread.cpu = this; + thread.tid = 0; + thread.mem = p->mem; + + thread.quiesceEvent = new EndQuiesceEvent(&xcProxy); + + system = p->system; + itb = p->itb; + dtb = p->dtb; + memctrl = p->system->memctrl; + physmem = p->system->physmem; + + if (p->profile) { + thread.profile = new FunctionProfile(p->system->kernelSymtab); + Callback *cb = + new MakeCallback(&xcProxy); + registerExitCallback(cb); + } + + // let's fill with a dummy node for now so we don't get a segfault + // on the first cycle when there's no node available. 
+ static ProfileNode dummyNode; + thread.profileNode = &dummyNode; + thread.profilePC = 3; + +#else +// xc = new ExecContext(this, /* thread_num */ 0, p->workload[0], /* asid */ 0); + thread.cpu = this; + thread.tid = 0; + thread.process = p->workload[0]; +// thread.mem = thread.process->getMemory(); + thread.asid = 0; +#endif // !FULL_SYSTEM +/* + icacheInterface = p->icache_interface; + dcacheInterface = p->dcache_interface; + + cacheMemReq = new MemReq(); + cacheMemReq->xc = xc; + cacheMemReq->asid = 0; + cacheMemReq->data = new uint8_t[64]; +*/ + numInst = 0; + startNumInst = 0; +/* numLoad = 0; + startNumLoad = 0; + lastIcacheStall = 0; + lastDcacheStall = 0; + + issueWidth = p->issueWidth; +*/ + execContexts.push_back(&xcProxy); + + frontEnd->setCPU(this); + backEnd->setCPU(this); + + frontEnd->setXC(&xcProxy); + backEnd->setXC(&xcProxy); + + frontEnd->setThreadState(&thread); + backEnd->setThreadState(&thread); + + frontEnd->setCommBuffer(&comm); + backEnd->setCommBuffer(&comm); + + frontEnd->setBackEnd(backEnd); + backEnd->setFrontEnd(frontEnd); + + decoupledFrontEnd = p->decoupledFrontEnd; + + globalSeqNum = 1; + + checkInterrupts = false; +/* + fetchRedirBranch = true; + fetchRedirExcp = true; + + // Need to initialize the rename maps, and the head and tail pointers. + robHeadPtr = new DynInst(this); + robTailPtr = new DynInst(this); + + robHeadPtr->setNextInst(robTailPtr); +// robHeadPtr->setPrevInst(NULL); +// robTailPtr->setNextInst(NULL); + robTailPtr->setPrevInst(robHeadPtr); + + robHeadPtr->setCompleted(); + robTailPtr->setCompleted(); + + for (int i = 0; i < ISA::TotalNumRegs; ++i) { + renameTable[i] = new DynInst(this); + commitTable[i] = new DynInst(this); + + renameTable[i]->setCompleted(); + commitTable[i]->setCompleted(); + } + +#if FULL_SYSTEM + for (int i = 0; i < ISA::NumIntRegs; ++i) { + palShadowTable[i] = new DynInst(this); + palShadowTable[i]->setCompleted(); + } +#endif + + // Size of cache block. + cacheBlkSize = icacheInterface ? 
icacheInterface->getBlockSize() : 64; + + // Create mask to get rid of offset bits. + cacheBlkMask = (cacheBlkSize - 1); + + // Get the size of an instruction. + instSize = sizeof(MachInst); + + // Create space to store a cache line. + cacheData = new uint8_t[cacheBlkSize]; + + cacheBlkValid = false; +*/ + for (int i = 0; i < TheISA::TotalNumRegs; ++i) { + thread.renameTable[i] = new DynInst(this); + thread.renameTable[i]->setCompleted(); + } + + frontEnd->renameTable.copyFrom(thread.renameTable); + backEnd->renameTable.copyFrom(thread.renameTable); + +#if !FULL_SYSTEM + pTable = p->pTable; +#endif + + DPRINTF(OzoneCPU, "OzoneCPU: Created Ozone cpu object.\n"); +} + +template +OzoneCPU::~OzoneCPU() +{ +} +/* +template +void +OzoneCPU::copyFromXC() +{ + for (int i = 0; i < TheISA::TotalNumRegs; ++i) { + if (i < TheISA::NumIntRegs) { + renameTable[i]->setIntResult(xc->readIntReg(i)); + } else if (i < TheISA::NumFloatRegs) { + renameTable[i]->setDoubleResult(xc->readFloatRegDouble(i)); + } + } + + DPRINTF(OzoneCPU, "Func Exe inst is: %i\n", xc->func_exe_inst); + backEnd->funcExeInst = xc->func_exe_inst; +// PC = xc->readPC(); +// nextPC = xc->regs.npc; +} + +template +void +OzoneCPU::copyToXC() +{ + for (int i = 0; i < TheISA::TotalNumRegs; ++i) { + if (i < TheISA::NumIntRegs) { + xc->setIntReg(i, renameTable[i]->readIntResult()); + } else if (i < TheISA::NumFloatRegs) { + xc->setFloatRegDouble(i, renameTable[i]->readDoubleResult()); + } + } + + this->xc->regs.miscRegs.fpcr = this->regFile.miscRegs[tid].fpcr; + this->xc->regs.miscRegs.uniq = this->regFile.miscRegs[tid].uniq; + this->xc->regs.miscRegs.lock_flag = this->regFile.miscRegs[tid].lock_flag; + this->xc->regs.miscRegs.lock_addr = this->regFile.miscRegs[tid].lock_addr; + + xc->func_exe_inst = backEnd->funcExeInst; + xc->regs.pc = PC; + xc->regs.npc = nextPC; +} +*/ +template +void +OzoneCPU::switchOut() +{ + _status = SwitchedOut; + if (tickEvent.scheduled()) + tickEvent.squash(); +} + +template +void 
+OzoneCPU::takeOverFrom(BaseCPU *oldCPU) +{ + BaseCPU::takeOverFrom(oldCPU); + + assert(!tickEvent.scheduled()); + + // if any of this CPU's ExecContexts are active, mark the CPU as + // running and schedule its tick event. + for (int i = 0; i < execContexts.size(); ++i) { + ExecContext *xc = execContexts[i]; + if (xc->status() == ExecContext::Active && + _status != Running) { + _status = Running; + tickEvent.schedule(curTick); + } + } +} + +template +void +OzoneCPU::activateContext(int thread_num, int delay) +{ + // Eventually change this in SMT. + assert(thread_num == 0); +// assert(xcProxy); + + assert(_status == Idle); + notIdleFraction++; + scheduleTickEvent(delay); + _status = Running; + thread._status = ExecContext::Active; +} + +template +void +OzoneCPU::suspendContext(int thread_num) +{ + // Eventually change this in SMT. + assert(thread_num == 0); +// assert(xcProxy); + + assert(_status == Running); + notIdleFraction--; + unscheduleTickEvent(); + _status = Idle; +} + +template +void +OzoneCPU::deallocateContext(int thread_num) +{ + // for now, these are equivalent + suspendContext(thread_num); +} + +template +void +OzoneCPU::haltContext(int thread_num) +{ + // for now, these are equivalent + suspendContext(thread_num); +} + +template +void +OzoneCPU::regStats() +{ + using namespace Stats; + + BaseCPU::regStats(); + + thread.numInsts + .name(name() + ".num_insts") + .desc("Number of instructions executed") + ; + + thread.numMemRefs + .name(name() + ".num_refs") + .desc("Number of memory references") + ; + + notIdleFraction + .name(name() + ".not_idle_fraction") + .desc("Percentage of non-idle cycles") + ; + + idleFraction + .name(name() + ".idle_fraction") + .desc("Percentage of idle cycles") + ; + + idleFraction = constant(1.0) - notIdleFraction; + + frontEnd->regStats(); + backEnd->regStats(); +} + +template +void +OzoneCPU::resetStats() +{ + startNumInst = numInst; + notIdleFraction = (_status != Idle); +} + +template +void +OzoneCPU::init() +{ + 
BaseCPU::init(); +/* + copyFromXC(); + + // ALso copy over PC/nextPC. This isn't normally copied in "copyFromXC()" + // so that the XC doesn't mess up the PC when returning from a syscall. + PC = xc->readPC(); + nextPC = xc->regs.npc; +*/ + // Mark this as in syscall so it won't need to squash + thread.inSyscall = true; +#if FULL_SYSTEM + for (int i = 0; i < execContexts.size(); ++i) { + ExecContext *xc = execContexts[i]; + + // initialize CPU, including PC + TheISA::initCPU(xc, xc->readCpuId()); + } +#endif + frontEnd->renameTable.copyFrom(thread.renameTable); + backEnd->renameTable.copyFrom(thread.renameTable); + + thread.inSyscall = false; +} + +template +void +OzoneCPU::serialize(std::ostream &os) +{ + // At this point, all DCacheCompEvents should be processed. + + BaseCPU::serialize(os); + SERIALIZE_ENUM(_status); + nameOut(os, csprintf("%s.xc", name())); + xcProxy.serialize(os); + nameOut(os, csprintf("%s.tickEvent", name())); + tickEvent.serialize(os); +} + +template +void +OzoneCPU::unserialize(Checkpoint *cp, const std::string §ion) +{ + BaseCPU::unserialize(cp, section); + UNSERIALIZE_ENUM(_status); + xcProxy.unserialize(cp, csprintf("%s.xc", section)); + tickEvent.unserialize(cp, csprintf("%s.tickEvent", section)); +} + +template +Fault +OzoneCPU::copySrcTranslate(Addr src) +{ + panic("Copy not implemented!\n"); + return NoFault; +#if 0 + static bool no_warn = true; + int blk_size = (dcacheInterface) ? dcacheInterface->getBlockSize() : 64; + // Only support block sizes of 64 atm. 
+ assert(blk_size == 64); + int offset = src & (blk_size - 1); + + // Make sure block doesn't span page + if (no_warn && + (src & TheISA::PageMask) != ((src + blk_size) & TheISA::PageMask) && + (src >> 40) != 0xfffffc) { + warn("Copied block source spans pages %x.", src); + no_warn = false; + } + + memReq->reset(src & ~(blk_size - 1), blk_size); + + // translate to physical address + Fault fault = xc->translateDataReadReq(memReq); + + assert(fault != Alignment_Fault); + + if (fault == NoFault) { + xc->copySrcAddr = src; + xc->copySrcPhysAddr = memReq->paddr + offset; + } else { + xc->copySrcAddr = 0; + xc->copySrcPhysAddr = 0; + } + return fault; +#endif +} + +template +Fault +OzoneCPU::copy(Addr dest) +{ + panic("Copy not implemented!\n"); + return NoFault; +#if 0 + static bool no_warn = true; + int blk_size = (dcacheInterface) ? dcacheInterface->getBlockSize() : 64; + // Only support block sizes of 64 atm. + assert(blk_size == 64); + uint8_t data[blk_size]; + //assert(xc->copySrcAddr); + int offset = dest & (blk_size - 1); + + // Make sure block doesn't span page + if (no_warn && + (dest & TheISA::PageMask) != ((dest + blk_size) & TheISA::PageMask) && + (dest >> 40) != 0xfffffc) { + no_warn = false; + warn("Copied block destination spans pages %x. ", dest); + } + + memReq->reset(dest & ~(blk_size -1), blk_size); + // translate to physical address + Fault fault = xc->translateDataWriteReq(memReq); + + assert(fault != Alignment_Fault); + + if (fault == NoFault) { + Addr dest_addr = memReq->paddr + offset; + // Need to read straight from memory since we have more than 8 bytes. 
+ memReq->paddr = xc->copySrcPhysAddr; + xc->mem->read(memReq, data); + memReq->paddr = dest_addr; + xc->mem->write(memReq, data); + if (dcacheInterface) { + memReq->cmd = Copy; + memReq->completionEvent = NULL; + memReq->paddr = xc->copySrcPhysAddr; + memReq->dest = dest_addr; + memReq->size = 64; + memReq->time = curTick; + dcacheInterface->access(memReq); + } + } + return fault; +#endif +} + +#if FULL_SYSTEM +template +Addr +OzoneCPU::dbg_vtophys(Addr addr) +{ + return vtophys(&xcProxy, addr); +} +#endif // FULL_SYSTEM +/* +template +void +OzoneCPU::processICacheCompletion() +{ + switch (status()) { + case IcacheMiss: + DPRINTF(OzoneCPU, "OzoneCPU: Finished Icache miss.\n"); + + icacheStallCycles += curTick - lastIcacheStall; + _status = IcacheMissComplete; + cacheBlkValid = true; +// scheduleTickEvent(1); + break; + case SwitchedOut: + // If this CPU has been switched out due to sampling/warm-up, + // ignore any further status changes (e.g., due to cache + // misses outstanding at the time of the switch). + return; + default: + panic("OzoneCPU::processICacheCompletion: bad state"); + break; + } +} +*/ +#if FULL_SYSTEM +template +void +OzoneCPU::post_interrupt(int int_num, int index) +{ + BaseCPU::post_interrupt(int_num, index); + + if (thread._status == ExecContext::Suspended) { + DPRINTF(IPI,"Suspended Processor awoke\n"); +// thread.activate(); + // Hack for now. Otherwise might have to go through the xcProxy, or + // I need to figure out what's the right thing to call. + activateContext(thread.tid, 1); + } +} +#endif // FULL_SYSTEM + +/* start simulation, program loaded, processor precise state initialized */ +template +void +OzoneCPU::tick() +{ + DPRINTF(OzoneCPU, "\n\nOzoneCPU: Ticking cpu.\n"); + + thread.renameTable[ZeroReg]->setIntResult(0); + thread.renameTable[ZeroReg+TheISA::FP_Base_DepTag]-> + setDoubleResult(0.0); + + // General code flow: + // Check for any interrupts. Handle them if I do have one. 
+ // Check if I have a need to fetch a new cache block. Either a bit could be + // set by functions indicating that I need to fetch a new block, or I could + // hang onto the last PC of the last cache block I fetched and compare the + // current PC to that. Setting a bit seems nicer but may be more error + // prone. + // Scan through the IQ to figure out if there's anything I can issue/execute + // Might need something close to the FU Pools to tell what instructions + // I can issue. How to handle loads and stores vs other insts? + // Extremely slow way: find first inst that can possibly issue; if it's a + // load or a store, then iterate through load/store queue. + // If I can't find instructions to execute and I've got room in the IQ + // (which is just a counter), then grab a few instructions out of the cache + // line buffer until I either run out or can execute up until my limit. + + numCycles++; + + traceData = NULL; + +// Fault fault = NoFault; + +#if 0 // FULL_SYSTEM + if (checkInterrupts && check_interrupts() && !inPalMode() && + status() != IcacheMissComplete) { + int ipl = 0; + int summary = 0; + checkInterrupts = false; + + if (readMiscReg(IPR_SIRR)) { + for (int i = INTLEVEL_SOFTWARE_MIN; + i < INTLEVEL_SOFTWARE_MAX; i++) { + if (readMiscReg(IPR_SIRR) & (ULL(1) << i)) { + // See table 4-19 of 21164 hardware reference + ipl = (i - INTLEVEL_SOFTWARE_MIN) + 1; + summary |= (ULL(1) << i); + } + } + } + + // Is this method so that if the interrupts are switched over from + // another CPU they'll still be handled? 
+// uint64_t interrupts = cpuXC->cpu->intr_status(); + uint64_t interrupts = intr_status(); + for (int i = INTLEVEL_EXTERNAL_MIN; + i < INTLEVEL_EXTERNAL_MAX; i++) { + if (interrupts & (ULL(1) << i)) { + // See table 4-19 of 21164 hardware reference + ipl = i; + summary |= (ULL(1) << i); + } + } + + if (readMiscReg(IPR_ASTRR)) + panic("asynchronous traps not implemented\n"); + + if (ipl && ipl > readMiscReg(IPR_IPLR)) { + setMiscReg(IPR_ISR, summary); + setMiscReg(IPR_INTID, ipl); + + Fault(new InterruptFault)->invoke(xc); + + DPRINTF(Flow, "Interrupt! IPLR=%d ipl=%d summary=%x\n", + readMiscReg(IPR_IPLR), ipl, summary); + } + } +#endif + + // Make call to ISA to ensure 0 register semantics...actually because the + // DynInsts will generally be the register file, this should only have to + // happen when the xc is actually written to (during a syscall or something) + // maintain $r0 semantics +// assert(renameTable[ZeroReg]->readIntResult() == 0); +#ifdef TARGET_ALPHA +// assert(renameTable[ZeroReg]->readDoubleResult() == 0); +#endif // TARGET_ALPHA + + comm.advance(); + frontEnd->tick(); + backEnd->tick(); + + // Do this here? For now the front end will control the PC. +// PC = nextPC; + + // check for instruction-count-based events + comInstEventQueue[0]->serviceEvents(numInst); + + if (!tickEvent.scheduled()) + tickEvent.schedule(curTick + 1); +} + +template +void +OzoneCPU::squashFromXC() +{ + thread.inSyscall = true; + backEnd->squashFromXC(); +} + +#if !FULL_SYSTEM +template +void +OzoneCPU::syscall() +{ + // Not sure this copy is needed, depending on how the XC proxy is made. 
+ thread.renameTable.copyFrom(backEnd->renameTable); + + thread.inSyscall = true; + + thread.funcExeInst++; + + DPRINTF(OzoneCPU, "FuncExeInst: %i\n", thread.funcExeInst); + + thread.process->syscall(&xcProxy); + + thread.funcExeInst--; + + thread.inSyscall = false; + + frontEnd->renameTable.copyFrom(thread.renameTable); + backEnd->renameTable.copyFrom(thread.renameTable); +} + +template +void +OzoneCPU::setSyscallReturn(SyscallReturn return_value, int tid) +{ + // check for error condition. Alpha syscall convention is to + // indicate success/failure in reg a3 (r19) and put the + // return value itself in the standard return value reg (v0). + if (return_value.successful()) { + // no error + thread.renameTable[SyscallSuccessReg]->setIntResult(0); + thread.renameTable[ReturnValueReg]->setIntResult(return_value.value()); + } else { + // got an error, return details + thread.renameTable[SyscallSuccessReg]->setIntResult((IntReg) -1); + thread.renameTable[ReturnValueReg]->setIntResult(-return_value.value()); + } +} +#else +template +Fault +OzoneCPU::hwrei() +{ + // Need to move this to ISA code + // May also need to make this per thread + if (!inPalMode()) + return new UnimplementedOpcodeFault; + + thread.setNextPC(thread.readMiscReg(AlphaISA::IPR_EXC_ADDR)); + + // Not sure how to make a similar check in the Ozone model +// if (!misspeculating()) { + kernelStats->hwrei(); + + checkInterrupts = true; +// } + + // FIXME: XXX check for interrupts? 
XXX + return NoFault; +} + +template +bool +OzoneCPU::simPalCheck(int palFunc) +{ + // Need to move this to ISA code + // May also need to make this per thread + this->kernelStats->callpal(palFunc, &xcProxy); + + switch (palFunc) { + case PAL::halt: + haltContext(thread.tid); + if (--System::numSystemsRunning == 0) + new SimExitEvent("all cpus halted"); + break; + + case PAL::bpt: + case PAL::bugchk: + if (system->breakpoint()) + return false; + break; + } + + return true; +} +#endif + +template +BaseCPU * +OzoneCPU::OzoneXC::getCpuPtr() +{ + return cpu; +} + +template +void +OzoneCPU::OzoneXC::setCpuId(int id) +{ + cpu->cpuId = id; + thread->cpuId = id; +} + +template +void +OzoneCPU::OzoneXC::setStatus(Status new_status) +{ +// cpu->_status = new_status; + thread->_status = new_status; +} + +template +void +OzoneCPU::OzoneXC::activate(int delay) +{ + cpu->activateContext(thread->tid, delay); +} + +/// Set the status to Suspended. +template +void +OzoneCPU::OzoneXC::suspend() +{ + cpu->suspendContext(thread->tid); +} + +/// Set the status to Unallocated. +template +void +OzoneCPU::OzoneXC::deallocate() +{ + cpu->deallocateContext(thread->tid); +} + +/// Set the status to Halted. 
+template +void +OzoneCPU::OzoneXC::halt() +{ + cpu->haltContext(thread->tid); +} + +#if FULL_SYSTEM +template +void +OzoneCPU::OzoneXC::dumpFuncProfile() +{ } +#endif + +template +void +OzoneCPU::OzoneXC::takeOverFrom(ExecContext *old_context) +{ } + +template +void +OzoneCPU::OzoneXC::regStats(const std::string &name) +{ } + +template +void +OzoneCPU::OzoneXC::serialize(std::ostream &os) +{ } + +template +void +OzoneCPU::OzoneXC::unserialize(Checkpoint *cp, const std::string §ion) +{ } + +#if FULL_SYSTEM +template +Event * +OzoneCPU::OzoneXC::getQuiesceEvent() +{ + return thread->quiesceEvent; +} + +template +Tick +OzoneCPU::OzoneXC::readLastActivate() +{ + return thread->lastActivate; +} + +template +Tick +OzoneCPU::OzoneXC::readLastSuspend() +{ + return thread->lastSuspend; +} + +template +void +OzoneCPU::OzoneXC::profileClear() +{ + if (thread->profile) + thread->profile->clear(); +} + +template +void +OzoneCPU::OzoneXC::profileSample() +{ + if (thread->profile) + thread->profile->sample(thread->profileNode, thread->profilePC); +} +#endif + +template +int +OzoneCPU::OzoneXC::getThreadNum() +{ + return thread->tid; +} + +// Also somewhat obnoxious. Really only used for the TLB fault. 
+template +TheISA::MachInst +OzoneCPU::OzoneXC::getInst() +{ + return thread->inst; +} + +template +void +OzoneCPU::OzoneXC::copyArchRegs(ExecContext *xc) +{ + thread->PC = xc->readPC(); + thread->nextPC = xc->readNextPC(); + + cpu->frontEnd->setPC(thread->PC); + cpu->frontEnd->setNextPC(thread->nextPC); + + for (int i = 0; i < TheISA::TotalNumRegs; ++i) { + if (i < TheISA::FP_Base_DepTag) { + thread->renameTable[i]->setIntResult(xc->readIntReg(i)); + } else if (i < (TheISA::FP_Base_DepTag + TheISA::NumFloatRegs)) { + int fp_idx = i - TheISA::FP_Base_DepTag; + thread->renameTable[i]->setDoubleResult( + xc->readFloatRegDouble(fp_idx)); + } + } + +#if !FULL_SYSTEM + thread->funcExeInst = xc->readFuncExeInst(); +#endif + + // Need to copy the XC values into the current rename table, + // copy the misc regs. + thread->regs.miscRegs.copyMiscRegs(xc); +} + +template +void +OzoneCPU::OzoneXC::clearArchRegs() +{ + panic("Unimplemented!"); +} + +template +uint64_t +OzoneCPU::OzoneXC::readIntReg(int reg_idx) +{ + return thread->renameTable[reg_idx]->readIntResult(); +} + +template +float +OzoneCPU::OzoneXC::readFloatRegSingle(int reg_idx) +{ + return thread->renameTable[reg_idx]->readFloatResult(); +} + +template +double +OzoneCPU::OzoneXC::readFloatRegDouble(int reg_idx) +{ + return thread->renameTable[reg_idx]->readDoubleResult(); +} + +template +uint64_t +OzoneCPU::OzoneXC::readFloatRegInt(int reg_idx) +{ + return thread->renameTable[reg_idx]->readIntResult(); +} + +template +void +OzoneCPU::OzoneXC::setIntReg(int reg_idx, uint64_t val) +{ + thread->renameTable[reg_idx]->setIntResult(val); + + if (!thread->inSyscall) { + cpu->squashFromXC(); + } +} + +template +void +OzoneCPU::OzoneXC::setFloatRegSingle(int reg_idx, float val) +{ + panic("Unimplemented!"); +} + +template +void +OzoneCPU::OzoneXC::setFloatRegDouble(int reg_idx, double val) +{ + thread->renameTable[reg_idx]->setDoubleResult(val); + + if (!thread->inSyscall) { + cpu->squashFromXC(); + } +} + +template +void 
+OzoneCPU::OzoneXC::setFloatRegInt(int reg_idx, uint64_t val) +{ + panic("Unimplemented!"); +} + +template +void +OzoneCPU::OzoneXC::setPC(Addr val) +{ + thread->PC = val; + cpu->frontEnd->setPC(val); + + if (!thread->inSyscall) { + cpu->squashFromXC(); + } +} + +template +void +OzoneCPU::OzoneXC::setNextPC(Addr val) +{ + thread->nextPC = val; + cpu->frontEnd->setNextPC(val); + + if (!thread->inSyscall) { + cpu->squashFromXC(); + } +} + +template +TheISA::MiscReg +OzoneCPU::OzoneXC::readMiscReg(int misc_reg) +{ + return thread->regs.miscRegs.readReg(misc_reg); +} + +template +TheISA::MiscReg +OzoneCPU::OzoneXC::readMiscRegWithEffect(int misc_reg, Fault &fault) +{ + return thread->regs.miscRegs.readRegWithEffect(misc_reg, + fault, this); +} + +template +Fault +OzoneCPU::OzoneXC::setMiscReg(int misc_reg, const MiscReg &val) +{ + // Needs to setup a squash event unless we're in syscall mode + Fault ret_fault = thread->regs.miscRegs.setReg(misc_reg, val); + + if (!thread->inSyscall) { + cpu->squashFromXC(); + } + + return ret_fault; +} + +template +Fault +OzoneCPU::OzoneXC::setMiscRegWithEffect(int misc_reg, const MiscReg &val) +{ + // Needs to setup a squash event unless we're in syscall mode + Fault ret_fault = thread->regs.miscRegs.setRegWithEffect(misc_reg, val, + this); + + if (!thread->inSyscall) { + cpu->squashFromXC(); + } + + return ret_fault; +} diff --git a/cpu/ozone/dyn_inst.cc b/cpu/ozone/dyn_inst.cc new file mode 100644 index 000000000..3bf8b03ca --- /dev/null +++ b/cpu/ozone/dyn_inst.cc @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2005 The Regents of The University of Michigan + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "cpu/ozone/dyn_inst_impl.hh" +#include "cpu/ozone/ozone_impl.hh" +#include "cpu/ozone/simple_impl.hh" + +template class OzoneDynInst; +template class OzoneDynInst; + diff --git a/cpu/ozone/dyn_inst.hh b/cpu/ozone/dyn_inst.hh new file mode 100644 index 000000000..4382af0fd --- /dev/null +++ b/cpu/ozone/dyn_inst.hh @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2005 The Regents of The University of Michigan + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __CPU_OZONE_DYN_INST_HH__ +#define __CPU_OZONE_DYN_INST_HH__ + +#include "arch/isa_traits.hh" +#include "config/full_system.hh" +#include "cpu/base_dyn_inst.hh" +#include "cpu/ozone/cpu.hh" // MUST include this +#include "cpu/inst_seq.hh" +#include "cpu/ozone/simple_impl.hh" // Would be nice to not have to include this +#include "cpu/ozone/ozone_impl.hh" + +#include +#include + +template +class OzoneDynInst : public BaseDynInst +{ + public: + // Typedefs + typedef typename Impl::FullCPU FullCPU; + + typedef typename FullCPU::ImplState ImplState; + + // Typedef for DynInstPtr. This is really just a RefCountingPtr. + typedef typename Impl::DynInstPtr DynInstPtr; + +// typedef typename Impl::BranchPred::BPredInfo BPredInfo; + + typedef TheISA::ExtMachInst ExtMachInst; + typedef TheISA::MachInst MachInst; + typedef TheISA::MiscReg MiscReg; + typedef typename std::list::iterator ListIt; + + // Note that this is duplicated from the BaseDynInst class; I'm simply not + // sure the enum would carry through so I could use it in array + // declarations in this class. 
+ enum { + MaxInstSrcRegs = TheISA::MaxInstSrcRegs, + MaxInstDestRegs = TheISA::MaxInstDestRegs + }; + + OzoneDynInst(FullCPU *cpu); + + OzoneDynInst(ExtMachInst inst, Addr PC, Addr Pred_PC, + InstSeqNum seq_num, FullCPU *cpu); + + OzoneDynInst(StaticInstPtr inst); + + ~OzoneDynInst(); + + void setSrcInst(DynInstPtr &newSrcInst, int regIdx) + { srcInsts[regIdx] = newSrcInst; } + + bool srcInstReady(int regIdx); + + void setPrevDestInst(DynInstPtr &oldDestInst, int regIdx) + { prevDestInst[regIdx] = oldDestInst; } + + DynInstPtr &getPrevDestInst(int regIdx) + { return prevDestInst[regIdx]; } + + void addDependent(DynInstPtr &dependent_inst); + + std::vector &getDependents() { return dependents; } + + void wakeDependents(); + +// void setBPredInfo(const BPredInfo &bp_info) { bpInfo = bp_info; } + +// BPredInfo &getBPredInfo() { return bpInfo; } + +// OzoneXC *thread; + + private: + void initInstPtrs(); + + std::vector dependents; + + /** The instruction that produces the value of the source registers. These + * may be NULL if the value has already been read from the source + * instruction. + */ + DynInstPtr srcInsts[MaxInstSrcRegs]; + + /** + * Previous rename instruction for this destination. + */ + DynInstPtr prevDestInst[MaxInstSrcRegs]; + +// BPredInfo bpInfo; + + public: + + Fault initiateAcc(); + + Fault completeAcc(); +/* + template + Fault read(Addr addr, T &data, unsigned flags); + + template + Fault write(T data, Addr addr, unsigned flags, uint64_t *res); +*/ + // The register accessor methods provide the index of the + // instruction's operand (e.g., 0 or 1), not the architectural + // register index, to simplify the implementation of register + // renaming. We find the architectural register index by indexing + // into the instruction's own operand index table. Note that a + // raw pointer to the StaticInst is provided instead of a + // ref-counted StaticInstPtr to redice overhead. 
This is fine as + // long as these methods don't copy the pointer into any long-term + // storage (which is pretty hard to imagine they would have reason + // to do). + + uint64_t readIntReg(const StaticInst *si, int idx) + { + return srcInsts[idx]->readIntResult(); + } + + float readFloatRegSingle(const StaticInst *si, int idx) + { + return srcInsts[idx]->readFloatResult(); + } + + double readFloatRegDouble(const StaticInst *si, int idx) + { + return srcInsts[idx]->readDoubleResult(); + } + + uint64_t readFloatRegInt(const StaticInst *si, int idx) + { + return srcInsts[idx]->readIntResult(); + } + + /** @todo: Make results into arrays so they can handle multiple dest + * registers. + */ + void setIntReg(const StaticInst *si, int idx, uint64_t val) + { + this->instResult.integer = val; + } + + void setFloatRegSingle(const StaticInst *si, int idx, float val) + { + this->instResult.fp = val; + } + + void setFloatRegDouble(const StaticInst *si, int idx, double val) + { + this->instResult.dbl = val; + } + + void setFloatRegInt(const StaticInst *si, int idx, uint64_t val) + { + this->instResult.integer = val; + } + + void setIntResult(uint64_t result) { this->instResult.integer = result; } + void setDoubleResult(double result) { this->instResult.dbl = result; } + + bool srcsReady(); + bool eaSrcsReady(); + + Fault execute(); + + Fault executeEAComp() + { return NoFault; } + + Fault executeMemAcc() + { return this->staticInst->memAccInst()->execute(this, this->traceData); } + + void clearDependents(); + + public: + // ISA stuff + MiscReg readMiscReg(int misc_reg); + + MiscReg readMiscRegWithEffect(int misc_reg, Fault &fault); + + Fault setMiscReg(int misc_reg, const MiscReg &val); + + Fault setMiscRegWithEffect(int misc_reg, const MiscReg &val); + +#if FULL_SYSTEM + Fault hwrei(); + int readIntrFlag(); + void setIntrFlag(int val); + bool inPalMode(); + void trap(Fault fault); + bool simPalCheck(int palFunc); +#else + void syscall(); +#endif + + ListIt iqIt; + bool 
iqItValid; +}; + +/* +template +template +inline Fault +OzoneDynInst::read(Addr addr, T &data, unsigned flags) +{ + Fault fault = this->cpu->read(addr, data, flags, this); + + if (this->traceData) { + this->traceData->setAddr(addr); + this->traceData->setData(data); + } + + return fault; +} + +template +template +inline Fault +OzoneDynInst::write(T data, Addr addr, unsigned flags, uint64_t *res) +{ + Fault fault = this->cpu->write(data, addr, flags, res, this); + + this->storeSize = sizeof(T); + this->storeData = data; + + if (this->traceData) { + this->traceData->setAddr(addr); + this->traceData->setData(data); + } + + return fault; +} +*/ +#endif // __CPU_OZONE_DYN_INST_HH__ diff --git a/cpu/ozone/dyn_inst_impl.hh b/cpu/ozone/dyn_inst_impl.hh new file mode 100644 index 000000000..2d86ced62 --- /dev/null +++ b/cpu/ozone/dyn_inst_impl.hh @@ -0,0 +1,286 @@ +/* + * Copyright (c) 2005 The Regents of The University of Michigan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/faults.hh" +#include "arch/isa_traits.hh" +#include "config/full_system.hh" +#include "cpu/ozone/dyn_inst.hh" +#include "kern/kernel_stats.hh" + +using namespace TheISA; + +template +OzoneDynInst::OzoneDynInst(FullCPU *cpu) + : BaseDynInst(0, 0, 0, 0, cpu) +{ + this->setCompleted(); + + initInstPtrs(); +} + +template +OzoneDynInst::OzoneDynInst(ExtMachInst inst, Addr PC, Addr Pred_PC, + InstSeqNum seq_num, FullCPU *cpu) + : BaseDynInst(inst, PC, Pred_PC, seq_num, cpu) +{ + initInstPtrs(); +} + +template +OzoneDynInst::OzoneDynInst(StaticInstPtr _staticInst) + : BaseDynInst(_staticInst) +{ + initInstPtrs(); +} + +template +OzoneDynInst::~OzoneDynInst() +{ + DPRINTF(BE, "[sn:%lli] destructor called\n", this->seqNum); + for (int i = 0; i < this->numSrcRegs(); ++i) { + srcInsts[i] = NULL; + } + + for (int i = 0; i < this->numDestRegs(); ++i) { + prevDestInst[i] = NULL; + } + + dependents.clear(); +} + +template +Fault +OzoneDynInst::execute() +{ + // @todo: Pretty convoluted way to avoid squashing from happening when using + // the XC during an instruction's execution (specifically for instructions + // that have sideeffects that use the XC). Fix this. 
+ bool in_syscall = this->thread->inSyscall; + this->thread->inSyscall = true; + + this->fault = this->staticInst->execute(this, this->traceData); + + this->thread->inSyscall = in_syscall; + + return this->fault; +} + +template +Fault +OzoneDynInst::initiateAcc() +{ + // @todo: Pretty convoluted way to avoid squashing from happening when using + // the XC during an instruction's execution (specifically for instructions + // that have sideeffects that use the XC). Fix this. + bool in_syscall = this->thread->inSyscall; + this->thread->inSyscall = true; + + this->fault = this->staticInst->initiateAcc(this, this->traceData); + + this->thread->inSyscall = in_syscall; + + return this->fault; +} + +template +Fault +OzoneDynInst::completeAcc() +{ + if (this->isLoad()) { + this->fault = this->staticInst->completeAcc(this->req->data, + this, + this->traceData); + } else if (this->isStore()) { + this->fault = this->staticInst->completeAcc((uint8_t*)&this->req->result, + this, + this->traceData); + } else { + panic("Unknown type!"); + } + + return this->fault; +} + +template +bool +OzoneDynInst::srcInstReady(int regIdx) +{ + return srcInsts[regIdx]->isCompleted(); +} + +template +void +OzoneDynInst::addDependent(DynInstPtr &dependent_inst) +{ + dependents.push_back(dependent_inst); +} + +template +void +OzoneDynInst::wakeDependents() +{ + for (int i = 0; i < dependents.size(); ++i) { + dependents[i]->markSrcRegReady(); + } +} + +template +void +OzoneDynInst::initInstPtrs() +{ + for (int i = 0; i < MaxInstSrcRegs; ++i) { + srcInsts[i] = NULL; + } + iqItValid = false; +} + +template +bool +OzoneDynInst::srcsReady() +{ + for (int i = 0; i < this->numSrcRegs(); ++i) { + if (!srcInsts[i]->isCompleted()) + return false; + } + + return true; +} + +template +bool +OzoneDynInst::eaSrcsReady() +{ + for (int i = 1; i < this->numSrcRegs(); ++i) { + if (!srcInsts[i]->isCompleted()) + return false; + } + + return true; +} + +template +void +OzoneDynInst::clearDependents() +{ + 
dependents.clear(); + for (int i = 0; i < this->numSrcRegs(); ++i) { + srcInsts[i] = NULL; + } + for (int i = 0; i < this->numDestRegs(); ++i) { + prevDestInst[i] = NULL; + } +} +template +MiscReg +OzoneDynInst::readMiscReg(int misc_reg) +{ + return this->thread->readMiscReg(misc_reg); +} + +template +MiscReg +OzoneDynInst::readMiscRegWithEffect(int misc_reg, Fault &fault) +{ + return this->thread->readMiscRegWithEffect(misc_reg, fault); +} + +template +Fault +OzoneDynInst::setMiscReg(int misc_reg, const MiscReg &val) +{ + return this->thread->setMiscReg(misc_reg, val); +} + +template +Fault +OzoneDynInst::setMiscRegWithEffect(int misc_reg, const MiscReg &val) +{ + return this->thread->setMiscRegWithEffect(misc_reg, val); +} + +#if FULL_SYSTEM + +template +Fault +OzoneDynInst::hwrei() +{ + if (!this->cpu->inPalMode(this->readPC())) + return new AlphaISA::UnimplementedOpcodeFault; + + this->setNextPC(this->thread->readMiscReg(AlphaISA::IPR_EXC_ADDR)); + + this->cpu->kernelStats->hwrei(); + + this->cpu->checkInterrupts = true; + + // FIXME: XXX check for interrupts? 
XXX + return NoFault; +} + +template +int +OzoneDynInst::readIntrFlag() +{ +return this->cpu->readIntrFlag(); +} + +template +void +OzoneDynInst::setIntrFlag(int val) +{ + this->cpu->setIntrFlag(val); +} + +template +bool +OzoneDynInst::inPalMode() +{ + return this->cpu->inPalMode(); +} + +template +void +OzoneDynInst::trap(Fault fault) +{ + fault->invoke(this->thread->getXCProxy()); +} + +template +bool +OzoneDynInst::simPalCheck(int palFunc) +{ + return this->cpu->simPalCheck(palFunc); +} +#else +template +void +OzoneDynInst::syscall() +{ + this->cpu->syscall(); +} +#endif diff --git a/cpu/ozone/front_end.cc b/cpu/ozone/front_end.cc new file mode 100644 index 000000000..a974d43cb --- /dev/null +++ b/cpu/ozone/front_end.cc @@ -0,0 +1,7 @@ + +#include "cpu/ozone/front_end_impl.hh" +#include "cpu/ozone/ozone_impl.hh" +#include "cpu/ozone/simple_impl.hh" + +template class FrontEnd; +template class FrontEnd; diff --git a/cpu/ozone/front_end.hh b/cpu/ozone/front_end.hh new file mode 100644 index 000000000..5e257b506 --- /dev/null +++ b/cpu/ozone/front_end.hh @@ -0,0 +1,242 @@ + +#ifndef __CPU_OZONE_FRONT_END_HH__ +#define __CPU_OZONE_FRONT_END_HH__ + +#include + +//#include "cpu/ozone/cpu.hh" +#include "cpu/inst_seq.hh" +#include "cpu/o3/bpred_unit.hh" +#include "cpu/ozone/rename_table.hh" +//#include "cpu/ozone/thread_state.hh" +#include "mem/mem_req.hh" +#include "sim/eventq.hh" +#include "sim/stats.hh" + +class ExecContext; +class MemInterface; +template +class OzoneThreadState; +class PageTable; +template +class TimeBuffer; + +template +class FrontEnd +{ + public: + typedef typename Impl::Params Params; + typedef typename Impl::DynInst DynInst; + typedef typename Impl::DynInstPtr DynInstPtr; + typedef typename Impl::FullCPU FullCPU; + typedef typename Impl::BackEnd BackEnd; + + typedef typename Impl::FullCPU::OzoneXC OzoneXC; + typedef typename Impl::FullCPU::CommStruct CommStruct; + + FrontEnd(Params *params); + + std::string name() const; + + void setCPU(FullCPU 
*cpu_ptr) + { cpu = cpu_ptr; } + + void setBackEnd(BackEnd *back_end_ptr) + { backEnd = back_end_ptr; } + + void setCommBuffer(TimeBuffer *_comm); + + void setXC(ExecContext *xc_ptr); + + void setThreadState(OzoneThreadState *thread_ptr) + { thread = thread_ptr; } + + void regStats(); + + void tick(); + Fault fetchCacheLine(); + void processInst(DynInstPtr &inst); + void squash(const InstSeqNum &squash_num, const Addr &next_PC, + const bool is_branch = false, const bool branch_taken = false); + DynInstPtr getInst(); + + void processCacheCompletion(); + + void addFreeRegs(int num_freed); + + bool isEmpty() { return instBuffer.empty(); } + + private: + bool updateStatus(); + + void checkBE(); + DynInstPtr getInstFromCacheline(); + void renameInst(DynInstPtr &inst); + // Returns true if we need to stop the front end this cycle + bool processBarriers(DynInstPtr &inst); + + void handleFault(Fault &fault); + + // Align an address (typically a PC) to the start of an I-cache block. + // We fold in the PISA 64- to 32-bit conversion here as well. + Addr icacheBlockAlignPC(Addr addr) + { + addr = TheISA::realPCToFetchPC(addr); + return (addr & ~(cacheBlkMask)); + } + + InstSeqNum getAndIncrementInstSeq() + { return cpu->globalSeqNum++; } + + public: + FullCPU *cpu; + + BackEnd *backEnd; + + ExecContext *xc; + + OzoneThreadState *thread; + + enum Status { + Running, + Idle, + IcacheMissStall, + IcacheMissComplete, + SerializeBlocked, + SerializeComplete, + RenameBlocked, + BEBlocked + }; + + Status status; + + private: + TimeBuffer *comm; + typename TimeBuffer::wire fromCommit; + + typedef typename Impl::BranchPred BranchPred; + + // Typedef for semi-opaque type that holds any information the branch + // predictor needs to update itself. Only two fields are used outside of + // branch predictor, nextPC and isTaken. 
+// typedef typename BranchPred::BPredInfo BPredInfo; + + BranchPred branchPred; + + class ICacheCompletionEvent : public Event + { + private: + FrontEnd *frontEnd; + + public: + ICacheCompletionEvent(FrontEnd *_fe); + + virtual void process(); + virtual const char *description(); + }; + + ICacheCompletionEvent cacheCompletionEvent; + + MemInterface *icacheInterface; + +#if !FULL_SYSTEM + PageTable *pTable; +#endif + + MemReqPtr memReq; + + /** Mask to get a cache block's address. */ + Addr cacheBlkMask; + + unsigned cacheBlkSize; + + Addr cacheBlkPC; + + /** The cache line being fetched. */ + uint8_t *cacheData; + + bool fetchCacheLineNextCycle; + + bool cacheBlkValid; + + public: + RenameTable renameTable; + + private: + Addr PC; + Addr nextPC; + + public: + void setPC(Addr val) { PC = val; } + void setNextPC(Addr val) { nextPC = val; } + + void dumpInsts(); + + private: + typedef typename std::deque InstBuff; + typedef typename InstBuff::iterator InstBuffIt; + + InstBuff instBuffer; + + int instBufferSize; + + int maxInstBufferSize; + + int width; + + int freeRegs; + + int numPhysRegs; + + bool serializeNext; + + DynInstPtr barrierInst; + + // number of idle cycles +/* + Stats::Average<> notIdleFraction; + Stats::Formula idleFraction; +*/ + // @todo: Consider making these vectors and tracking on a per thread basis. + /** Stat for total number of cycles stalled due to an icache miss. */ + Stats::Scalar<> icacheStallCycles; + /** Stat for total number of fetched instructions. */ + Stats::Scalar<> fetchedInsts; + Stats::Scalar<> fetchedBranches; + /** Stat for total number of predicted branches. */ + Stats::Scalar<> predictedBranches; + /** Stat for total number of cycles spent fetching. */ + Stats::Scalar<> fetchCycles; + + Stats::Scalar<> fetchIdleCycles; + /** Stat for total number of cycles spent squashing. */ + Stats::Scalar<> fetchSquashCycles; + /** Stat for total number of cycles spent blocked due to other stages in + * the pipeline. 
+ */ + Stats::Scalar<> fetchBlockedCycles; + /** Stat for total number of fetched cache lines. */ + Stats::Scalar<> fetchedCacheLines; + /** Distribution of number of instructions fetched each cycle. */ + Stats::Distribution<> fetchNisnDist; +// Stats::Vector<> qfull_iq_occupancy; +// Stats::VectorDistribution<> qfull_iq_occ_dist_; + Stats::Formula idleRate; + Stats::Formula branchRate; + Stats::Formula fetchRate; + Stats::Scalar<> IFQCount; // cumulative IFQ occupancy + Stats::Formula IFQOccupancy; + Stats::Formula IFQLatency; + Stats::Scalar<> IFQFcount; // cumulative IFQ full count + Stats::Formula IFQFullRate; + + Stats::Scalar<> dispatchCountStat; + Stats::Scalar<> dispatchedSerializing; + Stats::Scalar<> dispatchedTempSerializing; + Stats::Scalar<> dispatchSerializeStallCycles; + Stats::Formula dispatchRate; + Stats::Formula regIntFull; + Stats::Formula regFpFull; +}; + +#endif // __CPU_OZONE_FRONT_END_HH__ diff --git a/cpu/ozone/front_end_impl.hh b/cpu/ozone/front_end_impl.hh new file mode 100644 index 000000000..0136d0ef0 --- /dev/null +++ b/cpu/ozone/front_end_impl.hh @@ -0,0 +1,798 @@ + +#include "arch/isa_traits.hh" +#include "base/statistics.hh" +#include "cpu/exec_context.hh" +#include "cpu/exetrace.hh" +#include "cpu/ozone/front_end.hh" +#include "mem/mem_interface.hh" +#include "sim/byte_swap.hh" + +using namespace TheISA; + +template +FrontEnd::FrontEnd(Params *params) + : branchPred(params), + cacheCompletionEvent(this), + icacheInterface(params->icacheInterface), + instBufferSize(0), + maxInstBufferSize(params->maxInstBufferSize), + width(params->frontEndWidth), + freeRegs(params->numPhysicalRegs), + numPhysRegs(params->numPhysicalRegs), + serializeNext(false) +{ + status = Idle; + + // Setup branch predictor. + + // Setup Memory Request + memReq = new MemReq(); + memReq->asid = 0; + memReq->data = new uint8_t[64]; + + // Size of cache block. + cacheBlkSize = icacheInterface ? 
icacheInterface->getBlockSize() : 64; + + assert(isPowerOf2(cacheBlkSize)); + + // Create mask to get rid of offset bits. + cacheBlkMask = (cacheBlkSize - 1); + + // Create space to store a cache line. + cacheData = new uint8_t[cacheBlkSize]; + + fetchCacheLineNextCycle = true; + + cacheBlkValid = false; + +#if !FULL_SYSTEM + pTable = params->pTable; +#endif +} + +template +std::string +FrontEnd::name() const +{ + return cpu->name() + ".frontend"; +} + +template +void +FrontEnd::setCommBuffer(TimeBuffer *_comm) +{ + comm = _comm; + // @todo: Hardcoded for now. Allow this to be set by a latency. + fromCommit = comm->getWire(-1); +} + +template +void +FrontEnd::setXC(ExecContext *xc_ptr) +{ + xc = xc_ptr; + memReq->xc = xc; +} + +template +void +FrontEnd::regStats() +{ + icacheStallCycles + .name(name() + ".icacheStallCycles") + .desc("Number of cycles fetch is stalled on an Icache miss") + .prereq(icacheStallCycles); + + fetchedInsts + .name(name() + ".fetchedInsts") + .desc("Number of instructions fetch has processed") + .prereq(fetchedInsts); + + fetchedBranches + .name(name() + ".fetchedBranches") + .desc("Number of fetched branches") + .prereq(fetchedBranches); + + predictedBranches + .name(name() + ".predictedBranches") + .desc("Number of branches that fetch has predicted taken") + .prereq(predictedBranches); + + fetchCycles + .name(name() + ".fetchCycles") + .desc("Number of cycles fetch has run and was not squashing or" + " blocked") + .prereq(fetchCycles); + + fetchIdleCycles + .name(name() + ".fetchIdleCycles") + .desc("Number of cycles fetch was idle") + .prereq(fetchIdleCycles); + + fetchSquashCycles + .name(name() + ".fetchSquashCycles") + .desc("Number of cycles fetch has spent squashing") + .prereq(fetchSquashCycles); + + fetchBlockedCycles + .name(name() + ".fetchBlockedCycles") + .desc("Number of cycles fetch has spent blocked") + .prereq(fetchBlockedCycles); + + fetchedCacheLines + .name(name() + ".fetchedCacheLines") + .desc("Number of cache lines 
fetched") + .prereq(fetchedCacheLines); + + fetchNisnDist + .init(/* base value */ 0, + /* last value */ width, + /* bucket size */ 1) + .name(name() + ".rateDist") + .desc("Number of instructions fetched each cycle (Total)") + .flags(Stats::pdf); + + idleRate + .name(name() + ".idleRate") + .desc("Percent of cycles fetch was idle") + .prereq(idleRate); + idleRate = fetchIdleCycles * 100 / cpu->numCycles; + + branchRate + .name(name() + ".branchRate") + .desc("Number of branch fetches per cycle") + .flags(Stats::total); + branchRate = fetchedBranches / cpu->numCycles; + + fetchRate + .name(name() + ".rate") + .desc("Number of inst fetches per cycle") + .flags(Stats::total); + fetchRate = fetchedInsts / cpu->numCycles; + + IFQCount + .name(name() + ".IFQ:count") + .desc("cumulative IFQ occupancy") + ; + + IFQFcount + .name(name() + ".IFQ:fullCount") + .desc("cumulative IFQ full count") + .flags(Stats::total) + ; + + IFQOccupancy + .name(name() + ".IFQ:occupancy") + .desc("avg IFQ occupancy (inst's)") + ; + IFQOccupancy = IFQCount / cpu->numCycles; + + IFQLatency + .name(name() + ".IFQ:latency") + .desc("avg IFQ occupant latency (cycle's)") + .flags(Stats::total) + ; + + IFQFullRate + .name(name() + ".IFQ:fullRate") + .desc("fraction of time (cycles) IFQ was full") + .flags(Stats::total); + ; + IFQFullRate = IFQFcount * Stats::constant(100) / cpu->numCycles; + + dispatchCountStat + .name(name() + ".DIS:count") + .desc("cumulative count of dispatched insts") + .flags(Stats::total) + ; + + dispatchedSerializing + .name(name() + ".DIS:serializingInsts") + .desc("count of serializing insts dispatched") + .flags(Stats::total) + ; + + dispatchedTempSerializing + .name(name() + ".DIS:tempSerializingInsts") + .desc("count of temporary serializing insts dispatched") + .flags(Stats::total) + ; + + dispatchSerializeStallCycles + .name(name() + ".DIS:serializeStallCycles") + .desc("count of cycles dispatch stalled for serializing inst") + .flags(Stats::total) + ; + + 
dispatchRate + .name(name() + ".DIS:rate") + .desc("dispatched insts per cycle") + .flags(Stats::total) + ; + dispatchRate = dispatchCountStat / cpu->numCycles; + + regIntFull + .name(name() + ".REG:int:full") + .desc("number of cycles where there were no INT registers") + ; + + regFpFull + .name(name() + ".REG:fp:full") + .desc("number of cycles where there were no FP registers") + ; + IFQLatency = IFQOccupancy / dispatchRate; + + branchPred.regStats(); +} + +template +void +FrontEnd::tick() +{ + // @todo: Maybe I want to just have direct communication... + if (fromCommit->doneSeqNum) { + branchPred.update(fromCommit->doneSeqNum, 0); + } + + IFQCount += instBufferSize; + IFQFcount += instBufferSize == maxInstBufferSize; + + // Fetch cache line + if (status == IcacheMissComplete) { + cacheBlkValid = true; + + status = Running; + if (barrierInst) + status = SerializeBlocked; + if (freeRegs <= 0) + status = RenameBlocked; + checkBE(); + } else if (status == IcacheMissStall) { + DPRINTF(FE, "Still in Icache miss stall.\n"); + icacheStallCycles++; + return; + } + + if (status == RenameBlocked || status == SerializeBlocked || + status == BEBlocked) { + // This might cause the front end to run even though it + // shouldn't, but this should only be a problem for one cycle. + // Also will cause a one cycle bubble between changing state + // and restarting. + DPRINTF(FE, "In blocked status.\n"); + + fetchBlockedCycles++; + + if (status == SerializeBlocked) { + dispatchSerializeStallCycles++; + } + updateStatus(); + return; + } else if (status != IcacheMissComplete) { + if (fetchCacheLineNextCycle) { + Fault fault = fetchCacheLine(); + if (fault != NoFault) { + handleFault(fault); + return; + } + fetchCacheLineNextCycle = false; + } + // If miss, stall until it returns. + if (status == IcacheMissStall) { + // Tell CPU to not tick me for now. + return; + } + } + + fetchCycles++; + + int num_inst = 0; + + // Otherwise loop and process instructions. 
+ // One way to hack infinite width is to set width and maxInstBufferSize + // both really high. Inelegant, but probably will work. + while (num_inst < width && + instBufferSize < maxInstBufferSize) { + // Get instruction from cache line. + DynInstPtr inst = getInstFromCacheline(); + + if (!inst) { + // PC is no longer in the cache line, end fetch. + // Might want to check this at the end of the cycle so that + // there's no cycle lost to checking for a new cache line. + DPRINTF(FE, "Need to get new cache line\n"); + fetchCacheLineNextCycle = true; + break; + } + + // if (generalizeFetch) { + processInst(inst); + + if (status == SerializeBlocked) { + break; + } + + // Possibly push into a time buffer that estimates the front end + // latency + instBuffer.push_back(inst); + ++instBufferSize; + ++num_inst; + // } else { + // fetch(num_inst); + // decode(num_inst); + // rename(num_inst); + // } + + if (inst->predTaken()) { + // Start over with tick? + break; + } else if (freeRegs <= 0) { + DPRINTF(FE, "Ran out of free registers to rename to!\n"); + status = RenameBlocked; + break; + } else if (serializeNext) { + break; + } + } + + fetchNisnDist.sample(num_inst); + checkBE(); + + DPRINTF(FE, "Num insts processed: %i, Inst Buffer size: %i, Free " + "Regs %i\n", num_inst, instBufferSize, freeRegs); +} + +template +Fault +FrontEnd::fetchCacheLine() +{ + // Read a cache line, based on the current PC. +#if FULL_SYSTEM + // Flag to say whether or not address is physical addr. + unsigned flags = cpu->inPalMode() ? PHYSICAL : 0; +#else + unsigned flags = 0; +#endif // FULL_SYSTEM + Fault fault = NoFault; + + // Align the fetch PC so it's at the start of a cache block. + Addr fetch_PC = icacheBlockAlignPC(PC); + + DPRINTF(FE, "Fetching cache line starting at %#x.\n", fetch_PC); + + // Setup the memReq to do a read of the first isntruction's address. + // Set the appropriate read size and flags as well. 
+ memReq->cmd = Read; + memReq->reset(fetch_PC, cacheBlkSize, flags); + + // Translate the instruction request. + fault = cpu->translateInstReq(memReq); + + // In the case of faults, the fetch stage may need to stall and wait + // on what caused the fetch (ITB or Icache miss). +// assert(fault == NoFault); + + // Now do the timing access to see whether or not the instruction + // exists within the cache. + if (icacheInterface && fault == NoFault) { + memReq->completionEvent = NULL; + + memReq->time = curTick; + + MemAccessResult res = icacheInterface->access(memReq); + + // If the cache missed then schedule an event to wake + // up this stage once the cache miss completes. + if (icacheInterface->doEvents() && res != MA_HIT) { + memReq->completionEvent = new ICacheCompletionEvent(this); + + status = IcacheMissStall; + + cacheBlkValid = false; + + DPRINTF(FE, "Cache miss.\n"); + } else { + DPRINTF(FE, "Cache hit.\n"); + + cacheBlkValid = true; + + memcpy(cacheData, memReq->data, memReq->size); + } + } + + // Note that this will set the cache block PC a bit earlier than it should + // be set. 
+ cacheBlkPC = fetch_PC; + + ++fetchedCacheLines; + + DPRINTF(FE, "Done fetching cache line.\n"); + + return fault; +} + +template +void +FrontEnd::processInst(DynInstPtr &inst) +{ + if (processBarriers(inst)) { + return; + } + + Addr inst_PC = inst->readPC(); + +// BPredInfo bp_info = branchPred.lookup(inst_PC); + if (!inst->isControl()) { + inst->setPredTarg(inst->readNextPC()); + } else { + fetchedBranches++; + if (branchPred.predict(inst, inst_PC, inst->threadNumber)) { + predictedBranches++; + } + } + + Addr next_PC = inst->readPredTarg(); + + DPRINTF(FE, "[sn:%lli] Predicted and processed inst PC %#x, next PC " + "%#x\n", inst->seqNum, inst_PC, next_PC); + +// inst->setNextPC(next_PC); +// inst->setBPredInfo(bp_info); + + // Not sure where I should set this + PC = next_PC; + + renameInst(inst); +} + +template +bool +FrontEnd::processBarriers(DynInstPtr &inst) +{ + if (serializeNext) { + inst->setSerializeBefore(); + serializeNext = false; + } else if (!inst->isSerializing()) { + return false; + } + + if (inst->isSerializeBefore() && !inst->isSerializeHandled()) { + DPRINTF(FE, "Serialize before instruction encountered.\n"); + + if (!inst->isTempSerializeBefore()) { + dispatchedSerializing++; + inst->setSerializeHandled(); + } else { + dispatchedTempSerializing++; + } + + // Change status over to BarrierStall so that other stages know + // what this is blocked on. + status = SerializeBlocked; + + barrierInst = inst; + return true; + } else if (inst->isSerializeAfter() && !inst->isSerializeHandled()) { + DPRINTF(FE, "Serialize after instruction encountered.\n"); + + inst->setSerializeHandled(); + + dispatchedSerializing++; + + serializeNext = true; + return false; + } + return false; +} + +template +void +FrontEnd::handleFault(Fault &fault) +{ + DPRINTF(FE, "Fault at fetch, telling commit\n"); + backEnd->fetchFault(fault); + // We're blocked on the back end until it handles this fault. 
+ status = BEBlocked; +} + +template +void +FrontEnd::squash(const InstSeqNum &squash_num, const Addr &next_PC, + const bool is_branch, const bool branch_taken) +{ + DPRINTF(FE, "Squashing from [sn:%lli], setting PC to %#x\n", + squash_num, next_PC); + + while (!instBuffer.empty() && + instBuffer.back()->seqNum > squash_num) { + DynInstPtr inst = instBuffer.back(); + + DPRINTF(FE, "Squashing instruction [sn:%lli] PC %#x\n", + inst->seqNum, inst->readPC()); + + inst->clearDependents(); + + instBuffer.pop_back(); + --instBufferSize; + + // Fix up branch predictor if necessary. +// branchPred.undo(inst->getBPredInfo()); + + freeRegs+= inst->numDestRegs(); + } + + // Copy over rename table from the back end. + renameTable.copyFrom(backEnd->renameTable); + + PC = next_PC; + + // Update BP with proper information. + if (is_branch) { + branchPred.squash(squash_num, next_PC, branch_taken, 0); + } else { + branchPred.squash(squash_num, 0); + } + + // Clear the icache miss if it's outstanding. + if (status == IcacheMissStall && icacheInterface) { + DPRINTF(FE, "Squashing outstanding Icache miss.\n"); + icacheInterface->squash(0); + } + + if (status == SerializeBlocked) { + assert(barrierInst->seqNum > squash_num); + barrierInst = NULL; + } + + // Unless this squash originated from the front end, we're probably + // in running mode now. + // Actually might want to make this latency dependent. + status = Running; + fetchCacheLineNextCycle = true; +} + +template +typename Impl::DynInstPtr +FrontEnd::getInst() +{ + if (instBufferSize == 0) { + return NULL; + } + + DynInstPtr inst = instBuffer.front(); + + instBuffer.pop_front(); + + --instBufferSize; + + dispatchCountStat++; + + return inst; +} + +template +void +FrontEnd::processCacheCompletion() +{ + DPRINTF(FE, "Processing cache completion\n"); + + // Do something here. 
+ if (status != IcacheMissStall) { + DPRINTF(FE, "Previous fetch was squashed.\n"); + return; + } + + status = IcacheMissComplete; + +/* if (checkStall(tid)) { + fetchStatus[tid] = Blocked; + } else { + fetchStatus[tid] = IcacheMissComplete; + } +*/ + memcpy(cacheData, memReq->data, memReq->size); + + // Reset the completion event to NULL. + memReq->completionEvent = NULL; +} + +template +void +FrontEnd::addFreeRegs(int num_freed) +{ + if (status == RenameBlocked && freeRegs + num_freed > 0) { + status = Running; + } + + freeRegs+= num_freed; + + assert(freeRegs <= numPhysRegs); +} + +template +bool +FrontEnd::updateStatus() +{ +// bool rename_block = freeRegs <= 0; + bool serialize_block = !backEnd->robEmpty() || instBufferSize; + bool be_block = cpu->decoupledFrontEnd ? false : backEnd->isBlocked(); + bool ret_val = false; +/* + // Should already be handled through addFreeRegs function + if (status == RenameBlocked && !rename_block) { + status = Running; + ret_val = true; + } +*/ + + if (status == SerializeBlocked && !serialize_block) { + status = SerializeComplete; + ret_val = true; + } + + if (status == BEBlocked && !be_block) { + if (barrierInst) { + status = SerializeBlocked; + } else { + status = Running; + } + ret_val = true; + } + return ret_val; +} + +template +void +FrontEnd::checkBE() +{ + bool be_block = cpu->decoupledFrontEnd ? 
false : backEnd->isBlocked(); + if (be_block) { + if (status == Running || status == Idle) { + status = BEBlocked; + } + } +} + +template +typename Impl::DynInstPtr +FrontEnd::getInstFromCacheline() +{ + if (status == SerializeComplete) { + DynInstPtr inst = barrierInst; + status = Running; + barrierInst = NULL; + return inst; + } + + InstSeqNum inst_seq; + MachInst inst; + // @todo: Fix this magic number used here to handle word offset (and + // getting rid of PAL bit) + unsigned offset = (PC & cacheBlkMask) & ~3; + + // PC of inst is not in this cache block + if (PC >= (cacheBlkPC + cacheBlkSize) || PC < cacheBlkPC || !cacheBlkValid) { +// DPRINTF(OoOCPU, "OoOCPU: PC is not in this cache block\n"); +// DPRINTF(OoOCPU, "OoOCPU: PC: %#x, cacheBlkPC: %#x, cacheBlkValid: %i", +// PC, cacheBlkPC, cacheBlkValid); +// panic("Instruction not in cache line or cache line invalid!"); + return NULL; + } + + ////////////////////////// + // Fetch one instruction + ////////////////////////// + + // Get a sequence number. + inst_seq = getAndIncrementInstSeq(); + + // Make sure this is a valid index. + assert(offset <= cacheBlkSize - sizeof(MachInst)); + + // Get the instruction from the array of the cache line. + inst = htog(*reinterpret_cast(&cacheData[offset])); + + ExtMachInst decode_inst = TheISA::makeExtMI(inst, PC); + + // Create a new DynInst from the instruction fetched. + DynInstPtr instruction = new DynInst(decode_inst, PC, PC+sizeof(MachInst), + inst_seq, cpu); + + instruction->setState(thread); + + DPRINTF(FE, "Instruction [sn:%lli] created, with PC %#x\n%s\n", + inst_seq, instruction->readPC(), + instruction->staticInst->disassemble(PC)); + + instruction->traceData = + Trace::getInstRecord(curTick, xc, cpu, + instruction->staticInst, + instruction->readPC(), 0); + + // Increment stat of fetched instructions. 
+ ++fetchedInsts; + + return instruction; +} + +template +void +FrontEnd::renameInst(DynInstPtr &inst) +{ + DynInstPtr src_inst = NULL; + int num_src_regs = inst->numSrcRegs(); + if (num_src_regs == 0) { + inst->setCanIssue(); + } else { + for (int i = 0; i < num_src_regs; ++i) { + src_inst = renameTable[inst->srcRegIdx(i)]; + + inst->setSrcInst(src_inst, i); + + DPRINTF(FE, "[sn:%lli]: Src reg %i is inst [sn:%lli]\n", + inst->seqNum, (int)inst->srcRegIdx(i), src_inst->seqNum); + + if (src_inst->isCompleted()) { + DPRINTF(FE, "Reg ready.\n"); + inst->markSrcRegReady(i); + } else { + DPRINTF(FE, "Adding to dependent list.\n"); + src_inst->addDependent(inst); + } + } + } + + for (int i = 0; i < inst->numDestRegs(); ++i) { + RegIndex idx = inst->destRegIdx(i); + + DPRINTF(FE, "Dest reg %i is now inst [sn:%lli], was previously " + "[sn:%lli]\n", + (int)inst->destRegIdx(i), inst->seqNum, + renameTable[idx]->seqNum); + + inst->setPrevDestInst(renameTable[idx], i); + + renameTable[idx] = inst; + --freeRegs; + } +} + +template +void +FrontEnd::dumpInsts() +{ + cprintf("instBuffer size: %i\n", instBuffer.size()); + + InstBuffIt buff_it = instBuffer.begin(); + + for (int num = 0; buff_it != instBuffer.end(); num++) { + cprintf("Instruction:%i\nPC:%#x\n[tid:%i]\n[sn:%lli]\nIssued:%i\n" + "Squashed:%i\n\n", + num, (*buff_it)->readPC(), (*buff_it)->threadNumber, + (*buff_it)->seqNum, (*buff_it)->isIssued(), + (*buff_it)->isSquashed()); + buff_it++; + } + +} + +template +FrontEnd::ICacheCompletionEvent::ICacheCompletionEvent(FrontEnd *fe) + : Event(&mainEventQueue, Delayed_Writeback_Pri), frontEnd(fe) +{ + this->setFlags(Event::AutoDelete); +} + +template +void +FrontEnd::ICacheCompletionEvent::process() +{ + frontEnd->processCacheCompletion(); +} + +template +const char * +FrontEnd::ICacheCompletionEvent::description() +{ + return "ICache completion event"; +} diff --git a/cpu/ozone/inorder_back_end.cc b/cpu/ozone/inorder_back_end.cc new file mode 100644 index 
000000000..14db610d2 --- /dev/null +++ b/cpu/ozone/inorder_back_end.cc @@ -0,0 +1,5 @@ + +#include "cpu/ozone/inorder_back_end_impl.hh" +#include "cpu/ozone/simple_impl.hh" + +template class InorderBackEnd; diff --git a/cpu/ozone/inorder_back_end.hh b/cpu/ozone/inorder_back_end.hh new file mode 100644 index 000000000..e621f6c01 --- /dev/null +++ b/cpu/ozone/inorder_back_end.hh @@ -0,0 +1,417 @@ + +#ifndef __CPU_OZONE_INORDER_BACK_END_HH__ +#define __CPU_OZONE_INORDER_BACK_END_HH__ + +#include + +#include "arch/faults.hh" +#include "base/timebuf.hh" +#include "cpu/exec_context.hh" +#include "cpu/inst_seq.hh" +#include "cpu/ozone/rename_table.hh" +#include "cpu/ozone/thread_state.hh" +#include "mem/mem_interface.hh" +#include "mem/mem_req.hh" +#include "sim/eventq.hh" + +template +class InorderBackEnd +{ + public: + typedef typename Impl::Params Params; + typedef typename Impl::DynInstPtr DynInstPtr; + typedef typename Impl::FullCPU FullCPU; + typedef typename Impl::FrontEnd FrontEnd; + + typedef typename FullCPU::OzoneXC OzoneXC; + typedef typename Impl::FullCPU::CommStruct CommStruct; + + InorderBackEnd(Params *params); + + std::string name() const; + + void setCPU(FullCPU *cpu_ptr) + { cpu = cpu_ptr; } + + void setFrontEnd(FrontEnd *front_end_ptr) + { frontEnd = front_end_ptr; } + + void setCommBuffer(TimeBuffer *_comm) + { comm = _comm; } + + void setXC(ExecContext *xc_ptr); + + void setThreadState(OzoneThreadState *thread_ptr); + + void regStats() { } + +#if FULL_SYSTEM + void checkInterrupts(); +#endif + + void tick(); + void executeInsts(); + void squash(const InstSeqNum &squash_num, const Addr &next_PC); + + void squashFromXC(); + + bool robEmpty() { return instList.empty(); } + + bool isFull() { return false; } + bool isBlocked() { return status == DcacheMissStoreStall || + status == DcacheMissLoadStall || + interruptBlocked; } + + void fetchFault(Fault &fault); + + void dumpInsts(); + + private: + void handleFault(); + + void setSquashInfoFromXC(); + + bool 
squashPending; + InstSeqNum squashSeqNum; + Addr squashNextPC; + + Fault faultFromFetch; + + bool interruptBlocked; + + public: + template + Fault read(Addr addr, T &data, unsigned flags); + + template + Fault read(MemReqPtr &req, T &data, int load_idx); + + template + Fault write(T data, Addr addr, unsigned flags, uint64_t *res); + + template + Fault write(MemReqPtr &req, T &data, int store_idx); + + Addr readCommitPC() { return commitPC; } + + Addr commitPC; + + public: + FullCPU *cpu; + + FrontEnd *frontEnd; + + ExecContext *xc; + + OzoneThreadState *thread; + + RenameTable renameTable; + + protected: + enum Status { + Running, + Idle, + DcacheMissLoadStall, + DcacheMissStoreStall, + DcacheMissComplete, + Blocked + }; + + Status status; + + class DCacheCompletionEvent : public Event + { + private: + InorderBackEnd *be; + + public: + DCacheCompletionEvent(InorderBackEnd *_be); + + virtual void process(); + virtual const char *description(); + + DynInstPtr inst; + }; + + friend class DCacheCompletionEvent; + + DCacheCompletionEvent cacheCompletionEvent; + + MemInterface *dcacheInterface; + + MemReqPtr memReq; + + private: + typedef typename std::list::iterator InstListIt; + + std::list instList; + + // General back end width. Used if the more specific isn't given. 
+ int width; + + int latency; + + int squashLatency; + + TimeBuffer numInstsToWB; + TimeBuffer::wire instsAdded; + TimeBuffer::wire instsToExecute; + + TimeBuffer *comm; + // number of cycles stalled for D-cache misses + Stats::Scalar<> dcacheStallCycles; + Counter lastDcacheStall; +}; + +template +template +Fault +InorderBackEnd::read(Addr addr, T &data, unsigned flags) +{ + memReq->reset(addr, sizeof(T), flags); + + // translate to physical address + Fault fault = cpu->translateDataReadReq(memReq); + + // if we have a cache, do cache access too + if (fault == NoFault && dcacheInterface) { + memReq->cmd = Read; + memReq->completionEvent = NULL; + memReq->time = curTick; + memReq->flags &= ~INST_READ; + MemAccessResult result = dcacheInterface->access(memReq); + + // Ugly hack to get an event scheduled *only* if the access is + // a miss. We really should add first-class support for this + // at some point. + if (result != MA_HIT) { + // Fix this hack for keeping funcExeInst correct with loads that + // are executed twice. 
+ memReq->completionEvent = &cacheCompletionEvent; + lastDcacheStall = curTick; +// unscheduleTickEvent(); + status = DcacheMissLoadStall; + DPRINTF(IBE, "Dcache miss stall!\n"); + } else { + // do functional access + DPRINTF(IBE, "Dcache hit!\n"); + } + } +/* + if (!dcacheInterface && (memReq->flags & UNCACHEABLE)) + recordEvent("Uncached Read"); +*/ + return fault; +} +#if 0 +template +template +Fault +InorderBackEnd::read(MemReqPtr &req, T &data) +{ +#if FULL_SYSTEM && defined(TARGET_ALPHA) + if (req->flags & LOCKED) { + req->xc->setMiscReg(TheISA::Lock_Addr_DepTag, req->paddr); + req->xc->setMiscReg(TheISA::Lock_Flag_DepTag, true); + } +#endif + + Fault error; + error = thread->mem->read(req, data); + data = LittleEndianGuest::gtoh(data); + return error; +} +#endif + +template +template +Fault +InorderBackEnd::write(T data, Addr addr, unsigned flags, uint64_t *res) +{ + memReq->reset(addr, sizeof(T), flags); + + // translate to physical address + Fault fault = cpu->translateDataWriteReq(memReq); + + if (fault == NoFault && dcacheInterface) { + memReq->cmd = Write; +// memcpy(memReq->data,(uint8_t *)&data,memReq->size); + memReq->completionEvent = NULL; + memReq->time = curTick; + memReq->flags &= ~INST_READ; + MemAccessResult result = dcacheInterface->access(memReq); + + // Ugly hack to get an event scheduled *only* if the access is + // a miss. We really should add first-class support for this + // at some point. 
+ if (result != MA_HIT) { + memReq->completionEvent = &cacheCompletionEvent; + lastDcacheStall = curTick; +// unscheduleTickEvent(); + status = DcacheMissStoreStall; + DPRINTF(IBE, "Dcache miss stall!\n"); + } else { + DPRINTF(IBE, "Dcache hit!\n"); + } + } + + if (res && (fault == NoFault)) + *res = memReq->result; +/* + if (!dcacheInterface && (memReq->flags & UNCACHEABLE)) + recordEvent("Uncached Write"); +*/ + return fault; +} +#if 0 +template +template +Fault +InorderBackEnd::write(MemReqPtr &req, T &data) +{ +#if FULL_SYSTEM && defined(TARGET_ALPHA) + ExecContext *xc; + + // If this is a store conditional, act appropriately + if (req->flags & LOCKED) { + xc = req->xc; + + if (req->flags & UNCACHEABLE) { + // Don't update result register (see stq_c in isa_desc) + req->result = 2; + xc->setStCondFailures(0);//Needed? [RGD] + } else { + bool lock_flag = xc->readMiscReg(TheISA::Lock_Flag_DepTag); + Addr lock_addr = xc->readMiscReg(TheISA::Lock_Addr_DepTag); + req->result = lock_flag; + if (!lock_flag || + ((lock_addr & ~0xf) != (req->paddr & ~0xf))) { + xc->setMiscReg(TheISA::Lock_Flag_DepTag, false); + xc->setStCondFailures(xc->readStCondFailures() + 1); + if (((xc->readStCondFailures()) % 100000) == 0) { + std::cerr << "Warning: " + << xc->readStCondFailures() + << " consecutive store conditional failures " + << "on cpu " << req->xc->readCpuId() + << std::endl; + } + return NoFault; + } + else xc->setStCondFailures(0); + } + } + + // Need to clear any locked flags on other proccessors for + // this address. Only do this for succsful Store Conditionals + // and all other stores (WH64?). Unsuccessful Store + // Conditionals would have returned above, and wouldn't fall + // through. 
+ for (int i = 0; i < cpu->system->execContexts.size(); i++){ + xc = cpu->system->execContexts[i]; + if ((xc->readMiscReg(TheISA::Lock_Addr_DepTag) & ~0xf) == + (req->paddr & ~0xf)) { + xc->setMiscReg(TheISA::Lock_Flag_DepTag, false); + } + } + +#endif + return thread->mem->write(req, (T)LittleEndianGuest::htog(data)); +} +#endif + +template +template +Fault +InorderBackEnd::read(MemReqPtr &req, T &data, int load_idx) +{ +// panic("Unimplemented!"); +// memReq->reset(addr, sizeof(T), flags); + + // translate to physical address +// Fault fault = cpu->translateDataReadReq(req); + + // if we have a cache, do cache access too + if (dcacheInterface) { + req->cmd = Read; + req->completionEvent = NULL; + req->data = new uint8_t[64]; + req->time = curTick; + req->flags &= ~INST_READ; + MemAccessResult result = dcacheInterface->access(req); + + // Ugly hack to get an event scheduled *only* if the access is + // a miss. We really should add first-class support for this + // at some point. + if (result != MA_HIT) { + req->completionEvent = &cacheCompletionEvent; + lastDcacheStall = curTick; +// unscheduleTickEvent(); + status = DcacheMissLoadStall; + DPRINTF(IBE, "Dcache miss load stall!\n"); + } else { + DPRINTF(IBE, "Dcache hit!\n"); + + } + } + +/* + if (!dcacheInterface && (req->flags & UNCACHEABLE)) + recordEvent("Uncached Read"); +*/ + return NoFault; +} + +template +template +Fault +InorderBackEnd::write(MemReqPtr &req, T &data, int store_idx) +{ +// req->reset(addr, sizeof(T), flags); + + // translate to physical address +// Fault fault = cpu->translateDataWriteReq(req); + + if (dcacheInterface) { + req->cmd = Write; + req->data = new uint8_t[64]; + memcpy(req->data,(uint8_t *)&data,req->size); + req->completionEvent = NULL; + req->time = curTick; + req->flags &= ~INST_READ; + MemAccessResult result = dcacheInterface->access(req); + + // Ugly hack to get an event scheduled *only* if the access is + // a miss. 
We really should add first-class support for this + // at some point. + if (result != MA_HIT) { + req->completionEvent = &cacheCompletionEvent; + lastDcacheStall = curTick; +// unscheduleTickEvent(); + status = DcacheMissStoreStall; + DPRINTF(IBE, "Dcache miss store stall!\n"); + } else { + DPRINTF(IBE, "Dcache hit!\n"); + + } + } + + if (req->flags & LOCKED) { + if (req->flags & UNCACHEABLE) { + // Don't update result register (see stq_c in isa_desc) + req->result = 2; + } else { + req->result = 1; + } + } +/* + if (res && (fault == NoFault)) + *res = req->result; + */ +/* + if (!dcacheInterface && (req->flags & UNCACHEABLE)) + recordEvent("Uncached Write"); +*/ + return NoFault; +} + +#endif // __CPU_OZONE_INORDER_BACK_END_HH__ diff --git a/cpu/ozone/inorder_back_end_impl.hh b/cpu/ozone/inorder_back_end_impl.hh new file mode 100644 index 000000000..5a378ec76 --- /dev/null +++ b/cpu/ozone/inorder_back_end_impl.hh @@ -0,0 +1,519 @@ + +#include "arch/faults.hh" +#include "arch/isa_traits.hh" +#include "cpu/ozone/inorder_back_end.hh" +#include "cpu/ozone/thread_state.hh" + +using namespace TheISA; + +template +InorderBackEnd::InorderBackEnd(Params *params) + : squashPending(false), + squashSeqNum(0), + squashNextPC(0), + faultFromFetch(NoFault), + interruptBlocked(false), + cacheCompletionEvent(this), + dcacheInterface(params->dcacheInterface), + width(params->backEndWidth), + latency(params->backEndLatency), + squashLatency(params->backEndSquashLatency), + numInstsToWB(0, latency + 1) +{ + instsAdded = numInstsToWB.getWire(latency); + instsToExecute = numInstsToWB.getWire(0); + + memReq = new MemReq; + memReq->data = new uint8_t[64]; + status = Running; +} + +template +std::string +InorderBackEnd::name() const +{ + return cpu->name() + ".inorderbackend"; +} + +template +void +InorderBackEnd::setXC(ExecContext *xc_ptr) +{ + xc = xc_ptr; + memReq->xc = xc; +} + +template +void +InorderBackEnd::setThreadState(OzoneThreadState *thread_ptr) +{ + thread = thread_ptr; + 
thread->setFuncExeInst(0); +} + +#if FULL_SYSTEM +template +void +InorderBackEnd::checkInterrupts() +{ + //Check if there are any outstanding interrupts + //Handle the interrupts + int ipl = 0; + int summary = 0; + + cpu->checkInterrupts = false; + + if (thread->readMiscReg(IPR_ASTRR)) + panic("asynchronous traps not implemented\n"); + + if (thread->readMiscReg(IPR_SIRR)) { + for (int i = INTLEVEL_SOFTWARE_MIN; + i < INTLEVEL_SOFTWARE_MAX; i++) { + if (thread->readMiscReg(IPR_SIRR) & (ULL(1) << i)) { + // See table 4-19 of the 21164 hardware reference + ipl = (i - INTLEVEL_SOFTWARE_MIN) + 1; + summary |= (ULL(1) << i); + } + } + } + + uint64_t interrupts = cpu->intr_status(); + + if (interrupts) { + for (int i = INTLEVEL_EXTERNAL_MIN; + i < INTLEVEL_EXTERNAL_MAX; i++) { + if (interrupts & (ULL(1) << i)) { + // See table 4-19 of the 21164 hardware reference + ipl = i; + summary |= (ULL(1) << i); + } + } + } + + if (ipl && ipl > thread->readMiscReg(IPR_IPLR)) { + thread->inSyscall = true; + + thread->setMiscReg(IPR_ISR, summary); + thread->setMiscReg(IPR_INTID, ipl); + Fault(new InterruptFault)->invoke(xc); + DPRINTF(Flow, "Interrupt! IPLR=%d ipl=%d summary=%x\n", + thread->readMiscReg(IPR_IPLR), ipl, summary); + + // May need to go 1 inst prior + squashPending = true; + + thread->inSyscall = false; + + setSquashInfoFromXC(); + } +} +#endif + +template +void +InorderBackEnd::tick() +{ + // Squash due to an external source + // Not sure if this or an interrupt has higher priority + if (squashPending) { + squash(squashSeqNum, squashNextPC); + return; + } + + // if (interrupt) then set thread PC, stall front end, record that + // I'm waiting for it to drain. 
(for now just squash) +#if FULL_SYSTEM + if (interruptBlocked || + (cpu->checkInterrupts && + cpu->check_interrupts() && + !cpu->inPalMode())) { + if (!robEmpty()) { + interruptBlocked = true; + } else if (robEmpty() && cpu->inPalMode()) { + // Will need to let the front end continue a bit until + // we're out of pal mode. Hopefully we never get into an + // infinite loop... + interruptBlocked = false; + } else { + interruptBlocked = false; + checkInterrupts(); + return; + } + } +#endif + + if (status != DcacheMissLoadStall && + status != DcacheMissStoreStall) { + for (int i = 0; i < width && (*instsAdded) < width; ++i) { + DynInstPtr inst = frontEnd->getInst(); + + if (!inst) + break; + + instList.push_back(inst); + + (*instsAdded)++; + } + +#if FULL_SYSTEM + if (faultFromFetch && robEmpty() && frontEnd->isEmpty()) { + handleFault(); + } else { + executeInsts(); + } +#else + executeInsts(); +#endif + } +} + +template +void +InorderBackEnd::executeInsts() +{ + bool completed_last_inst = true; + int insts_to_execute = *instsToExecute; + int freed_regs = 0; + + while (insts_to_execute > 0) { + assert(!instList.empty()); + DynInstPtr inst = instList.front(); + + commitPC = inst->readPC(); + + thread->setPC(commitPC); + thread->setNextPC(inst->readNextPC()); + +#if FULL_SYSTEM + int count = 0; + Addr oldpc; + do { + if (count == 0) + assert(!thread->inSyscall && !thread->trapPending); + oldpc = thread->readPC(); + cpu->system->pcEventQueue.service( + thread->getXCProxy()); + count++; + } while (oldpc != thread->readPC()); + if (count > 1) { + DPRINTF(IBE, "PC skip function event, stopping commit\n"); + completed_last_inst = false; + squashPending = true; + break; + } +#endif + + Fault inst_fault = NoFault; + + if (status == DcacheMissComplete) { + DPRINTF(IBE, "Completing inst [sn:%lli]\n", inst->seqNum); + status = Running; + } else if (inst->isMemRef() && status != DcacheMissComplete && + (!inst->isDataPrefetch() && !inst->isInstPrefetch())) { + DPRINTF(IBE, 
"Initiating mem op inst [sn:%lli] PC: %#x\n", + inst->seqNum, inst->readPC()); + + cacheCompletionEvent.inst = inst; + inst_fault = inst->initiateAcc(); + if (inst_fault == NoFault && + status != DcacheMissLoadStall && + status != DcacheMissStoreStall) { + inst_fault = inst->completeAcc(); + } + ++thread->funcExeInst; + } else { + DPRINTF(IBE, "Executing inst [sn:%lli] PC: %#x\n", + inst->seqNum, inst->readPC()); + inst_fault = inst->execute(); + ++thread->funcExeInst; + } + + // Will need to be able to break this loop in case the load + // misses. Split access/complete ops would be useful here + // with writeback events. + if (status == DcacheMissLoadStall) { + *instsToExecute = insts_to_execute; + + completed_last_inst = false; + break; + } else if (status == DcacheMissStoreStall) { + // Figure out how to fix this hack. Probably have DcacheMissLoad + // vs DcacheMissStore. + *instsToExecute = insts_to_execute; + completed_last_inst = false; +/* + instList.pop_front(); + --insts_to_execute; + if (inst->traceData) { + inst->traceData->finalize(); + } +*/ + + // Don't really need to stop for a store stall as long as + // the memory system is able to handle store forwarding + // and such. Breaking out might help avoid the cache + // interface becoming blocked. + break; + } + + inst->setExecuted(); + inst->setCompleted(); + inst->setCanCommit(); + + instList.pop_front(); + + --insts_to_execute; + --(*instsToExecute); + + if (inst->traceData) { + inst->traceData->finalize(); + inst->traceData = NULL; + } + + if (inst_fault != NoFault) { +#if FULL_SYSTEM + DPRINTF(IBE, "Inst [sn:%lli] PC %#x has a fault\n", + inst->seqNum, inst->readPC()); + + assert(!thread->inSyscall); + + thread->inSyscall = true; + + // Hack for now; DTB will sometimes need the machine instruction + // for when faults happen. So we will set it here, prior to the + // DTB possibly needing it for this translation. 
+ thread->setInst( + static_cast(inst->staticInst->machInst)); + + // Consider holding onto the trap and waiting until the trap event + // happens for this to be executed. + inst_fault->invoke(xc); + + // Exit state update mode to avoid accidental updating. + thread->inSyscall = false; + + squashPending = true; + + // Generate trap squash event. +// generateTrapEvent(tid); + completed_last_inst = false; + break; +#else // !FULL_SYSTEM + panic("fault (%d) detected @ PC %08p", inst_fault, + inst->PC); +#endif // FULL_SYSTEM + } + + for (int i = 0; i < inst->numDestRegs(); ++i) { + renameTable[inst->destRegIdx(i)] = inst; + thread->renameTable[inst->destRegIdx(i)] = inst; + ++freed_regs; + } + + inst->clearDependents(); + + comm->access(0)->doneSeqNum = inst->seqNum; + + if (inst->mispredicted()) { + squash(inst->seqNum, inst->readNextPC()); + + thread->setNextPC(inst->readNextPC()); + + break; + } else if (squashPending) { + // Something external happened that caused the CPU to squash. + // Break out of commit and handle the squash next cycle. + break; + } + // If it didn't mispredict, then it executed fine. Send back its + // registers and BP info? What about insts that may still have + // latency, like loads? Probably can send back the information after + // it is completed. + + // keep an instruction count + cpu->numInst++; + thread->numInsts++; + } + + frontEnd->addFreeRegs(freed_regs); + + assert(insts_to_execute >= 0); + + // Should only advance this if I have executed all instructions. + if (insts_to_execute == 0) { + numInstsToWB.advance(); + } + + // Should I set the PC to the next PC here? What do I set next PC to? 
+ if (completed_last_inst) { + thread->setPC(thread->readNextPC()); + thread->setNextPC(thread->readPC() + sizeof(MachInst)); + } + + if (squashPending) { + setSquashInfoFromXC(); + } +} + +template +void +InorderBackEnd::handleFault() +{ + DPRINTF(Commit, "Handling fault from fetch\n"); + + assert(!thread->inSyscall); + + thread->inSyscall = true; + + // Consider holding onto the trap and waiting until the trap event + // happens for this to be executed. + faultFromFetch->invoke(xc); + + // Exit state update mode to avoid accidental updating. + thread->inSyscall = false; + + squashPending = true; + + setSquashInfoFromXC(); +} + +template +void +InorderBackEnd::squash(const InstSeqNum &squash_num, const Addr &next_PC) +{ + DPRINTF(IBE, "Squashing from [sn:%lli], setting PC to %#x\n", + squash_num, next_PC); + + InstListIt squash_it = --(instList.end()); + + int freed_regs = 0; + + while (!instList.empty() && (*squash_it)->seqNum > squash_num) { + DynInstPtr inst = *squash_it; + + DPRINTF(IBE, "Squashing instruction PC %#x, [sn:%lli].\n", + inst->readPC(), + inst->seqNum); + + // May cause problems with misc regs + freed_regs+= inst->numDestRegs(); + inst->clearDependents(); + squash_it--; + instList.pop_back(); + } + + frontEnd->addFreeRegs(freed_regs); + + for (int i = 0; i < latency+1; ++i) { + numInstsToWB.advance(); + } + + squashPending = false; + + // Probably want to make sure that this squash is the one that set the + // thread into inSyscall mode. + thread->inSyscall = false; + + // Tell front end to squash, reset PC to new one. + frontEnd->squash(squash_num, next_PC); + + faultFromFetch = NULL; +} + +template +void +InorderBackEnd::squashFromXC() +{ + // Record that I need to squash + squashPending = true; + + thread->inSyscall = true; +} + +template +void +InorderBackEnd::setSquashInfoFromXC() +{ + // Need to handle the case of the instList being empty. In that case + // probably any number works, except maybe with stores in the store buffer. 
+ squashSeqNum = instList.empty() ? 0 : instList.front()->seqNum - 1; + + squashNextPC = thread->PC; +} + +template +void +InorderBackEnd::fetchFault(Fault &fault) +{ + faultFromFetch = fault; +} + +template +void +InorderBackEnd::dumpInsts() +{ + int num = 0; + int valid_num = 0; + + InstListIt inst_list_it = instList.begin(); + + cprintf("Inst list size: %i\n", instList.size()); + + while (inst_list_it != instList.end()) + { + cprintf("Instruction:%i\n", + num); + if (!(*inst_list_it)->isSquashed()) { + if (!(*inst_list_it)->isIssued()) { + ++valid_num; + cprintf("Count:%i\n", valid_num); + } else if ((*inst_list_it)->isMemRef() && + !(*inst_list_it)->memOpDone) { + // Loads that have not been marked as executed still count + // towards the total instructions. + ++valid_num; + cprintf("Count:%i\n", valid_num); + } + } + + cprintf("PC:%#x\n[sn:%lli]\n[tid:%i]\n" + "Issued:%i\nSquashed:%i\n", + (*inst_list_it)->readPC(), + (*inst_list_it)->seqNum, + (*inst_list_it)->threadNumber, + (*inst_list_it)->isIssued(), + (*inst_list_it)->isSquashed()); + + if ((*inst_list_it)->isMemRef()) { + cprintf("MemOpDone:%i\n", (*inst_list_it)->memOpDone); + } + + cprintf("\n"); + + inst_list_it++; + ++num; + } +} + +template +InorderBackEnd::DCacheCompletionEvent::DCacheCompletionEvent( + InorderBackEnd *_be) + : Event(&mainEventQueue, CPU_Tick_Pri), be(_be) +{ +// this->setFlags(Event::AutoDelete); +} + +template +void +InorderBackEnd::DCacheCompletionEvent::process() +{ + inst->completeAcc(); + be->status = DcacheMissComplete; +} + +template +const char * +InorderBackEnd::DCacheCompletionEvent::description() +{ + return "DCache completion event"; +} diff --git a/cpu/ozone/inst_queue.cc b/cpu/ozone/inst_queue.cc new file mode 100644 index 000000000..9c61602d9 --- /dev/null +++ b/cpu/ozone/inst_queue.cc @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2004-2005 The Regents of The University of Michigan + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "cpu/ozone/dyn_inst.hh" +#include "cpu/ozone/ozone_impl.hh" +#include "cpu/ozone/simple_impl.hh" +#include "cpu/ozone/inst_queue_impl.hh" + +// Force instantiation of InstructionQueue. 
+template class InstQueue; +template class InstQueue; diff --git a/cpu/ozone/inst_queue.hh b/cpu/ozone/inst_queue.hh new file mode 100644 index 000000000..2cbbb7987 --- /dev/null +++ b/cpu/ozone/inst_queue.hh @@ -0,0 +1,506 @@ +/* + * Copyright (c) 2004-2005 The Regents of The University of Michigan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __CPU_OZONE_INST_QUEUE_HH__ +#define __CPU_OZONE_INST_QUEUE_HH__ + +#include +#include +#include +#include + +#include "base/statistics.hh" +#include "base/timebuf.hh" +#include "cpu/inst_seq.hh" +#include "sim/host.hh" + +class FUPool; +class MemInterface; + +/** + * A standard instruction queue class. It holds ready instructions, in + * order, in seperate priority queues to facilitate the scheduling of + * instructions. The IQ uses a separate linked list to track dependencies. + * Similar to the rename map and the free list, it expects that + * floating point registers have their indices start after the integer + * registers (ie with 96 int and 96 fp registers, regs 0-95 are integer + * and 96-191 are fp). This remains true even for both logical and + * physical register indices. The IQ depends on the memory dependence unit to + * track when memory operations are ready in terms of ordering; register + * dependencies are tracked normally. Right now the IQ also handles the + * execution timing; this is mainly to allow back-to-back scheduling without + * requiring IEW to be able to peek into the IQ. At the end of the execution + * latency, the instruction is put into the queue to execute, where it will + * have the execute() function called on it. + * @todo: Make IQ able to handle multiple FU pools. + */ +template +class InstQueue +{ + public: + //Typedefs from the Impl. + typedef typename Impl::FullCPU FullCPU; + typedef typename Impl::DynInstPtr DynInstPtr; + typedef typename Impl::Params Params; + typedef typename Impl::IssueStruct IssueStruct; +/* + typedef typename Impl::CPUPol::IEW IEW; + typedef typename Impl::CPUPol::MemDepUnit MemDepUnit; + typedef typename Impl::CPUPol::IssueStruct IssueStruct; + typedef typename Impl::CPUPol::TimeStruct TimeStruct; +*/ + // Typedef of iterator through the list of instructions. + typedef typename std::list::iterator ListIt; + + friend class Impl::FullCPU; +#if 0 + /** FU completion event class. 
*/ + class FUCompletion : public Event { + private: + /** Executing instruction. */ + DynInstPtr inst; + + /** Index of the FU used for executing. */ + int fuIdx; + + /** Pointer back to the instruction queue. */ + InstQueue *iqPtr; + + public: + /** Construct a FU completion event. */ + FUCompletion(DynInstPtr &_inst, int fu_idx, + InstQueue *iq_ptr); + + virtual void process(); + virtual const char *description(); + }; +#endif + /** Constructs an IQ. */ + InstQueue(Params *params); + + /** Destructs the IQ. */ + ~InstQueue(); + + /** Returns the name of the IQ. */ + std::string name() const; + + /** Registers statistics. */ + void regStats(); + + /** Sets CPU pointer. */ + void setCPU(FullCPU *_cpu) { cpu = _cpu; } +#if 0 + /** Sets active threads list. */ + void setActiveThreads(list *at_ptr); + + /** Sets the IEW pointer. */ + void setIEW(IEW *iew_ptr) { iewStage = iew_ptr; } +#endif + /** Sets the timer buffer between issue and execute. */ + void setIssueToExecuteQueue(TimeBuffer *i2eQueue); +#if 0 + /** Sets the global time buffer. */ + void setTimeBuffer(TimeBuffer *tb_ptr); + + /** Number of entries needed for given amount of threads. */ + int entryAmount(int num_threads); + + /** Resets max entries for all threads. */ + void resetEntries(); +#endif + /** Returns total number of free entries. */ + unsigned numFreeEntries(); + + /** Returns number of free entries for a thread. */ + unsigned numFreeEntries(unsigned tid); + + /** Returns whether or not the IQ is full. */ + bool isFull(); + + /** Returns whether or not the IQ is full for a specific thread. */ + bool isFull(unsigned tid); + + /** Returns if there are any ready instructions in the IQ. */ + bool hasReadyInsts(); + + /** Inserts a new instruction into the IQ. */ + void insert(DynInstPtr &new_inst); + + /** Inserts a new, non-speculative instruction into the IQ. 
*/ + void insertNonSpec(DynInstPtr &new_inst); +#if 0 + /** + * Advances the tail of the IQ, used if an instruction is not added to the + * IQ for scheduling. + * @todo: Rename this function. + */ + void advanceTail(DynInstPtr &inst); + + /** Process FU completion event. */ + void processFUCompletion(DynInstPtr &inst, int fu_idx); +#endif + /** + * Schedules ready instructions, adding the ready ones (oldest first) to + * the queue to execute. + */ + void scheduleReadyInsts(); + + /** Schedules a single specific non-speculative instruction. */ + void scheduleNonSpec(const InstSeqNum &inst); + + /** + * Commits all instructions up to and including the given sequence number, + * for a specific thread. + */ + void commit(const InstSeqNum &inst, unsigned tid = 0); + + /** Wakes all dependents of a completed instruction. */ + void wakeDependents(DynInstPtr &completed_inst); + + /** Adds a ready memory instruction to the ready list. */ + void addReadyMemInst(DynInstPtr &ready_inst); +#if 0 + /** + * Reschedules a memory instruction. It will be ready to issue once + * replayMemInst() is called. + */ + void rescheduleMemInst(DynInstPtr &resched_inst); + + /** Replays a memory instruction. It must be rescheduled first. */ + void replayMemInst(DynInstPtr &replay_inst); +#endif + /** Completes a memory operation. */ + void completeMemInst(DynInstPtr &completed_inst); +#if 0 + /** Indicates an ordering violation between a store and a load. */ + void violation(DynInstPtr &store, DynInstPtr &faulting_load); +#endif + /** + * Squashes instructions for a thread. Squashing information is obtained + * from the time buffer. + */ + void squash(unsigned tid); // Probably want the ISN + + /** Returns the number of used entries for a thread. */ + unsigned getCount(unsigned tid) { return count[tid]; }; + + /** Updates the number of free entries. */ + void updateFreeEntries(int num) { freeEntries += num; } + + /** Debug function to print all instructions. 
*/ + void printInsts(); + + private: + /** Does the actual squashing. */ + void doSquash(unsigned tid); + + ///////////////////////// + // Various pointers + ///////////////////////// + + /** Pointer to the CPU. */ + FullCPU *cpu; + + /** Cache interface. */ + MemInterface *dcacheInterface; +#if 0 + /** Pointer to IEW stage. */ + IEW *iewStage; + + /** The memory dependence unit, which tracks/predicts memory dependences + * between instructions. + */ + MemDepUnit memDepUnit[Impl::MaxThreads]; +#endif + /** The queue to the execute stage. Issued instructions will be written + * into it. + */ + TimeBuffer *issueToExecuteQueue; +#if 0 + /** The backwards time buffer. */ + TimeBuffer *timeBuffer; + + /** Wire to read information from timebuffer. */ + typename TimeBuffer::wire fromCommit; + + /** Function unit pool. */ + FUPool *fuPool; +#endif + ////////////////////////////////////// + // Instruction lists, ready queues, and ordering + ////////////////////////////////////// + + /** List of all the instructions in the IQ (some of which may be issued). */ + std::list instList[Impl::MaxThreads]; + + /** + * Struct for comparing entries to be added to the priority queue. This + * gives reverse ordering to the instructions in terms of sequence + * numbers: the instructions with smaller sequence numbers (and hence + * are older) will be at the top of the priority queue. + */ + struct pqCompare { + bool operator() (const DynInstPtr &lhs, const DynInstPtr &rhs) const + { + return lhs->seqNum > rhs->seqNum; + } + }; + + /** + * Struct for an IQ entry. It includes the instruction and an iterator + * to the instruction's spot in the IQ. + */ + struct IQEntry { + DynInstPtr inst; + ListIt iqIt; + }; + + typedef std::priority_queue, pqCompare> + ReadyInstQueue; + + typedef std::map ReadyInstMap; + typedef typename std::map::iterator ReadyMapIt; + + /** List of ready instructions. 
+ */ + ReadyInstQueue readyInsts; + + /** List of non-speculative instructions that will be scheduled + * once the IQ gets a signal from commit. While it's redundant to + * have the key be a part of the value (the sequence number is stored + * inside of DynInst), when these instructions are woken up only + * the sequence number will be available. Thus it is most efficient to be + * able to search by the sequence number alone. + */ + std::map nonSpecInsts; + + typedef typename std::map::iterator NonSpecMapIt; +#if 0 + /** Entry for the list age ordering by op class. */ + struct ListOrderEntry { + OpClass queueType; + InstSeqNum oldestInst; + }; + + /** List that contains the age order of the oldest instruction of each + * ready queue. Used to select the oldest instruction available + * among op classes. + */ + std::list listOrder; + + typedef typename std::list::iterator ListOrderIt; + + /** Tracks if each ready queue is on the age order list. */ + bool queueOnList[Num_OpClasses]; + + /** Iterators of each ready queue. Points to their spot in the age order + * list. + */ + ListOrderIt readyIt[Num_OpClasses]; + + /** Add an op class to the age order list. */ + void addToOrderList(OpClass op_class); + + /** + * Called when the oldest instruction has been removed from a ready queue; + * this places that ready queue into the proper spot in the age order list. + */ + void moveToYoungerInst(ListOrderIt age_order_it); +#endif + ////////////////////////////////////// + // Various parameters + ////////////////////////////////////// +#if 0 + /** IQ Resource Sharing Policy */ + enum IQPolicy { + Dynamic, + Partitioned, + Threshold + }; + + /** IQ sharing policy for SMT. */ + IQPolicy iqPolicy; +#endif + /** Number of Total Threads*/ + unsigned numThreads; +#if 0 + /** Pointer to list of active threads. 
*/ + list *activeThreads; +#endif + /** Per Thread IQ count */ + unsigned count[Impl::MaxThreads]; + + /** Max IQ Entries Per Thread */ + unsigned maxEntries[Impl::MaxThreads]; + + /** Number of free IQ entries left. */ + unsigned freeEntries; + + /** The number of entries in the instruction queue. */ + unsigned numEntries; + + /** The total number of instructions that can be issued in one cycle. */ + unsigned totalWidth; +#if 0 + /** The number of physical registers in the CPU. */ + unsigned numPhysRegs; + + /** The number of physical integer registers in the CPU. */ + unsigned numPhysIntRegs; + + /** The number of floating point registers in the CPU. */ + unsigned numPhysFloatRegs; +#endif + /** Delay between commit stage and the IQ. + * @todo: Make there be a distinction between the delays within IEW. + */ + unsigned commitToIEWDelay; + + ////////////////////////////////// + // Variables needed for squashing + ////////////////////////////////// + + /** The sequence number of the squashed instruction. */ + InstSeqNum squashedSeqNum[Impl::MaxThreads]; + + /** Iterator that points to the last instruction that has been squashed. + * This will not be valid unless the IQ is in the process of squashing. + */ + ListIt squashIt[Impl::MaxThreads]; +#if 0 + /////////////////////////////////// + // Dependency graph stuff + /////////////////////////////////// + + class DependencyEntry + { + public: + DependencyEntry() + : inst(NULL), next(NULL) + { } + + DynInstPtr inst; + //Might want to include data about what arch. register the + //dependence is waiting on. + DependencyEntry *next; + + //This function, and perhaps this whole class, stand out a little + //bit as they don't fit a classification well. I want access + //to the underlying structure of the linked list, yet at + //the same time it feels like this should be something abstracted + //away. So for now it will sit here, within the IQ, until + //a better implementation is decided upon. 
+ // This function probably shouldn't be within the entry... + void insert(DynInstPtr &new_inst); + + void remove(DynInstPtr &inst_to_remove); + + // Debug variable, remove when done testing. + static unsigned mem_alloc_counter; + }; + + /** Array of linked lists. Each linked list is a list of all the + * instructions that depend upon a given register. The actual + * register's index is used to index into the graph; ie all + * instructions in flight that are dependent upon r34 will be + * in the linked list of dependGraph[34]. + */ + DependencyEntry *dependGraph; + + /** A cache of the recently woken registers. It is 1 if the register + * has been woken up recently, and 0 if the register has been added + * to the dependency graph and has not yet received its value. It + * is basically a secondary scoreboard, and should pretty much mirror + * the scoreboard that exists in the rename map. + */ + vector regScoreboard; + + /** Adds an instruction to the dependency graph, as a producer. */ + bool addToDependents(DynInstPtr &new_inst); + + /** Adds an instruction to the dependency graph, as a consumer. */ + void createDependency(DynInstPtr &new_inst); +#endif + /** Moves an instruction to the ready queue if it is ready. */ + void addIfReady(DynInstPtr &inst); + + /** Debugging function to count how many entries are in the IQ. It does + * a linear walk through the instructions, so do not call this function + * during normal execution. + */ + int countInsts(); +#if 0 + /** Debugging function to dump out the dependency graph. + */ + void dumpDependGraph(); +#endif + /** Debugging function to dump all the list sizes, as well as print + * out the list of nonspeculative instructions. Should not be used + * in any other capacity, but it has no harmful sideaffects. + */ + void dumpLists(); + + /** Debugging function to dump out all instructions that are in the + * IQ. + */ + void dumpInsts(); + + /** Stat for number of instructions added. 
*/ + Stats::Scalar<> iqInstsAdded; + /** Stat for number of non-speculative instructions added. */ + Stats::Scalar<> iqNonSpecInstsAdded; +// Stats::Scalar<> iqIntInstsAdded; + /** Stat for number of integer instructions issued. */ + Stats::Scalar<> iqIntInstsIssued; +// Stats::Scalar<> iqFloatInstsAdded; + /** Stat for number of floating point instructions issued. */ + Stats::Scalar<> iqFloatInstsIssued; +// Stats::Scalar<> iqBranchInstsAdded; + /** Stat for number of branch instructions issued. */ + Stats::Scalar<> iqBranchInstsIssued; +// Stats::Scalar<> iqMemInstsAdded; + /** Stat for number of memory instructions issued. */ + Stats::Scalar<> iqMemInstsIssued; +// Stats::Scalar<> iqMiscInstsAdded; + /** Stat for number of miscellaneous instructions issued. */ + Stats::Scalar<> iqMiscInstsIssued; + /** Stat for number of squashed instructions that were ready to issue. */ + Stats::Scalar<> iqSquashedInstsIssued; + /** Stat for number of squashed instructions examined when squashing. */ + Stats::Scalar<> iqSquashedInstsExamined; + /** Stat for number of squashed instruction operands examined when + * squashing. + */ + Stats::Scalar<> iqSquashedOperandsExamined; + /** Stat for number of non-speculative instructions removed due to a squash. + */ + Stats::Scalar<> iqSquashedNonSpecRemoved; + +}; + +#endif //__CPU_OZONE_INST_QUEUE_HH__ diff --git a/cpu/ozone/inst_queue_impl.hh b/cpu/ozone/inst_queue_impl.hh new file mode 100644 index 000000000..0523c68d6 --- /dev/null +++ b/cpu/ozone/inst_queue_impl.hh @@ -0,0 +1,1341 @@ +/* + * Copyright (c) 2004-2005 The Regents of The University of Michigan + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// Todo: +// Current ordering allows for 0 cycle added-to-scheduled. Could maybe fake +// it; either do in reverse order, or have added instructions put into a +// different ready queue that, in scheduleRreadyInsts(), gets put onto the +// normal ready queue. This would however give only a one cycle delay, +// but probably is more flexible to actually add in a delay parameter than +// just running it backwards. 
+ +#include + +#include "sim/root.hh" + +#include "cpu/ozone/inst_queue.hh" +#if 0 +template +InstQueue::FUCompletion::FUCompletion(DynInstPtr &_inst, + int fu_idx, + InstQueue *iq_ptr) + : Event(&mainEventQueue, Stat_Event_Pri), + inst(_inst), fuIdx(fu_idx), iqPtr(iq_ptr) +{ + this->setFlags(Event::AutoDelete); +} + +template +void +InstQueue::FUCompletion::process() +{ + iqPtr->processFUCompletion(inst, fuIdx); +} + + +template +const char * +InstQueue::FUCompletion::description() +{ + return "Functional unit completion event"; +} +#endif +template +InstQueue::InstQueue(Params *params) + : dcacheInterface(params->dcacheInterface), +// fuPool(params->fuPool), + numEntries(params->numIQEntries), + totalWidth(params->issueWidth), +// numPhysIntRegs(params->numPhysIntRegs), +// numPhysFloatRegs(params->numPhysFloatRegs), + commitToIEWDelay(params->commitToIEWDelay) +{ +// assert(fuPool); + +// numThreads = params->numberOfThreads; + numThreads = 1; + + //Initialize thread IQ counts + for (int i = 0; i smtIQPolicy; + + //Convert string to lowercase + std::transform(policy.begin(), policy.end(), policy.begin(), + (int(*)(int)) tolower); + + //Figure out resource sharing policy + if (policy == "dynamic") { + iqPolicy = Dynamic; + + //Set Max Entries to Total ROB Capacity + for (int i = 0; i < numThreads; i++) { + maxEntries[i] = numEntries; + } + + } else if (policy == "partitioned") { + iqPolicy = Partitioned; + + //@todo:make work if part_amt doesnt divide evenly. 
+ int part_amt = numEntries / numThreads; + + //Divide ROB up evenly + for (int i = 0; i < numThreads; i++) { + maxEntries[i] = part_amt; + } + + DPRINTF(Fetch, "IQ sharing policy set to Partitioned:" + "%i entries per thread.\n",part_amt); + + } else if (policy == "threshold") { + iqPolicy = Threshold; + + double threshold = (double)params->smtIQThreshold / 100; + + int thresholdIQ = (int)((double)threshold * numEntries); + + //Divide up by threshold amount + for (int i = 0; i < numThreads; i++) { + maxEntries[i] = thresholdIQ; + } + + DPRINTF(Fetch, "IQ sharing policy set to Threshold:" + "%i entries per thread.\n",thresholdIQ); + } else { + assert(0 && "Invalid IQ Sharing Policy.Options Are:{Dynamic," + "Partitioned, Threshold}"); + } +*/ +} + +template +InstQueue::~InstQueue() +{ + // Clear the dependency graph +/* + DependencyEntry *curr; + DependencyEntry *prev; + + for (int i = 0; i < numPhysRegs; ++i) { + curr = dependGraph[i].next; + + while (curr) { + DependencyEntry::mem_alloc_counter--; + + prev = curr; + curr = prev->next; + prev->inst = NULL; + + delete prev; + } + + if (dependGraph[i].inst) { + dependGraph[i].inst = NULL; + } + + dependGraph[i].next = NULL; + } + + assert(DependencyEntry::mem_alloc_counter == 0); + + delete [] dependGraph; +*/ +} + +template +std::string +InstQueue::name() const +{ + return cpu->name() + ".iq"; +} + +template +void +InstQueue::regStats() +{ + iqInstsAdded + .name(name() + ".iqInstsAdded") + .desc("Number of instructions added to the IQ (excludes non-spec)") + .prereq(iqInstsAdded); + + iqNonSpecInstsAdded + .name(name() + ".iqNonSpecInstsAdded") + .desc("Number of non-speculative instructions added to the IQ") + .prereq(iqNonSpecInstsAdded); + +// iqIntInstsAdded; + + iqIntInstsIssued + .name(name() + ".iqIntInstsIssued") + .desc("Number of integer instructions issued") + .prereq(iqIntInstsIssued); + +// iqFloatInstsAdded; + + iqFloatInstsIssued + .name(name() + ".iqFloatInstsIssued") + .desc("Number of float 
instructions issued") + .prereq(iqFloatInstsIssued); + +// iqBranchInstsAdded; + + iqBranchInstsIssued + .name(name() + ".iqBranchInstsIssued") + .desc("Number of branch instructions issued") + .prereq(iqBranchInstsIssued); + +// iqMemInstsAdded; + + iqMemInstsIssued + .name(name() + ".iqMemInstsIssued") + .desc("Number of memory instructions issued") + .prereq(iqMemInstsIssued); + +// iqMiscInstsAdded; + + iqMiscInstsIssued + .name(name() + ".iqMiscInstsIssued") + .desc("Number of miscellaneous instructions issued") + .prereq(iqMiscInstsIssued); + + iqSquashedInstsIssued + .name(name() + ".iqSquashedInstsIssued") + .desc("Number of squashed instructions issued") + .prereq(iqSquashedInstsIssued); + + iqSquashedInstsExamined + .name(name() + ".iqSquashedInstsExamined") + .desc("Number of squashed instructions iterated over during squash;" + " mainly for profiling") + .prereq(iqSquashedInstsExamined); + + iqSquashedOperandsExamined + .name(name() + ".iqSquashedOperandsExamined") + .desc("Number of squashed operands that are examined and possibly " + "removed from graph") + .prereq(iqSquashedOperandsExamined); + + iqSquashedNonSpecRemoved + .name(name() + ".iqSquashedNonSpecRemoved") + .desc("Number of squashed non-spec instructions that were removed") + .prereq(iqSquashedNonSpecRemoved); +/* + for ( int i=0; i < numThreads; i++) { + // Tell mem dependence unit to reg stats as well. 
+ memDepUnit[i].regStats(); + } +*/ +} +/* +template +void +InstQueue::setActiveThreads(list *at_ptr) +{ + DPRINTF(IQ, "Setting active threads list pointer.\n"); + activeThreads = at_ptr; +} +*/ +template +void +InstQueue::setIssueToExecuteQueue(TimeBuffer *i2e_ptr) +{ + DPRINTF(IQ, "Set the issue to execute queue.\n"); + issueToExecuteQueue = i2e_ptr; +} +/* +template +void +InstQueue::setTimeBuffer(TimeBuffer *tb_ptr) +{ + DPRINTF(IQ, "Set the time buffer.\n"); + timeBuffer = tb_ptr; + + fromCommit = timeBuffer->getWire(-commitToIEWDelay); +} + +template +int +InstQueue::entryAmount(int num_threads) +{ + if (iqPolicy == Partitioned) { + return numEntries / num_threads; + } else { + return 0; + } +} + + +template +void +InstQueue::resetEntries() +{ + if (iqPolicy != Dynamic || numThreads > 1) { + int active_threads = (*activeThreads).size(); + + list::iterator threads = (*activeThreads).begin(); + list::iterator list_end = (*activeThreads).end(); + + while (threads != list_end) { + if (iqPolicy == Partitioned) { + maxEntries[*threads++] = numEntries / active_threads; + } else if(iqPolicy == Threshold && active_threads == 1) { + maxEntries[*threads++] = numEntries; + } + } + } +} +*/ +template +unsigned +InstQueue::numFreeEntries() +{ + return freeEntries; +} + +template +unsigned +InstQueue::numFreeEntries(unsigned tid) +{ + return maxEntries[tid] - count[tid]; +} + +// Might want to do something more complex if it knows how many instructions +// will be issued this cycle. 
+template +bool +InstQueue::isFull() +{ + if (freeEntries == 0) { + return(true); + } else { + return(false); + } +} + +template +bool +InstQueue::isFull(unsigned tid) +{ + if (numFreeEntries(tid) == 0) { + return(true); + } else { + return(false); + } +} + +template +bool +InstQueue::hasReadyInsts() +{ +/* + if (!listOrder.empty()) { + return true; + } + + for (int i = 0; i < Num_OpClasses; ++i) { + if (!readyInsts[i].empty()) { + return true; + } + } + + return false; +*/ + return readyInsts.empty(); +} + +template +void +InstQueue::insert(DynInstPtr &new_inst) +{ + // Make sure the instruction is valid + assert(new_inst); + + DPRINTF(IQ, "Adding instruction PC %#x to the IQ.\n", + new_inst->readPC()); + + // Check if there are any free entries. Panic if there are none. + // Might want to have this return a fault in the future instead of + // panicing. + assert(freeEntries != 0); + + instList[new_inst->threadNumber].push_back(new_inst); + + // Decrease the number of free entries. + --freeEntries; + + //Mark Instruction as in IQ +// new_inst->setInIQ(); +/* + // Look through its source registers (physical regs), and mark any + // dependencies. + addToDependents(new_inst); + + // Have this instruction set itself as the producer of its destination + // register(s). + createDependency(new_inst); +*/ + // If it's a memory instruction, add it to the memory dependency + // unit. +// if (new_inst->isMemRef()) { +// memDepUnit[new_inst->threadNumber].insert(new_inst); +// } else { + // If the instruction is ready then add it to the ready list. + addIfReady(new_inst); +// } + + ++iqInstsAdded; + + + //Update Thread IQ Count + count[new_inst->threadNumber]++; + + assert(freeEntries == (numEntries - countInsts())); +} + +template +void +InstQueue::insertNonSpec(DynInstPtr &new_inst) +{ + nonSpecInsts[new_inst->seqNum] = new_inst; + + // @todo: Clean up this code; can do it by setting inst as unable + // to issue, then calling normal insert on the inst. 
+ + // Make sure the instruction is valid + assert(new_inst); + + DPRINTF(IQ, "Adding instruction PC %#x to the IQ.\n", + new_inst->readPC()); + + // Check if there are any free entries. Panic if there are none. + // Might want to have this return a fault in the future instead of + // panicing. + assert(freeEntries != 0); + + instList[new_inst->threadNumber].push_back(new_inst); + + // Decrease the number of free entries. + --freeEntries; + + //Mark Instruction as in IQ +// new_inst->setInIQ(); +/* + // Have this instruction set itself as the producer of its destination + // register(s). + createDependency(new_inst); + + // If it's a memory instruction, add it to the memory dependency + // unit. + if (new_inst->isMemRef()) { + memDepUnit[new_inst->threadNumber].insertNonSpec(new_inst); + } +*/ + ++iqNonSpecInstsAdded; + + //Update Thread IQ Count + count[new_inst->threadNumber]++; + + assert(freeEntries == (numEntries - countInsts())); +} +/* +template +void +InstQueue::advanceTail(DynInstPtr &inst) +{ + // Have this instruction set itself as the producer of its destination + // register(s). + createDependency(inst); +} + +template +void +InstQueue::addToOrderList(OpClass op_class) +{ + assert(!readyInsts[op_class].empty()); + + ListOrderEntry queue_entry; + + queue_entry.queueType = op_class; + + queue_entry.oldestInst = readyInsts[op_class].top()->seqNum; + + ListOrderIt list_it = listOrder.begin(); + ListOrderIt list_end_it = listOrder.end(); + + while (list_it != list_end_it) { + if ((*list_it).oldestInst > queue_entry.oldestInst) { + break; + } + + list_it++; + } + + readyIt[op_class] = listOrder.insert(list_it, queue_entry); + queueOnList[op_class] = true; +} + +template +void +InstQueue::moveToYoungerInst(ListOrderIt list_order_it) +{ + // Get iterator of next item on the list + // Delete the original iterator + // Determine if the next item is either the end of the list or younger + // than the new instruction. If so, then add in a new iterator right here. 
+ // If not, then move along. + ListOrderEntry queue_entry; + OpClass op_class = (*list_order_it).queueType; + ListOrderIt next_it = list_order_it; + + ++next_it; + + queue_entry.queueType = op_class; + queue_entry.oldestInst = readyInsts[op_class].top()->seqNum; + + while (next_it != listOrder.end() && + (*next_it).oldestInst < queue_entry.oldestInst) { + ++next_it; + } + + readyIt[op_class] = listOrder.insert(next_it, queue_entry); +} + +template +void +InstQueue::processFUCompletion(DynInstPtr &inst, int fu_idx) +{ + // The CPU could have been sleeping until this op completed (*extremely* + // long latency op). Wake it if it was. This may be overkill. + iewStage->wakeCPU(); + + fuPool->freeUnit(fu_idx); + + int &size = issueToExecuteQueue->access(0)->size; + + issueToExecuteQueue->access(0)->insts[size++] = inst; +} +*/ +// @todo: Figure out a better way to remove the squashed items from the +// lists. Checking the top item of each list to see if it's squashed +// wastes time and forces jumps. +template +void +InstQueue::scheduleReadyInsts() +{ + DPRINTF(IQ, "Attempting to schedule ready instructions from " + "the IQ.\n"); + +// IssueStruct *i2e_info = issueToExecuteQueue->access(0); +/* + // Will need to reorder the list if either a queue is not on the list, + // or it has an older instruction than last time. + for (int i = 0; i < Num_OpClasses; ++i) { + if (!readyInsts[i].empty()) { + if (!queueOnList[i]) { + addToOrderList(OpClass(i)); + } else if (readyInsts[i].top()->seqNum < + (*readyIt[i]).oldestInst) { + listOrder.erase(readyIt[i]); + addToOrderList(OpClass(i)); + } + } + } + + // Have iterator to head of the list + // While I haven't exceeded bandwidth or reached the end of the list, + // Try to get a FU that can do what this op needs. + // If successful, change the oldestInst to the new top of the list, put + // the queue in the proper place in the list. + // Increment the iterator. 
+ // This will avoid trying to schedule a certain op class if there are no + // FUs that handle it. + ListOrderIt order_it = listOrder.begin(); + ListOrderIt order_end_it = listOrder.end(); + int total_issued = 0; + int exec_queue_slot = i2e_info->size; + + while (exec_queue_slot < totalWidth && order_it != order_end_it) { + OpClass op_class = (*order_it).queueType; + + assert(!readyInsts[op_class].empty()); + + DynInstPtr issuing_inst = readyInsts[op_class].top(); + + assert(issuing_inst->seqNum == (*order_it).oldestInst); + + if (issuing_inst->isSquashed()) { + readyInsts[op_class].pop(); + + if (!readyInsts[op_class].empty()) { + moveToYoungerInst(order_it); + } else { + readyIt[op_class] = listOrder.end(); + queueOnList[op_class] = false; + } + + listOrder.erase(order_it++); + + ++iqSquashedInstsIssued; + + continue; + } + + int idx = fuPool->getUnit(op_class); + + if (idx != -1) { + int op_latency = fuPool->getOpLatency(op_class); + + if (op_latency == 1) { + i2e_info->insts[exec_queue_slot++] = issuing_inst; + i2e_info->size++; + + // Add the FU onto the list of FU's to be freed next cycle. + fuPool->freeUnit(idx); + } else { + int issue_latency = fuPool->getIssueLatency(op_class); + + if (issue_latency > 1) { + // Generate completion event for the FU + FUCompletion *execution = new FUCompletion(issuing_inst, + idx, this); + + execution->schedule(curTick + issue_latency - 1); + } else { + i2e_info->insts[exec_queue_slot++] = issuing_inst; + i2e_info->size++; + + // Add the FU onto the list of FU's to be freed next cycle. 
+ fuPool->freeUnit(idx); + } + } + + DPRINTF(IQ, "Thread %i: Issuing instruction PC %#x " + "[sn:%lli]\n", + issuing_inst->threadNumber, issuing_inst->readPC(), + issuing_inst->seqNum); + + readyInsts[op_class].pop(); + + if (!readyInsts[op_class].empty()) { + moveToYoungerInst(order_it); + } else { + readyIt[op_class] = listOrder.end(); + queueOnList[op_class] = false; + } + + issuing_inst->setIssued(); + ++total_issued; + + if (!issuing_inst->isMemRef()) { + // Memory instructions can not be freed from the IQ until they + // complete. + ++freeEntries; + count[issuing_inst->threadNumber]--; + issuing_inst->removeInIQ(); + } else { + memDepUnit[issuing_inst->threadNumber].issue(issuing_inst); + } + + listOrder.erase(order_it++); + } else { + ++order_it; + } + } + + if (total_issued) { + cpu->activityThisCycle(); + } else { + DPRINTF(IQ, "Not able to schedule any instructions.\n"); + } +*/ +} + +template +void +InstQueue::scheduleNonSpec(const InstSeqNum &inst) +{ + DPRINTF(IQ, "Marking nonspeculative instruction with sequence " + "number %i as ready to execute.\n", inst); + + NonSpecMapIt inst_it = nonSpecInsts.find(inst); + + assert(inst_it != nonSpecInsts.end()); + +// unsigned tid = (*inst_it).second->threadNumber; + + // Mark this instruction as ready to issue. + (*inst_it).second->setCanIssue(); + + // Now schedule the instruction. 
+// if (!(*inst_it).second->isMemRef()) { + addIfReady((*inst_it).second); +// } else { +// memDepUnit[tid].nonSpecInstReady((*inst_it).second); +// } + + nonSpecInsts.erase(inst_it); +} + +template +void +InstQueue::commit(const InstSeqNum &inst, unsigned tid) +{ + /*Need to go through each thread??*/ + DPRINTF(IQ, "[tid:%i]: Committing instructions older than [sn:%i]\n", + tid,inst); + + ListIt iq_it = instList[tid].begin(); + + while (iq_it != instList[tid].end() && + (*iq_it)->seqNum <= inst) { + ++iq_it; + instList[tid].pop_front(); + } + + assert(freeEntries == (numEntries - countInsts())); +} + +template +void +InstQueue::wakeDependents(DynInstPtr &completed_inst) +{ + DPRINTF(IQ, "Waking dependents of completed instruction.\n"); + // Look at the physical destination register of the DynInst + // and look it up on the dependency graph. Then mark as ready + // any instructions within the instruction queue. +/* + DependencyEntry *curr; + DependencyEntry *prev; +*/ + // Tell the memory dependence unit to wake any dependents on this + // instruction if it is a memory instruction. Also complete the memory + // instruction at this point since we know it executed fine. + // @todo: Might want to rename "completeMemInst" to + // something that indicates that it won't need to be replayed, and call + // this earlier. Might not be a big deal. + if (completed_inst->isMemRef()) { +// memDepUnit[completed_inst->threadNumber].wakeDependents(completed_inst); + completeMemInst(completed_inst); + } + completed_inst->wakeDependents(); +/* + for (int dest_reg_idx = 0; + dest_reg_idx < completed_inst->numDestRegs(); + dest_reg_idx++) + { + PhysRegIndex dest_reg = + completed_inst->renamedDestRegIdx(dest_reg_idx); + + // Special case of uniq or control registers. They are not + // handled by the IQ and thus have no dependency graph entry. + // @todo Figure out a cleaner way to handle this. 
+ if (dest_reg >= numPhysRegs) { + continue; + } + + DPRINTF(IQ, "Waking any dependents on register %i.\n", + (int) dest_reg); + + //Maybe abstract this part into a function. + //Go through the dependency chain, marking the registers as ready + //within the waiting instructions. + + curr = dependGraph[dest_reg].next; + + while (curr) { + DPRINTF(IQ, "Waking up a dependent instruction, PC%#x.\n", + curr->inst->readPC()); + + // Might want to give more information to the instruction + // so that it knows which of its source registers is ready. + // However that would mean that the dependency graph entries + // would need to hold the src_reg_idx. + curr->inst->markSrcRegReady(); + + addIfReady(curr->inst); + + DependencyEntry::mem_alloc_counter--; + + prev = curr; + curr = prev->next; + prev->inst = NULL; + + delete prev; + } + + // Reset the head node now that all of its dependents have been woken + // up. + dependGraph[dest_reg].next = NULL; + dependGraph[dest_reg].inst = NULL; + + // Mark the scoreboard as having that register ready. 
+ regScoreboard[dest_reg] = true; + } +*/ +} + +template +void +InstQueue::addReadyMemInst(DynInstPtr &ready_inst) +{ + OpClass op_class = ready_inst->opClass(); + + readyInsts.push(ready_inst); + + DPRINTF(IQ, "Instruction is ready to issue, putting it onto " + "the ready list, PC %#x opclass:%i [sn:%lli].\n", + ready_inst->readPC(), op_class, ready_inst->seqNum); +} +/* +template +void +InstQueue::rescheduleMemInst(DynInstPtr &resched_inst) +{ + memDepUnit[resched_inst->threadNumber].reschedule(resched_inst); +} + +template +void +InstQueue::replayMemInst(DynInstPtr &replay_inst) +{ + memDepUnit[replay_inst->threadNumber].replay(replay_inst); +} +*/ +template +void +InstQueue::completeMemInst(DynInstPtr &completed_inst) +{ + int tid = completed_inst->threadNumber; + + DPRINTF(IQ, "Completing mem instruction PC:%#x [sn:%lli]\n", + completed_inst->readPC(), completed_inst->seqNum); + + ++freeEntries; + +// completed_inst->memOpDone = true; + +// memDepUnit[tid].completed(completed_inst); + + count[tid]--; +} +/* +template +void +InstQueue::violation(DynInstPtr &store, + DynInstPtr &faulting_load) +{ + memDepUnit[store->threadNumber].violation(store, faulting_load); +} +*/ +template +void +InstQueue::squash(unsigned tid) +{ + DPRINTF(IQ, "[tid:%i]: Starting to squash instructions in " + "the IQ.\n", tid); + + // Read instruction sequence number of last instruction out of the + // time buffer. +// squashedSeqNum[tid] = fromCommit->commitInfo[tid].doneSeqNum; + + // Setup the squash iterator to point to the tail. + squashIt[tid] = instList[tid].end(); + --squashIt[tid]; + + // Call doSquash if there are insts in the IQ + if (count[tid] > 0) { + doSquash(tid); + } + + // Also tell the memory dependence unit to squash. +// memDepUnit[tid].squash(squashedSeqNum[tid], tid); +} + +template +void +InstQueue::doSquash(unsigned tid) +{ + // Make sure the squashed sequence number is valid. 
+ assert(squashedSeqNum[tid] != 0); + + DPRINTF(IQ, "[tid:%i]: Squashing until sequence number %i!\n", + tid, squashedSeqNum[tid]); + + // Squash any instructions younger than the squashed sequence number + // given. + while (squashIt[tid] != instList[tid].end() && + (*squashIt[tid])->seqNum > squashedSeqNum[tid]) { + + DynInstPtr squashed_inst = (*squashIt[tid]); + + // Only handle the instruction if it actually is in the IQ and + // hasn't already been squashed in the IQ. + if (squashed_inst->threadNumber != tid || + squashed_inst->isSquashedInIQ()) { + --squashIt[tid]; + continue; + } + + if (!squashed_inst->isIssued() || + (squashed_inst->isMemRef()/* && + !squashed_inst->memOpDone*/)) { + + // Remove the instruction from the dependency list. + if (!squashed_inst->isNonSpeculative()) { +/* + for (int src_reg_idx = 0; + src_reg_idx < squashed_inst->numSrcRegs(); + src_reg_idx++) + { + PhysRegIndex src_reg = + squashed_inst->renamedSrcRegIdx(src_reg_idx); + + // Only remove it from the dependency graph if it was + // placed there in the first place. + // HACK: This assumes that instructions woken up from the + // dependency chain aren't informed that a specific src + // register has become ready. This may not always be true + // in the future. + // Instead of doing a linked list traversal, we can just + // remove these squashed instructions either at issue time, + // or when the register is overwritten. The only downside + // to this is it leaves more room for error. + + if (!squashed_inst->isReadySrcRegIdx(src_reg_idx) && + src_reg < numPhysRegs) { + dependGraph[src_reg].remove(squashed_inst); + } + + + ++iqSquashedOperandsExamined; + } +*/ + // Might want to remove producers as well. + } else { + nonSpecInsts[squashed_inst->seqNum] = NULL; + + nonSpecInsts.erase(squashed_inst->seqNum); + + ++iqSquashedNonSpecRemoved; + } + + // Might want to also clear out the head of the dependency graph. + + // Mark it as squashed within the IQ. 
+ squashed_inst->setSquashedInIQ(); + + // @todo: Remove this hack where several statuses are set so the + // inst will flow through the rest of the pipeline. + squashed_inst->setIssued(); + squashed_inst->setCanCommit(); +// squashed_inst->removeInIQ(); + + //Update Thread IQ Count + count[squashed_inst->threadNumber]--; + + ++freeEntries; + + if (numThreads > 1) { + DPRINTF(IQ, "[tid:%i]: Instruction PC %#x squashed.\n", + tid, squashed_inst->readPC()); + } else { + DPRINTF(IQ, "Instruction PC %#x squashed.\n", + squashed_inst->readPC()); + } + } + + --squashIt[tid]; + ++iqSquashedInstsExamined; + } +} +/* +template +void +InstQueue::DependencyEntry::insert(DynInstPtr &new_inst) +{ + //Add this new, dependent instruction at the head of the dependency + //chain. + + // First create the entry that will be added to the head of the + // dependency chain. + DependencyEntry *new_entry = new DependencyEntry; + new_entry->next = this->next; + new_entry->inst = new_inst; + + // Then actually add it to the chain. + this->next = new_entry; + + ++mem_alloc_counter; +} + +template +void +InstQueue::DependencyEntry::remove(DynInstPtr &inst_to_remove) +{ + DependencyEntry *prev = this; + DependencyEntry *curr = this->next; + + // Make sure curr isn't NULL. Because this instruction is being + // removed from a dependency list, it must have been placed there at + // an earlier time. The dependency chain should not be empty, + // unless the instruction dependent upon it is already ready. + if (curr == NULL) { + return; + } + + // Find the instruction to remove within the dependency linked list. + while (curr->inst != inst_to_remove) { + prev = curr; + curr = curr->next; + + assert(curr != NULL); + } + + // Now remove this instruction from the list. 
+ prev->next = curr->next; + + --mem_alloc_counter; + + // Could push this off to the destructor of DependencyEntry + curr->inst = NULL; + + delete curr; +} + +template +bool +InstQueue::addToDependents(DynInstPtr &new_inst) +{ + // Loop through the instruction's source registers, adding + // them to the dependency list if they are not ready. + int8_t total_src_regs = new_inst->numSrcRegs(); + bool return_val = false; + + for (int src_reg_idx = 0; + src_reg_idx < total_src_regs; + src_reg_idx++) + { + // Only add it to the dependency graph if it's not ready. + if (!new_inst->isReadySrcRegIdx(src_reg_idx)) { + PhysRegIndex src_reg = new_inst->renamedSrcRegIdx(src_reg_idx); + + // Check the IQ's scoreboard to make sure the register + // hasn't become ready while the instruction was in flight + // between stages. Only if it really isn't ready should + // it be added to the dependency graph. + if (src_reg >= numPhysRegs) { + continue; + } else if (regScoreboard[src_reg] == false) { + DPRINTF(IQ, "Instruction PC %#x has src reg %i that " + "is being added to the dependency chain.\n", + new_inst->readPC(), src_reg); + + dependGraph[src_reg].insert(new_inst); + + // Change the return value to indicate that something + // was added to the dependency graph. + return_val = true; + } else { + DPRINTF(IQ, "Instruction PC %#x has src reg %i that " + "became ready before it reached the IQ.\n", + new_inst->readPC(), src_reg); + // Mark a register ready within the instruction. + new_inst->markSrcRegReady(); + } + } + } + + return return_val; +} + +template +void +InstQueue::createDependency(DynInstPtr &new_inst) +{ + //Actually nothing really needs to be marked when an + //instruction becomes the producer of a register's value, + //but for convenience a ptr to the producing instruction will + //be placed in the head node of the dependency links. 
+ int8_t total_dest_regs = new_inst->numDestRegs(); + + for (int dest_reg_idx = 0; + dest_reg_idx < total_dest_regs; + dest_reg_idx++) + { + PhysRegIndex dest_reg = new_inst->renamedDestRegIdx(dest_reg_idx); + + // Instructions that use the misc regs will have a reg number + // higher than the normal physical registers. In this case these + // registers are not renamed, and there is no need to track + // dependencies as these instructions must be executed at commit. + if (dest_reg >= numPhysRegs) { + continue; + } + + if (dependGraph[dest_reg].next) { + dumpDependGraph(); + panic("Dependency graph %i not empty!", dest_reg); + } + + dependGraph[dest_reg].inst = new_inst; + + // Mark the scoreboard to say it's not yet ready. + regScoreboard[dest_reg] = false; + } +} +*/ +template +void +InstQueue::addIfReady(DynInstPtr &inst) +{ + //If the instruction now has all of its source registers + // available, then add it to the list of ready instructions. + if (inst->readyToIssue()) { + + //Add the instruction to the proper ready list. + if (inst->isMemRef()) { + + DPRINTF(IQ, "Checking if memory instruction can issue.\n"); + + // Message to the mem dependence unit that this instruction has + // its registers ready. + +// memDepUnit[inst->threadNumber].regsReady(inst); + + return; + } + + OpClass op_class = inst->opClass(); + + DPRINTF(IQ, "Instruction is ready to issue, putting it onto " + "the ready list, PC %#x opclass:%i [sn:%lli].\n", + inst->readPC(), op_class, inst->seqNum); + + readyInsts.push(inst); + } +} + +template +int +InstQueue::countInsts() +{ + //ksewell:This works but definitely could use a cleaner write + //with a more intuitive way of counting. Right now it's + //just brute force .... 
+ +#if 0 + int total_insts = 0; + + for (int i = 0; i < numThreads; ++i) { + ListIt count_it = instList[i].begin(); + + while (count_it != instList[i].end()) { + if (!(*count_it)->isSquashed() && !(*count_it)->isSquashedInIQ()) { + if (!(*count_it)->isIssued()) { + ++total_insts; + } else if ((*count_it)->isMemRef() && + !(*count_it)->memOpDone) { + // Loads that have not been marked as executed still count + // towards the total instructions. + ++total_insts; + } + } + + ++count_it; + } + } + + return total_insts; +#else + return numEntries - freeEntries; +#endif +} +/* +template +void +InstQueue::dumpDependGraph() +{ + DependencyEntry *curr; + + for (int i = 0; i < numPhysRegs; ++i) + { + curr = &dependGraph[i]; + + if (curr->inst) { + cprintf("dependGraph[%i]: producer: %#x [sn:%lli] consumer: ", + i, curr->inst->readPC(), curr->inst->seqNum); + } else { + cprintf("dependGraph[%i]: No producer. consumer: ", i); + } + + while (curr->next != NULL) { + curr = curr->next; + + cprintf("%#x [sn:%lli] ", + curr->inst->readPC(), curr->inst->seqNum); + } + + cprintf("\n"); + } +} +*/ +template +void +InstQueue::dumpLists() +{ + for (int i = 0; i < Num_OpClasses; ++i) { + cprintf("Ready list %i size: %i\n", i, readyInsts.size()); + + cprintf("\n"); + } + + cprintf("Non speculative list size: %i\n", nonSpecInsts.size()); + + NonSpecMapIt non_spec_it = nonSpecInsts.begin(); + NonSpecMapIt non_spec_end_it = nonSpecInsts.end(); + + cprintf("Non speculative list: "); + + while (non_spec_it != non_spec_end_it) { + cprintf("%#x [sn:%lli]", (*non_spec_it).second->readPC(), + (*non_spec_it).second->seqNum); + ++non_spec_it; + } + + cprintf("\n"); +/* + ListOrderIt list_order_it = listOrder.begin(); + ListOrderIt list_order_end_it = listOrder.end(); + int i = 1; + + cprintf("List order: "); + + while (list_order_it != list_order_end_it) { + cprintf("%i OpClass:%i [sn:%lli] ", i, (*list_order_it).queueType, + (*list_order_it).oldestInst); + + ++list_order_it; + ++i; + } +*/ + 
cprintf("\n"); +} + + +template +void +InstQueue::dumpInsts() +{ + for (int i = 0; i < numThreads; ++i) { +// int num = 0; +// int valid_num = 0; +/* + ListIt inst_list_it = instList[i].begin(); + + while (inst_list_it != instList[i].end()) + { + cprintf("Instruction:%i\n", + num); + if (!(*inst_list_it)->isSquashed()) { + if (!(*inst_list_it)->isIssued()) { + ++valid_num; + cprintf("Count:%i\n", valid_num); + } else if ((*inst_list_it)->isMemRef() && + !(*inst_list_it)->memOpDone) { + // Loads that have not been marked as executed still count + // towards the total instructions. + ++valid_num; + cprintf("Count:%i\n", valid_num); + } + } + + cprintf("PC:%#x\n[sn:%lli]\n[tid:%i]\n" + "Issued:%i\nSquashed:%i\n", + (*inst_list_it)->readPC(), + (*inst_list_it)->seqNum, + (*inst_list_it)->threadNumber, + (*inst_list_it)->isIssued(), + (*inst_list_it)->isSquashed()); + + if ((*inst_list_it)->isMemRef()) { + cprintf("MemOpDone:%i\n", (*inst_list_it)->memOpDone); + } + + cprintf("\n"); + + inst_list_it++; + ++num; + } +*/ + } +} diff --git a/cpu/ozone/lsq_unit.cc b/cpu/ozone/lsq_unit.cc new file mode 100644 index 000000000..3ac51b87d --- /dev/null +++ b/cpu/ozone/lsq_unit.cc @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2004-2005 The Regents of The University of Michigan + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "cpu/ozone/ozone_impl.hh" +#include "cpu/ozone/lsq_unit_impl.hh" + +// Force the instantiation of LDSTQ for all the implementations we care about. +template class OzoneLSQ; + diff --git a/cpu/ozone/lsq_unit.hh b/cpu/ozone/lsq_unit.hh new file mode 100644 index 000000000..3c3e3988c --- /dev/null +++ b/cpu/ozone/lsq_unit.hh @@ -0,0 +1,632 @@ +/* + * Copyright (c) 2004-2005 The Regents of The University of Michigan + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __CPU_OZONE_LSQ_UNIT_HH__ +#define __CPU_OZONE_LSQ_UNIT_HH__ + +#include +#include +#include + +#include "arch/faults.hh" +#include "arch/isa_traits.hh" +#include "config/full_system.hh" +#include "base/hashmap.hh" +#include "cpu/inst_seq.hh" +#include "mem/mem_interface.hh" +//#include "mem/page_table.hh" +#include "sim/sim_object.hh" + +class PageTable; + +/** + * Class that implements the actual LQ and SQ for each specific thread. 
+ * Both are circular queues; load entries are freed upon committing, while + * store entries are freed once they writeback. The LSQUnit tracks if there + * are memory ordering violations, and also detects partial load to store + * forwarding cases (a store only has part of a load's data) that requires + * the load to wait until the store writes back. In the former case it + * holds onto the instruction until the dependence unit looks at it, and + * in the latter it stalls the LSQ until the store writes back. At that + * point the load is replayed. + */ +template +class OzoneLSQ { + public: + typedef typename Impl::Params Params; + typedef typename Impl::FullCPU FullCPU; + typedef typename Impl::BackEnd BackEnd; + typedef typename Impl::DynInstPtr DynInstPtr; + typedef typename Impl::IssueStruct IssueStruct; + + typedef TheISA::IntReg IntReg; + + typedef typename std::map::iterator LdMapIt; + + private: + class StoreCompletionEvent : public Event { + public: + /** Constructs a store completion event. */ + StoreCompletionEvent(int store_idx, Event *wb_event, OzoneLSQ *lsq_ptr); + + /** Processes the store completion event. */ + void process(); + + /** Returns the description of this event. */ + const char *description(); + + private: + /** The store index of the store being written back. */ + int storeIdx; + /** The writeback event for the store. Needed for store + * conditionals. + */ + Event *wbEvent; + /** The pointer to the LSQ unit that issued the store. */ + OzoneLSQ *lsqPtr; + }; + + friend class StoreCompletionEvent; + + public: + /** Constructs an LSQ unit. init() must be called prior to use. */ + OzoneLSQ(); + + /** Initializes the LSQ unit with the specified number of entries. */ + void init(Params *params, unsigned maxLQEntries, + unsigned maxSQEntries, unsigned id); + + /** Returns the name of the LSQ unit. */ + std::string name() const; + + /** Sets the CPU pointer. 
*/ + void setCPU(FullCPU *cpu_ptr) + { cpu = cpu_ptr; } + + /** Sets the back-end stage pointer. */ + void setBE(BackEnd *be_ptr) + { be = be_ptr; } + + /** Sets the page table pointer. */ + void setPageTable(PageTable *pt_ptr); + + /** Ticks the LSQ unit, which in this case only resets the number of + * used cache ports. + * @todo: Move the number of used ports up to the LSQ level so it can + * be shared by all LSQ units. + */ + void tick() { usedPorts = 0; } + + /** Inserts an instruction. */ + void insert(DynInstPtr &inst); + /** Inserts a load instruction. */ + void insertLoad(DynInstPtr &load_inst); + /** Inserts a store instruction. */ + void insertStore(DynInstPtr &store_inst); + + /** Executes a load instruction. */ + Fault executeLoad(DynInstPtr &inst); + + Fault executeLoad(int lq_idx); + /** Executes a store instruction. */ + Fault executeStore(DynInstPtr &inst); + + /** Commits the head load. */ + void commitLoad(); + /** Commits a specific load, given by the sequence number. */ + void commitLoad(InstSeqNum &inst); + /** Commits loads older than a specific sequence number. */ + void commitLoads(InstSeqNum &youngest_inst); + + /** Commits stores older than a specific sequence number. */ + void commitStores(InstSeqNum &youngest_inst); + + /** Writes back stores. */ + void writebackStores(); + + // @todo: Include stats in the LSQ unit. + //void regStats(); + + /** Clears all the entries in the LQ. */ + void clearLQ(); + + /** Clears all the entries in the SQ. */ + void clearSQ(); + + /** Resizes the LQ to a given size. */ + void resizeLQ(unsigned size); + + /** Resizes the SQ to a given size. */ + void resizeSQ(unsigned size); + + /** Squashes all instructions younger than a specific sequence number. */ + void squash(const InstSeqNum &squashed_num); + + /** Returns if there is a memory ordering violation. Value is reset upon + * call to getMemDepViolator(). + */ + bool violation() { return memDepViolator; } + + /** Returns the memory ordering violator. 
*/ + DynInstPtr getMemDepViolator(); + + /** Returns if a load became blocked due to the memory system. It clears + * the bool's value upon this being called. + */ + inline bool loadBlocked(); + + /** Returns the number of free entries (min of free LQ and SQ entries). */ + unsigned numFreeEntries(); + + /** Returns the number of loads ready to execute. */ + int numLoadsReady(); + + /** Returns the number of loads in the LQ. */ + int numLoads() { return loads; } + + /** Returns the number of stores in the SQ. */ + int numStores() { return stores; } + + /** Returns if either the LQ or SQ is full. */ + bool isFull() { return lqFull() || sqFull(); } + + /** Returns if the LQ is full. */ + bool lqFull() { return loads >= (LQEntries - 1); } + + /** Returns if the SQ is full. */ + bool sqFull() { return stores >= (SQEntries - 1); } + + /** Debugging function to dump instructions in the LSQ. */ + void dumpInsts(); + + /** Returns the number of instructions in the LSQ. */ + unsigned getCount() { return loads + stores; } + + /** Returns if there are any stores to writeback. */ + bool hasStoresToWB() { return storesToWB; } + + /** Returns the number of stores to writeback. */ + int numStoresToWB() { return storesToWB; } + + /** Returns if the LSQ unit will writeback on this cycle. */ + bool willWB() { return storeQueue[storeWBIdx].canWB && + !storeQueue[storeWBIdx].completed && + !dcacheInterface->isBlocked(); } + + private: + /** Completes the store at the specified index. */ + void completeStore(int store_idx); + + /** Increments the given store index (circular queue). */ + inline void incrStIdx(int &store_idx); + /** Decrements the given store index (circular queue). */ + inline void decrStIdx(int &store_idx); + /** Increments the given load index (circular queue). */ + inline void incrLdIdx(int &load_idx); + /** Decrements the given load index (circular queue). */ + inline void decrLdIdx(int &load_idx); + + private: + /** Pointer to the CPU. 
*/ + FullCPU *cpu; + + /** Pointer to the back-end stage. */ + BackEnd *be; + + /** Pointer to the D-cache. */ + MemInterface *dcacheInterface; + + /** Pointer to the page table. */ + PageTable *pTable; + + public: + struct SQEntry { + /** Constructs an empty store queue entry. */ + SQEntry() + : inst(NULL), req(NULL), size(0), data(0), + canWB(0), committed(0), completed(0) + { } + + /** Constructs a store queue entry for a given instruction. */ + SQEntry(DynInstPtr &_inst) + : inst(_inst), req(NULL), size(0), data(0), + canWB(0), committed(0), completed(0) + { } + + /** The store instruction. */ + DynInstPtr inst; + /** The memory request for the store. */ + MemReqPtr req; + /** The size of the store. */ + int size; + /** The store data. */ + IntReg data; + /** Whether or not the store can writeback. */ + bool canWB; + /** Whether or not the store is committed. */ + bool committed; + /** Whether or not the store is completed. */ + bool completed; + }; + + enum Status { + Running, + Idle, + DcacheMissStall, + DcacheMissSwitch + }; + + private: + /** The OzoneLSQ thread id. */ + unsigned lsqID; + + /** The status of the LSQ unit. */ + Status _status; + + /** The store queue. */ + std::vector storeQueue; + + /** The load queue. */ + std::vector loadQueue; + + // Consider making these 16 bits + /** The number of LQ entries. */ + unsigned LQEntries; + /** The number of SQ entries. */ + unsigned SQEntries; + + /** The number of load instructions in the LQ. */ + int loads; + /** The number of store instructions in the SQ (excludes those waiting to + * writeback). + */ + int stores; + /** The number of store instructions in the SQ waiting to writeback. */ + int storesToWB; + + /** The index of the head instruction in the LQ. */ + int loadHead; + /** The index of the tail instruction in the LQ. */ + int loadTail; + + /** The index of the head instruction in the SQ. 
*/ + int storeHead; + /** The index of the first instruction that is ready to be written back, + * and has not yet been written back. + */ + int storeWBIdx; + /** The index of the tail instruction in the SQ. */ + int storeTail; + + /// @todo Consider moving to a more advanced model with write vs read ports + /** The number of cache ports available each cycle. */ + int cachePorts; + + /** The number of used cache ports in this cycle. */ + int usedPorts; + + //list mshrSeqNums; + + //Stats::Scalar<> dcacheStallCycles; + Counter lastDcacheStall; + + /** Wire to read information from the issue stage time queue. */ + typename TimeBuffer::wire fromIssue; + + // Make these per thread? + /** Whether or not the LSQ is stalled. */ + bool stalled; + /** The store that causes the stall due to partial store to load + * forwarding. + */ + InstSeqNum stallingStoreIsn; + /** The index of the above store. */ + int stallingLoadIdx; + + /** Whether or not a load is blocked due to the memory system. It is + * cleared when this value is checked via loadBlocked(). + */ + bool isLoadBlocked; + + /** The oldest faulting load instruction. */ + DynInstPtr loadFaultInst; + /** The oldest faulting store instruction. */ + DynInstPtr storeFaultInst; + + /** The oldest load that caused a memory ordering violation. */ + DynInstPtr memDepViolator; + + // Will also need how many read/write ports the Dcache has. Or keep track + // of that in stage that is one level up, and only call executeLoad/Store + // the appropriate number of times. + + public: + /** Executes the load at the given index. */ + template + Fault read(MemReqPtr &req, T &data, int load_idx); + + /** Executes the store at the given index. */ + template + Fault write(MemReqPtr &req, T &data, int store_idx); + + /** Returns the index of the head load instruction. */ + int getLoadHead() { return loadHead; } + /** Returns the sequence number of the head load instruction. 
*/ + InstSeqNum getLoadHeadSeqNum() + { + if (loadQueue[loadHead]) { + return loadQueue[loadHead]->seqNum; + } else { + return 0; + } + + } + + /** Returns the index of the head store instruction. */ + int getStoreHead() { return storeHead; } + /** Returns the sequence number of the head store instruction. */ + InstSeqNum getStoreHeadSeqNum() + { + if (storeQueue[storeHead].inst) { + return storeQueue[storeHead].inst->seqNum; + } else { + return 0; + } + + } + + /** Returns whether or not the LSQ unit is stalled. */ + bool isStalled() { return stalled; } +}; + +template +template +Fault +OzoneLSQ::read(MemReqPtr &req, T &data, int load_idx) +{ + //Depending on issue2execute delay a squashed load could + //execute if it is found to be squashed in the same + //cycle it is scheduled to execute + assert(loadQueue[load_idx]); + + if (loadQueue[load_idx]->isExecuted()) { + panic("Should not reach this point with split ops!"); + + memcpy(&data,req->data,req->size); + + return NoFault; + } + + // Make sure this isn't an uncacheable access + // A bit of a hackish way to get uncached accesses to work only if they're + // at the head of the LSQ and are ready to commit (at the head of the ROB + // too). + // @todo: Fix uncached accesses. 
+ if (req->flags & UNCACHEABLE && + (load_idx != loadHead || !loadQueue[load_idx]->readyToCommit())) { + + return TheISA::genMachineCheckFault(); + } + + // Check the SQ for any previous stores that might lead to forwarding + int store_idx = loadQueue[load_idx]->sqIdx; + + int store_size = 0; + + DPRINTF(OzoneLSQ, "Read called, load idx: %i, store idx: %i, " + "storeHead: %i addr: %#x\n", + load_idx, store_idx, storeHead, req->paddr); + + while (store_idx != -1) { + // End once we've reached the top of the LSQ + if (store_idx == storeWBIdx) { + break; + } + + // Move the index to one younger + if (--store_idx < 0) + store_idx += SQEntries; + + assert(storeQueue[store_idx].inst); + + store_size = storeQueue[store_idx].size; + + if (store_size == 0) + continue; + + // Check if the store data is within the lower and upper bounds of + // addresses that the request needs. + bool store_has_lower_limit = + req->vaddr >= storeQueue[store_idx].inst->effAddr; + bool store_has_upper_limit = + (req->vaddr + req->size) <= (storeQueue[store_idx].inst->effAddr + + store_size); + bool lower_load_has_store_part = + req->vaddr < (storeQueue[store_idx].inst->effAddr + + store_size); + bool upper_load_has_store_part = + (req->vaddr + req->size) > storeQueue[store_idx].inst->effAddr; + + // If the store's data has all of the data needed, we can forward. + if (store_has_lower_limit && store_has_upper_limit) { + + int shift_amt = req->vaddr & (store_size - 1); + // Assumes byte addressing + shift_amt = shift_amt << 3; + + // Cast this to type T? 
+ data = storeQueue[store_idx].data >> shift_amt; + + req->cmd = Read; + assert(!req->completionEvent); + req->completionEvent = NULL; + req->time = curTick; + assert(!req->data); + req->data = new uint8_t[64]; + + memcpy(req->data, &data, req->size); + + DPRINTF(OzoneLSQ, "Forwarding from store idx %i to load to " + "addr %#x, data %#x\n", + store_idx, req->vaddr, *(req->data)); + + typename BackEnd::LdWritebackEvent *wb = + new typename BackEnd::LdWritebackEvent(loadQueue[load_idx], + be); + + // We'll say this has a 1 cycle load-store forwarding latency + // for now. + // FIXME - Need to make this a parameter. + wb->schedule(curTick); + + // Should keep track of stat for forwarded data + return NoFault; + } else if ((store_has_lower_limit && lower_load_has_store_part) || + (store_has_upper_limit && upper_load_has_store_part) || + (lower_load_has_store_part && upper_load_has_store_part)) { + // This is the partial store-load forwarding case where a store + // has only part of the load's data. + + // If it's already been written back, then don't worry about + // stalling on it. + if (storeQueue[store_idx].completed) { + continue; + } + + // Must stall load and force it to retry, so long as it's the oldest + // load that needs to do so. + if (!stalled || + (stalled && + loadQueue[load_idx]->seqNum < + loadQueue[stallingLoadIdx]->seqNum)) { + stalled = true; + stallingStoreIsn = storeQueue[store_idx].inst->seqNum; + stallingLoadIdx = load_idx; + } + + // Tell IQ/mem dep unit that this instruction will need to be + // rescheduled eventually + be->rescheduleMemInst(loadQueue[load_idx]); + + DPRINTF(OzoneLSQ, "Load-store forwarding mis-match. 
" + "Store idx %i to load addr %#x\n", + store_idx, req->vaddr); + + return NoFault; + } + } + + + // If there's no forwarding case, then go access memory + DynInstPtr inst = loadQueue[load_idx]; + + ++usedPorts; + + // if we have a cache, do cache access too + if (dcacheInterface) { + if (dcacheInterface->isBlocked()) { + isLoadBlocked = true; + // No fault occurred, even though the interface is blocked. + return NoFault; + } + + DPRINTF(OzoneLSQ, "D-cache: PC:%#x reading from paddr:%#x " + "vaddr:%#x flags:%i\n", + inst->readPC(), req->paddr, req->vaddr, req->flags); + + // Setup MemReq pointer + req->cmd = Read; + req->completionEvent = NULL; + req->time = curTick; + assert(!req->data); + req->data = new uint8_t[64]; + + assert(!req->completionEvent); + req->completionEvent = + new typename BackEnd::LdWritebackEvent(loadQueue[load_idx], be); + + // Do Cache Access + MemAccessResult result = dcacheInterface->access(req); + + // Ugly hack to get an event scheduled *only* if the access is + // a miss. We really should add first-class support for this + // at some point. 
+ // @todo: Probably should support having no events + if (result != MA_HIT) { + DPRINTF(OzoneLSQ, "D-cache miss!\n"); + DPRINTF(Activity, "Activity: ld accessing mem miss [sn:%lli]\n", + inst->seqNum); + + lastDcacheStall = curTick; + + _status = DcacheMissStall; + + } else { +// DPRINTF(Activity, "Activity: ld accessing mem hit [sn:%lli]\n", +// inst->seqNum); + + DPRINTF(OzoneLSQ, "D-cache hit!\n"); + } + } else { + fatal("Must use D-cache with new memory system"); + } + + return NoFault; +} + +template +template +Fault +OzoneLSQ::write(MemReqPtr &req, T &data, int store_idx) +{ + assert(storeQueue[store_idx].inst); + + DPRINTF(OzoneLSQ, "Doing write to store idx %i, addr %#x data %#x" + " | storeHead:%i [sn:%i]\n", + store_idx, req->paddr, data, storeHead, + storeQueue[store_idx].inst->seqNum); + + storeQueue[store_idx].req = req; + storeQueue[store_idx].size = sizeof(T); + storeQueue[store_idx].data = data; + + // This function only writes the data to the store queue, so no fault + // can happen here. + return NoFault; +} + +template +inline bool +OzoneLSQ::loadBlocked() +{ + bool ret_val = isLoadBlocked; + isLoadBlocked = false; + return ret_val; +} + +#endif // __CPU_OZONE_LSQ_UNIT_HH__ diff --git a/cpu/ozone/lsq_unit_impl.hh b/cpu/ozone/lsq_unit_impl.hh new file mode 100644 index 000000000..6c7977250 --- /dev/null +++ b/cpu/ozone/lsq_unit_impl.hh @@ -0,0 +1,846 @@ +/* + * Copyright (c) 2004-2005 The Regents of The University of Michigan + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "arch/isa_traits.hh" +#include "base/str.hh" +#include "cpu/ozone/lsq_unit.hh" + +template +OzoneLSQ::StoreCompletionEvent::StoreCompletionEvent(int store_idx, + Event *wb_event, + OzoneLSQ *lsq_ptr) + : Event(&mainEventQueue), + storeIdx(store_idx), + wbEvent(wb_event), + lsqPtr(lsq_ptr) +{ + this->setFlags(Event::AutoDelete); +} + +template +void +OzoneLSQ::StoreCompletionEvent::process() +{ + DPRINTF(OzoneLSQ, "Cache miss complete for store idx:%i\n", storeIdx); + + //lsqPtr->removeMSHR(lsqPtr->storeQueue[storeIdx].inst->seqNum); + +// lsqPtr->cpu->wakeCPU(); + if (wbEvent) + wbEvent->process(); + lsqPtr->completeStore(storeIdx); +} + +template +const char * +OzoneLSQ::StoreCompletionEvent::description() +{ + return "LSQ store completion event"; +} + +template +OzoneLSQ::OzoneLSQ() + : loads(0), stores(0), storesToWB(0), stalled(false), isLoadBlocked(false) +{ +} + +template +void +OzoneLSQ::init(Params *params, unsigned maxLQEntries, + unsigned maxSQEntries, unsigned id) + +{ + DPRINTF(OzoneLSQ, "Creating OzoneLSQ%i object.\n",id); + + lsqID = id; + + LQEntries = maxLQEntries; + SQEntries = maxSQEntries; + + loadQueue.resize(LQEntries); + storeQueue.resize(SQEntries); + + + // May want to initialize these entries to NULL + + loadHead = loadTail = 0; + + storeHead = storeWBIdx = storeTail = 0; + + usedPorts = 0; + cachePorts = params->cachePorts; + + dcacheInterface = params->dcacheInterface; + + loadFaultInst = storeFaultInst = memDepViolator = NULL; +} + +template +std::string +OzoneLSQ::name() const +{ + return "lsqunit"; +} + +template +void +OzoneLSQ::clearLQ() +{ + loadQueue.clear(); +} + +template +void +OzoneLSQ::clearSQ() +{ + storeQueue.clear(); +} + +template +void +OzoneLSQ::setPageTable(PageTable *pt_ptr) +{ + DPRINTF(OzoneLSQ, "Setting the page table pointer.\n"); + pTable = pt_ptr; +} + +template +void +OzoneLSQ::resizeLQ(unsigned size) +{ + assert( size >= LQEntries); + + if (size > LQEntries) { + while (size > loadQueue.size()) 
{ + DynInstPtr dummy; + loadQueue.push_back(dummy); + LQEntries++; + } + } else { + LQEntries = size; + } + +} + +template +void +OzoneLSQ::resizeSQ(unsigned size) +{ + if (size > SQEntries) { + while (size > storeQueue.size()) { + SQEntry dummy; + storeQueue.push_back(dummy); + SQEntries++; + } + } else { + SQEntries = size; + } +} + +template +void +OzoneLSQ::insert(DynInstPtr &inst) +{ + // Make sure we really have a memory reference. + assert(inst->isMemRef()); + + // Make sure it's one of the two classes of memory references. + assert(inst->isLoad() || inst->isStore()); + + if (inst->isLoad()) { + insertLoad(inst); + } else { + insertStore(inst); + } + +// inst->setInLSQ(); +} + +template +void +OzoneLSQ::insertLoad(DynInstPtr &load_inst) +{ + assert((loadTail + 1) % LQEntries != loadHead && loads < LQEntries); + + DPRINTF(OzoneLSQ, "Inserting load PC %#x, idx:%i [sn:%lli]\n", + load_inst->readPC(), loadTail, load_inst->seqNum); + + load_inst->lqIdx = loadTail; + + if (stores == 0) { + load_inst->sqIdx = -1; + } else { + load_inst->sqIdx = storeTail; + } + + loadQueue[loadTail] = load_inst; + + incrLdIdx(loadTail); + + ++loads; +} + +template +void +OzoneLSQ::insertStore(DynInstPtr &store_inst) +{ + // Make sure it is not full before inserting an instruction. 
+ assert((storeTail + 1) % SQEntries != storeHead); + assert(stores < SQEntries); + + DPRINTF(OzoneLSQ, "Inserting store PC %#x, idx:%i [sn:%lli]\n", + store_inst->readPC(), storeTail, store_inst->seqNum); + + store_inst->sqIdx = storeTail; + store_inst->lqIdx = loadTail; + + storeQueue[storeTail] = SQEntry(store_inst); + + incrStIdx(storeTail); + + ++stores; + +} + +template +typename Impl::DynInstPtr +OzoneLSQ::getMemDepViolator() +{ + DynInstPtr temp = memDepViolator; + + memDepViolator = NULL; + + return temp; +} + +template +unsigned +OzoneLSQ::numFreeEntries() +{ + unsigned free_lq_entries = LQEntries - loads; + unsigned free_sq_entries = SQEntries - stores; + + // Both the LQ and SQ entries have an extra dummy entry to differentiate + // empty/full conditions. Subtract 1 from the free entries. + if (free_lq_entries < free_sq_entries) { + return free_lq_entries - 1; + } else { + return free_sq_entries - 1; + } +} + +template +int +OzoneLSQ::numLoadsReady() +{ + int load_idx = loadHead; + int retval = 0; + + while (load_idx != loadTail) { + assert(loadQueue[load_idx]); + + if (loadQueue[load_idx]->readyToIssue()) { + ++retval; + } + } + + return retval; +} + +#if 0 +template +Fault +OzoneLSQ::executeLoad() +{ + Fault load_fault = NoFault; + DynInstPtr load_inst; + + assert(readyLoads.size() != 0); + + // Execute a ready load. + LdMapIt ready_it = readyLoads.begin(); + + load_inst = (*ready_it).second; + + // Execute the instruction, which is held in the data portion of the + // iterator. + load_fault = load_inst->execute(); + + // If it executed successfully, then switch it over to the executed + // loads list. + if (load_fault == NoFault) { + executedLoads[load_inst->seqNum] = load_inst; + + readyLoads.erase(ready_it); + } else { + loadFaultInst = load_inst; + } + + return load_fault; +} +#endif + +template +Fault +OzoneLSQ::executeLoad(DynInstPtr &inst) +{ + // Execute a specific load. 
+ Fault load_fault = NoFault; + + DPRINTF(OzoneLSQ, "Executing load PC %#x, [sn:%lli]\n", + inst->readPC(),inst->seqNum); + + // Make sure it's really in the list. + // Normally it should always be in the list. However, + /* due to a syscall it may not be the list. +#ifdef DEBUG + int i = loadHead; + while (1) { + if (i == loadTail && !find(inst)) { + assert(0 && "Load not in the queue!"); + } else if (loadQueue[i] == inst) { + break; + } + + i = i + 1; + if (i >= LQEntries) { + i = 0; + } + } +#endif // DEBUG*/ + + load_fault = inst->initiateAcc(); + + // Might want to make sure that I'm not overwriting a previously faulting + // instruction that hasn't been checked yet. + // Actually probably want the oldest faulting load + if (load_fault != NoFault) { + // Maybe just set it as can commit here, although that might cause + // some other problems with sending traps to the ROB too quickly. +// iewStage->instToCommit(inst); +// iewStage->activityThisCycle(); + } + + return load_fault; +} + +template +Fault +OzoneLSQ::executeLoad(int lq_idx) +{ + // Very hackish. Not sure the best way to check that this + // instruction is at the head of the ROB. I should have some sort + // of extra information here so that I'm not overloading the + // canCommit signal for 15 different things. + loadQueue[lq_idx]->setCanCommit(); + Fault ret_fault = executeLoad(loadQueue[lq_idx]); + loadQueue[lq_idx]->clearCanCommit(); + return ret_fault; +} + +template +Fault +OzoneLSQ::executeStore(DynInstPtr &store_inst) +{ + // Make sure that a store exists. + assert(stores != 0); + + int store_idx = store_inst->sqIdx; + + DPRINTF(OzoneLSQ, "Executing store PC %#x [sn:%lli]\n", + store_inst->readPC(), store_inst->seqNum); + + // Check the recently completed loads to see if any match this store's + // address. If so, then we have a memory ordering violation. + int load_idx = store_inst->lqIdx; + + Fault store_fault = store_inst->initiateAcc(); + + // Store size should now be available. 
Use it to get proper offset for + // addr comparisons. + int size = storeQueue[store_idx].size; + + if (size == 0) { + DPRINTF(OzoneLSQ,"Fault on Store PC %#x, [sn:%lli],Size = 0\n", + store_inst->readPC(),store_inst->seqNum); + + return store_fault; + } + + assert(store_fault == NoFault); + + if (!storeFaultInst) { + if (store_fault != NoFault) { + panic("Fault in a store instruction!"); + storeFaultInst = store_inst; + } else if (store_inst->isNonSpeculative()) { + // Nonspeculative accesses (namely store conditionals) + // need to set themselves as able to writeback if we + // haven't had a fault by here. + storeQueue[store_idx].canWB = true; + + ++storesToWB; + } + } + + if (!memDepViolator) { + while (load_idx != loadTail) { + // Actually should only check loads that have actually executed + // Might be safe because effAddr is set to InvalAddr when the + // dyn inst is created. + + // Must actually check all addrs in the proper size range + // Which is more correct than needs to be. What if for now we just + // assume all loads are quad-word loads, and do the addr based + // on that. + // @todo: Fix this, magic number being used here + if ((loadQueue[load_idx]->effAddr >> 8) == + (store_inst->effAddr >> 8)) { + // A load incorrectly passed this store. Squash and refetch. + // For now return a fault to show that it was unsuccessful. + memDepViolator = loadQueue[load_idx]; + + return TheISA::genMachineCheckFault(); + } + + incrLdIdx(load_idx); + } + + // If we've reached this point, there was no violation. 
+ memDepViolator = NULL; + } + + return store_fault; +} + +template +void +OzoneLSQ::commitLoad() +{ + assert(loadQueue[loadHead]); + + DPRINTF(OzoneLSQ, "[sn:%lli] Committing head load instruction, PC %#x\n", + loadQueue[loadHead]->seqNum, loadQueue[loadHead]->readPC()); + + + loadQueue[loadHead] = NULL; + + incrLdIdx(loadHead); + + --loads; +} + +template +void +OzoneLSQ::commitLoad(InstSeqNum &inst) +{ + // Hopefully I don't use this function too much + panic("Don't use this function!"); + + int i = loadHead; + while (1) { + if (i == loadTail) { + assert(0 && "Load not in the queue!"); + } else if (loadQueue[i]->seqNum == inst) { + break; + } + + ++i; + if (i >= LQEntries) { + i = 0; + } + } + +// loadQueue[i]->removeInLSQ(); + loadQueue[i] = NULL; + --loads; +} + +template +void +OzoneLSQ::commitLoads(InstSeqNum &youngest_inst) +{ + assert(loads == 0 || loadQueue[loadHead]); + + while (loads != 0 && loadQueue[loadHead]->seqNum <= youngest_inst) { + commitLoad(); + } +} + +template +void +OzoneLSQ::commitStores(InstSeqNum &youngest_inst) +{ + assert(stores == 0 || storeQueue[storeHead].inst); + + int store_idx = storeHead; + + while (store_idx != storeTail) { + assert(storeQueue[store_idx].inst); + if (!storeQueue[store_idx].canWB) { + if (storeQueue[store_idx].inst->seqNum > youngest_inst) { + break; + } + DPRINTF(OzoneLSQ, "Marking store as able to write back, PC " + "%#x [sn:%lli]\n", + storeQueue[store_idx].inst->readPC(), + storeQueue[store_idx].inst->seqNum); + + storeQueue[store_idx].canWB = true; + +// --stores; + ++storesToWB; + } + + incrStIdx(store_idx); + } +} + +template +void +OzoneLSQ::writebackStores() +{ + while (storesToWB > 0 && + storeWBIdx != storeTail && + storeQueue[storeWBIdx].inst && + storeQueue[storeWBIdx].canWB && + usedPorts < cachePorts) { + + if (storeQueue[storeWBIdx].size == 0) { + completeStore(storeWBIdx); + + incrStIdx(storeWBIdx); + + continue; + } + + if (dcacheInterface && dcacheInterface->isBlocked()) { + DPRINTF(OzoneLSQ, 
"Unable to write back any more stores, cache" + " is blocked!\n"); + break; + } + + ++usedPorts; + + if (storeQueue[storeWBIdx].inst->isDataPrefetch()) { + incrStIdx(storeWBIdx); + + continue; + } + + assert(storeQueue[storeWBIdx].req); + assert(!storeQueue[storeWBIdx].committed); + + MemReqPtr req = storeQueue[storeWBIdx].req; + storeQueue[storeWBIdx].committed = true; + +// Fault fault = cpu->translateDataReadReq(req); + req->cmd = Write; + req->completionEvent = NULL; + req->time = curTick; + assert(!req->data); + req->data = new uint8_t[64]; + memcpy(req->data, (uint8_t *)&storeQueue[storeWBIdx].data, req->size); + + DPRINTF(OzoneLSQ, "D-Cache: Writing back store idx:%i PC:%#x " + "to Addr:%#x, data:%#x [sn:%lli]\n", + storeWBIdx,storeQueue[storeWBIdx].inst->readPC(), + req->paddr, *(req->data), + storeQueue[storeWBIdx].inst->seqNum); + +// if (fault != NoFault) { + //What should we do if there is a fault??? + //for now panic +// panic("Page Table Fault!!!!!\n"); +// } + + if (dcacheInterface) { + MemAccessResult result = dcacheInterface->access(req); + + //@todo temp fix for LL/SC (works fine for 1 CPU) + if (req->flags & LOCKED) { + req->result=1; + panic("LL/SC! oh no no support!!!"); + } + + if (isStalled() && + storeQueue[storeWBIdx].inst->seqNum == stallingStoreIsn) { + DPRINTF(OzoneLSQ, "Unstalling, stalling store [sn:%lli] " + "load idx:%i\n", + stallingStoreIsn, stallingLoadIdx); + stalled = false; + stallingStoreIsn = 0; + be->replayMemInst(loadQueue[stallingLoadIdx]); + } + + if (result != MA_HIT && dcacheInterface->doEvents()) { + Event *wb = NULL; +/* + typename IEW::LdWritebackEvent *wb = NULL; + if (req->flags & LOCKED) { + // Stx_C does not generate a system port transaction. 
+ req->result=0; + wb = new typename IEW::LdWritebackEvent(storeQueue[storeWBIdx].inst, + iewStage); + } +*/ + DPRINTF(OzoneLSQ,"D-Cache Write Miss!\n"); + +// DPRINTF(Activity, "Active st accessing mem miss [sn:%lli]\n", +// storeQueue[storeWBIdx].inst->seqNum); + + // Will stores need their own kind of writeback events? + // Do stores even need writeback events? + assert(!req->completionEvent); + req->completionEvent = new + StoreCompletionEvent(storeWBIdx, wb, this); + + lastDcacheStall = curTick; + + _status = DcacheMissStall; + + //mshrSeqNums.push_back(storeQueue[storeWBIdx].inst->seqNum); + + //DPRINTF(OzoneLSQ, "Added MSHR. count = %i\n",mshrSeqNums.size()); + + // Increment stat here or something + } else { + DPRINTF(OzoneLSQ,"D-Cache: Write Hit on idx:%i !\n", + storeWBIdx); + +// DPRINTF(Activity, "Active st accessing mem hit [sn:%lli]\n", +// storeQueue[storeWBIdx].inst->seqNum); + + if (req->flags & LOCKED) { + // Stx_C does not generate a system port transaction. + req->result=1; + typename BackEnd::LdWritebackEvent *wb = + new typename BackEnd::LdWritebackEvent(storeQueue[storeWBIdx].inst, + be); + wb->schedule(curTick); + } + + completeStore(storeWBIdx); + } + + incrStIdx(storeWBIdx); + } else { + panic("Must HAVE DCACHE!!!!!\n"); + } + } + + // Not sure this should set it to 0. + usedPorts = 0; + + assert(stores >= 0 && storesToWB >= 0); +} + +/*template +void +OzoneLSQ::removeMSHR(InstSeqNum seqNum) +{ + list::iterator mshr_it = find(mshrSeqNums.begin(), + mshrSeqNums.end(), + seqNum); + + if (mshr_it != mshrSeqNums.end()) { + mshrSeqNums.erase(mshr_it); + DPRINTF(OzoneLSQ, "Removing MSHR. count = %i\n",mshrSeqNums.size()); + } +}*/ + +template +void +OzoneLSQ::squash(const InstSeqNum &squashed_num) +{ + DPRINTF(OzoneLSQ, "Squashing until [sn:%lli]!" 
+ "(Loads:%i Stores:%i)\n",squashed_num,loads,stores); + + int load_idx = loadTail; + decrLdIdx(load_idx); + + while (loads != 0 && loadQueue[load_idx]->seqNum > squashed_num) { + + // Clear the smart pointer to make sure it is decremented. + DPRINTF(OzoneLSQ,"Load Instruction PC %#x squashed, " + "[sn:%lli]\n", + loadQueue[load_idx]->readPC(), + loadQueue[load_idx]->seqNum); + + if (isStalled() && load_idx == stallingLoadIdx) { + stalled = false; + stallingStoreIsn = 0; + stallingLoadIdx = 0; + } + + loadQueue[load_idx]->squashed = true; + loadQueue[load_idx] = NULL; + --loads; + + // Inefficient! + loadTail = load_idx; + + decrLdIdx(load_idx); + } + + int store_idx = storeTail; + decrStIdx(store_idx); + + while (stores != 0 && storeQueue[store_idx].inst->seqNum > squashed_num) { + + // Clear the smart pointer to make sure it is decremented. + DPRINTF(OzoneLSQ,"Store Instruction PC %#x squashed, " + "idx:%i [sn:%lli]\n", + storeQueue[store_idx].inst->readPC(), + store_idx, storeQueue[store_idx].inst->seqNum); + + // I don't think this can happen. It should have been cleared by the + // stalling load. + if (isStalled() && + storeQueue[store_idx].inst->seqNum == stallingStoreIsn) { + panic("Is stalled should have been cleared by stalling load!\n"); + stalled = false; + stallingStoreIsn = 0; + } + + storeQueue[store_idx].inst->squashed = true; + storeQueue[store_idx].inst = NULL; + storeQueue[store_idx].canWB = 0; + + if (storeQueue[store_idx].req) { + assert(!storeQueue[store_idx].req->completionEvent); + } + storeQueue[store_idx].req = NULL; + --stores; + + // Inefficient! 
+ storeTail = store_idx; + + decrStIdx(store_idx); + } +} + +template +void +OzoneLSQ::dumpInsts() +{ + cprintf("Load store queue: Dumping instructions.\n"); + cprintf("Load queue size: %i\n", loads); + cprintf("Load queue: "); + + int load_idx = loadHead; + + while (load_idx != loadTail && loadQueue[load_idx]) { + cprintf("[sn:%lli] %#x ", loadQueue[load_idx]->seqNum, + loadQueue[load_idx]->readPC()); + + incrLdIdx(load_idx); + } + + cprintf("\nStore queue size: %i\n", stores); + cprintf("Store queue: "); + + int store_idx = storeHead; + + while (store_idx != storeTail && storeQueue[store_idx].inst) { + cprintf("[sn:%lli] %#x ", storeQueue[store_idx].inst->seqNum, + storeQueue[store_idx].inst->readPC()); + + incrStIdx(store_idx); + } + + cprintf("\n"); +} + +template +void +OzoneLSQ::completeStore(int store_idx) +{ + assert(storeQueue[store_idx].inst); + storeQueue[store_idx].completed = true; + --storesToWB; + // A bit conservative because a store completion may not free up entries, + // but hopefully avoids two store completions in one cycle from making + // the CPU tick twice. 
+// cpu->activityThisCycle(); + + if (store_idx == storeHead) { + do { + incrStIdx(storeHead); + + --stores; + } while (storeQueue[storeHead].completed && + storeHead != storeTail); + +// be->updateLSQNextCycle = true; + } + + DPRINTF(OzoneLSQ, "Store head idx:%i\n", storeHead); + + if (isStalled() && + storeQueue[store_idx].inst->seqNum == stallingStoreIsn) { + DPRINTF(OzoneLSQ, "Unstalling, stalling store [sn:%lli] " + "load idx:%i\n", + stallingStoreIsn, stallingLoadIdx); + stalled = false; + stallingStoreIsn = 0; + be->replayMemInst(loadQueue[stallingLoadIdx]); + } +} + +template +inline void +OzoneLSQ::incrStIdx(int &store_idx) +{ + if (++store_idx >= SQEntries) + store_idx = 0; +} + +template +inline void +OzoneLSQ::decrStIdx(int &store_idx) +{ + if (--store_idx < 0) + store_idx += SQEntries; +} + +template +inline void +OzoneLSQ::incrLdIdx(int &load_idx) +{ + if (++load_idx >= LQEntries) + load_idx = 0; +} + +template +inline void +OzoneLSQ::decrLdIdx(int &load_idx) +{ + if (--load_idx < 0) + load_idx += LQEntries; +} diff --git a/cpu/ozone/null_predictor.hh b/cpu/ozone/null_predictor.hh new file mode 100644 index 000000000..d19e2cd1c --- /dev/null +++ b/cpu/ozone/null_predictor.hh @@ -0,0 +1,76 @@ + +#ifndef __CPU_OZONE_NULL_PREDICTOR_HH__ +#define __CPU_OZONE_NULL_PREDICTOR_HH__ + +#include "arch/isa_traits.hh" +#include "cpu/inst_seq.hh" + +template +class NullPredictor +{ + public: + typedef typename Impl::Params Params; + typedef typename Impl::DynInstPtr DynInstPtr; + + NullPredictor(Params *p) { } + + struct BPredInfo { + BPredInfo() + : PC(0), nextPC(0) + { } + + BPredInfo(const Addr &pc, const Addr &next_pc) + : PC(pc), nextPC(next_pc) + { } + + Addr PC; + Addr nextPC; + }; + + BPredInfo lookup(Addr &PC) { return BPredInfo(PC, PC+4); } + + void undo(BPredInfo &bp_info) { return; } + + /** + * Predicts whether or not the instruction is a taken branch, and the + * target of the branch if it is taken. + * @param inst The branch instruction. 
+ * @param PC The predicted PC is passed back through this parameter. + * @param tid The thread id. + * @return Returns if the branch is taken or not. + */ + bool predict(DynInstPtr &inst, Addr &PC, unsigned tid) + { return false; } + + /** + * Tells the branch predictor to commit any updates until the given + * sequence number. + * @param done_sn The sequence number to commit any older updates up until. + * @param tid The thread id. + */ + void update(const InstSeqNum &done_sn, unsigned tid) { } + + /** + * Squashes all outstanding updates until a given sequence number. + * @param squashed_sn The sequence number to squash any younger updates up + * until. + * @param tid The thread id. + */ + void squash(const InstSeqNum &squashed_sn, unsigned tid) { } + + /** + * Squashes all outstanding updates until a given sequence number, and + * corrects that sn's update with the proper address and taken/not taken. + * @param squashed_sn The sequence number to squash any younger updates up + * until. + * @param corr_target The correct branch target. + * @param actually_taken The correct branch direction. + * @param tid The thread id. + */ + void squash(const InstSeqNum &squashed_sn, const Addr &corr_target, + bool actually_taken, unsigned tid) + { } + +}; + +#endif // __CPU_OZONE_NULL_PREDICTOR_HH__ diff --git a/cpu/ozone/ozone_impl.hh b/cpu/ozone/ozone_impl.hh new file mode 100644 index 000000000..a2c706c60 --- /dev/null +++ b/cpu/ozone/ozone_impl.hh @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2006 The Regents of The University of Michigan + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __CPU_OZONE_OZONE_IMPL_HH__ +#define __CPU_OZONE_OZONE_IMPL_HH__ + +#include "arch/alpha/isa_traits.hh" +#include "cpu/o3/bpred_unit.hh" +#include "cpu/ozone/back_end.hh" +#include "cpu/ozone/front_end.hh" +#include "cpu/ozone/inst_queue.hh" +#include "cpu/ozone/lsq_unit.hh" +#include "cpu/ozone/null_predictor.hh" +#include "cpu/ozone/dyn_inst.hh" +#include "cpu/ozone/simple_params.hh" + +template +class OzoneCPU; + +template +class OzoneDynInst; + +struct OzoneImpl { + typedef SimpleParams Params; + typedef OzoneCPU OzoneCPU; + typedef OzoneCPU FullCPU; + + // Would like to put these into their own area. +// typedef NullPredictor BranchPred; + typedef TwobitBPredUnit BranchPred; + typedef FrontEnd FrontEnd; + // Will need IQ, LSQ eventually + typedef BackEnd BackEnd; + + typedef InstQueue InstQueue; + typedef OzoneLSQ LdstQueue; + + typedef OzoneDynInst DynInst; + typedef RefCountingPtr DynInstPtr; + + typedef uint64_t IssueStruct; + + enum { + MaxThreads = 1 + }; +}; + +#endif // __CPU_OZONE_OZONE_IMPL_HH__ diff --git a/cpu/ozone/rename_table.cc b/cpu/ozone/rename_table.cc new file mode 100644 index 000000000..fff41903e --- /dev/null +++ b/cpu/ozone/rename_table.cc @@ -0,0 +1,7 @@ + +#include "cpu/ozone/rename_table_impl.hh" +#include "cpu/ozone/ozone_impl.hh" +#include "cpu/ozone/simple_impl.hh" + +template class RenameTable; +template class RenameTable; diff --git a/cpu/ozone/rename_table.hh b/cpu/ozone/rename_table.hh new file mode 100644 index 000000000..afbf6ff32 --- /dev/null +++ b/cpu/ozone/rename_table.hh @@ -0,0 +1,25 @@ +#ifndef __CPU_OZONE_RENAME_TABLE_HH__ +#define __CPU_OZONE_RENAME_TABLE_HH__ + +#include "arch/isa_traits.hh" + +/** Rename table that holds the rename of each architectural register to + * producing DynInst. Needs to support copying from one table to another. 
+ */ + +template +class RenameTable { + public: + typedef typename Impl::DynInstPtr DynInstPtr; + + RenameTable(); + + void copyFrom(const RenameTable &table_to_copy); + + DynInstPtr &operator [] (int index) + { return table[index]; } + + DynInstPtr table[TheISA::TotalNumRegs]; +}; + +#endif // __CPU_OZONE_RENAME_TABLE_HH__ diff --git a/cpu/ozone/rename_table_impl.hh b/cpu/ozone/rename_table_impl.hh new file mode 100644 index 000000000..86fc1cc55 --- /dev/null +++ b/cpu/ozone/rename_table_impl.hh @@ -0,0 +1,23 @@ + +#include // Not really sure what to include to get NULL +#include "cpu/ozone/rename_table.hh" + +template +RenameTable::RenameTable() +{ + // Actually should set these to dummy dyn insts that have the initial value + // and force their values to be initialized. This keeps everything the + // same. + for (int i = 0; i < TheISA::TotalNumRegs; ++i) { + table[i] = NULL; + } +} + +template +void +RenameTable::copyFrom(const RenameTable &table_to_copy) +{ + for (int i = 0; i < TheISA::TotalNumRegs; ++i) { + table[i] = table_to_copy.table[i]; + } +} diff --git a/cpu/ozone/simple_impl.hh b/cpu/ozone/simple_impl.hh new file mode 100644 index 000000000..961bf2ea9 --- /dev/null +++ b/cpu/ozone/simple_impl.hh @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2006 The Regents of The University of Michigan + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __CPU_OZONE_SIMPLE_IMPL_HH__ +#define __CPU_OZONE_SIMPLE_IMPL_HH__ + +#include "arch/isa_traits.hh" +#include "cpu/o3/bpred_unit.hh" +#include "cpu/ozone/cpu.hh" +#include "cpu/ozone/front_end.hh" +#include "cpu/ozone/inorder_back_end.hh" +#include "cpu/ozone/null_predictor.hh" +#include "cpu/ozone/dyn_inst.hh" +#include "cpu/ozone/simple_params.hh" + +//template +//class OzoneCPU; + +template +class OzoneDynInst; + +struct SimpleImpl { + typedef SimpleParams Params; + typedef OzoneCPU OzoneCPU; + typedef OzoneCPU FullCPU; + + // Would like to put these into their own area. +// typedef NullPredictor BranchPred; + typedef TwobitBPredUnit BranchPred; + typedef FrontEnd FrontEnd; + // Will need IQ, LSQ eventually + typedef InorderBackEnd BackEnd; + + typedef OzoneDynInst DynInst; + typedef RefCountingPtr DynInstPtr; + + typedef uint64_t IssueStruct; + + enum { + MaxThreads = 1 + }; +}; + +#endif // __CPU_OZONE_SIMPLE_IMPL_HH__ diff --git a/cpu/ozone/simple_params.hh b/cpu/ozone/simple_params.hh new file mode 100644 index 000000000..e503654aa --- /dev/null +++ b/cpu/ozone/simple_params.hh @@ -0,0 +1,164 @@ + + +#ifndef __CPU_OZONE_SIMPLE_PARAMS_HH__ +#define __CPU_OZONE_SIMPLE_PARAMS_HH__ + +#include "cpu/ozone/cpu.hh" + +//Forward declarations +class AlphaDTB; +class AlphaITB; +class FUPool; +class FunctionalMemory; +class MemInterface; +class PageTable; +class Process; +class System; + +/** + * This file defines the parameters that will be used for the OzoneCPU. + * This must be defined externally so that the Impl can have a params class + * defined that it can pass to all of the individual stages. 
+ */ + +class SimpleParams : public BaseCPU::Params +{ + public: + +#if FULL_SYSTEM + AlphaITB *itb; AlphaDTB *dtb; +#else + std::vector workload; +// Process *process; +#endif // FULL_SYSTEM + + //Page Table + PageTable *pTable; + + FunctionalMemory *mem; + + // + // Caches + // + MemInterface *icacheInterface; + MemInterface *dcacheInterface; + + unsigned cachePorts; + unsigned width; + unsigned frontEndWidth; + unsigned backEndWidth; + unsigned backEndSquashLatency; + unsigned backEndLatency; + unsigned maxInstBufferSize; + unsigned numPhysicalRegs; + // + // Fetch + // + unsigned decodeToFetchDelay; + unsigned renameToFetchDelay; + unsigned iewToFetchDelay; + unsigned commitToFetchDelay; + unsigned fetchWidth; + + // + // Decode + // + unsigned renameToDecodeDelay; + unsigned iewToDecodeDelay; + unsigned commitToDecodeDelay; + unsigned fetchToDecodeDelay; + unsigned decodeWidth; + + // + // Rename + // + unsigned iewToRenameDelay; + unsigned commitToRenameDelay; + unsigned decodeToRenameDelay; + unsigned renameWidth; + + // + // IEW + // + unsigned commitToIEWDelay; + unsigned renameToIEWDelay; + unsigned issueToExecuteDelay; + unsigned issueWidth; + unsigned executeWidth; + unsigned executeIntWidth; + unsigned executeFloatWidth; + unsigned executeBranchWidth; + unsigned executeMemoryWidth; + FUPool *fuPool; + + // + // Commit + // + unsigned iewToCommitDelay; + unsigned renameToROBDelay; + unsigned commitWidth; + unsigned squashWidth; + + // + // Branch predictor (BP & BTB) + // + unsigned localPredictorSize; + unsigned localCtrBits; + unsigned localHistoryTableSize; + unsigned localHistoryBits; + unsigned globalPredictorSize; + unsigned globalCtrBits; + unsigned globalHistoryBits; + unsigned choicePredictorSize; + unsigned choiceCtrBits; + + unsigned BTBEntries; + unsigned BTBTagSize; + + unsigned RASSize; + + // + // Load store queue + // + unsigned LQEntries; + unsigned SQEntries; + + // + // Memory dependence + // + unsigned SSITSize; + unsigned LFSTSize; 
+ + // + // Miscellaneous + // + unsigned numPhysIntRegs; + unsigned numPhysFloatRegs; + unsigned numIQEntries; + unsigned numROBEntries; + + bool decoupledFrontEnd; + int dispatchWidth; + int wbWidth; + + //SMT Parameters + unsigned smtNumFetchingThreads; + + std::string smtFetchPolicy; + + std::string smtIQPolicy; + unsigned smtIQThreshold; + + std::string smtLSQPolicy; + unsigned smtLSQThreshold; + + std::string smtCommitPolicy; + + std::string smtROBPolicy; + unsigned smtROBThreshold; + + // Probably can get this from somewhere. + unsigned instShiftAmt; +}; + +#endif // __CPU_OZONE_SIMPLE_PARAMS_HH__ diff --git a/cpu/ozone/thread_state.hh b/cpu/ozone/thread_state.hh new file mode 100644 index 000000000..c6d23a63b --- /dev/null +++ b/cpu/ozone/thread_state.hh @@ -0,0 +1,171 @@ + +#ifndef __CPU_OZONE_THREAD_STATE_HH__ +#define __CPU_OZONE_THREAD_STATE_HH__ + +#include "arch/faults.hh" +#include "arch/isa_traits.hh" +#include "cpu/exec_context.hh" +#include "cpu/thread_state.hh" + +class Event; +class Process; + +#if FULL_SYSTEM +class EndQuiesceEvent; +class FunctionProfile; +class ProfileNode; +#else +class Process; +class FunctionalMemory; +#endif + +// Maybe this ozone thread state should only really have committed state? +// I need to think about why I'm using this and what it's useful for. Clearly +// has benefits for SMT; basically serves same use as CPUExecContext. +// Makes the ExecContext proxy easier. Gives organization/central access point +// to state of a thread that can be accessed normally (i.e. not in-flight +// stuff within a OoO processor). Does this need an XC proxy within it? 
+template +struct OzoneThreadState : public ThreadState { + typedef typename ExecContext::Status Status; + typedef typename Impl::FullCPU FullCPU; + typedef TheISA::MiscReg MiscReg; + +#if FULL_SYSTEM + OzoneThreadState(FullCPU *_cpu, int _thread_num, FunctionalMemory *_mem) + : ThreadState(-1, _thread_num, _mem), + inSyscall(0), trapPending(0) + { + memset(®s, 0, sizeof(TheISA::RegFile)); + } +#else + OzoneThreadState(FullCPU *_cpu, int _thread_num, Process *_process, int _asid) + : ThreadState(-1, _thread_num, NULL, _process, _asid), + cpu(_cpu), inSyscall(0), trapPending(0) + { + memset(®s, 0, sizeof(TheISA::RegFile)); + } + + OzoneThreadState(FullCPU *_cpu, int _thread_num, FunctionalMemory *_mem, + int _asid) + : ThreadState(-1, _thread_num, _mem, NULL, _asid), + cpu(_cpu), inSyscall(0), trapPending(0) + { + memset(®s, 0, sizeof(TheISA::RegFile)); + } +#endif + + Status _status; + + Status status() const { return _status; } + + void setStatus(Status new_status) { _status = new_status; } + + RenameTable renameTable; // Should I include backend and frontend + // tables here? For the ozone CPU, maybe, for the new full CPU, probably + // not...you wouldn't want threads just accessing the backend/frontend + // rename tables. + Addr PC; // What should these be set to? Probably the committed ones. + Addr nextPC; + + // Current instruction? + TheISA::MachInst inst; + + TheISA::RegFile regs; + // Front end? Back end? 
+// MemReqPtr memReq; + + typename Impl::FullCPU *cpu; + + bool inSyscall; + + bool trapPending; + + ExecContext *xcProxy; + + ExecContext *getXCProxy() { return xcProxy; } + +#if !FULL_SYSTEM + + Fault dummyTranslation(MemReqPtr &req) + { +#if 0 + assert((req->vaddr >> 48 & 0xffff) == 0); +#endif + + // put the asid in the upper 16 bits of the paddr + req->paddr = req->vaddr & ~((Addr)0xffff << sizeof(Addr) * 8 - 16); + req->paddr = req->paddr | (Addr)req->asid << sizeof(Addr) * 8 - 16; + return NoFault; + } + Fault translateInstReq(MemReqPtr &req) + { + return dummyTranslation(req); + } + Fault translateDataReadReq(MemReqPtr &req) + { + return dummyTranslation(req); + } + Fault translateDataWriteReq(MemReqPtr &req) + { + return dummyTranslation(req); + } +#else + Fault translateInstReq(MemReqPtr &req) + { + return cpu->itb->translate(req); + } + + Fault translateDataReadReq(MemReqPtr &req) + { + return cpu->dtb->translate(req, false); + } + + Fault translateDataWriteReq(MemReqPtr &req) + { + return cpu->dtb->translate(req, true); + } +#endif + + MiscReg readMiscReg(int misc_reg) + { + return regs.miscRegs.readReg(misc_reg); + } + + MiscReg readMiscRegWithEffect(int misc_reg, Fault &fault) + { + return regs.miscRegs.readRegWithEffect(misc_reg, fault, xcProxy); + } + + Fault setMiscReg(int misc_reg, const MiscReg &val) + { + return regs.miscRegs.setReg(misc_reg, val); + } + + Fault setMiscRegWithEffect(int misc_reg, const MiscReg &val) + { + return regs.miscRegs.setRegWithEffect(misc_reg, val, xcProxy); + } + + uint64_t readPC() + { return PC; } + + void setPC(uint64_t val) + { PC = val; } + + uint64_t readNextPC() + { return nextPC; } + + void setNextPC(uint64_t val) + { nextPC = val; } + + bool misspeculating() { return false; } + + void setInst(TheISA::MachInst _inst) { inst = _inst; } + + Counter readFuncExeInst() { return funcExeInst; } + + void setFuncExeInst(Counter new_val) { funcExeInst = new_val; } +}; + +#endif // __CPU_OZONE_THREAD_STATE_HH__ diff --git 
a/python/m5/objects/OzoneCPU.py b/python/m5/objects/OzoneCPU.py new file mode 100644 index 000000000..8186a44bb --- /dev/null +++ b/python/m5/objects/OzoneCPU.py @@ -0,0 +1,86 @@ +from m5 import * +from BaseCPU import BaseCPU + +class DerivOzoneCPU(BaseCPU): + type = 'DerivOzoneCPU' + + numThreads = Param.Unsigned("number of HW thread contexts") + + if not build_env['FULL_SYSTEM']: + mem = Param.FunctionalMemory(NULL, "memory") + + width = Param.Unsigned("Width") + frontEndWidth = Param.Unsigned("Front end width") + backEndWidth = Param.Unsigned("Back end width") + backEndSquashLatency = Param.Unsigned("Back end squash latency") + backEndLatency = Param.Unsigned("Back end latency") + maxInstBufferSize = Param.Unsigned("Maximum instruction buffer size") + decodeToFetchDelay = Param.Unsigned("Decode to fetch delay") + renameToFetchDelay = Param.Unsigned("Rename to fetch delay") + iewToFetchDelay = Param.Unsigned("Issue/Execute/Writeback to fetch " + "delay") + commitToFetchDelay = Param.Unsigned("Commit to fetch delay") + fetchWidth = Param.Unsigned("Fetch width") + + renameToDecodeDelay = Param.Unsigned("Rename to decode delay") + iewToDecodeDelay = Param.Unsigned("Issue/Execute/Writeback to decode " + "delay") + commitToDecodeDelay = Param.Unsigned("Commit to decode delay") + fetchToDecodeDelay = Param.Unsigned("Fetch to decode delay") + decodeWidth = Param.Unsigned("Decode width") + + iewToRenameDelay = Param.Unsigned("Issue/Execute/Writeback to rename " + "delay") + commitToRenameDelay = Param.Unsigned("Commit to rename delay") + decodeToRenameDelay = Param.Unsigned("Decode to rename delay") + renameWidth = Param.Unsigned("Rename width") + + commitToIEWDelay = Param.Unsigned("Commit to " + "Issue/Execute/Writeback delay") + renameToIEWDelay = Param.Unsigned("Rename to " + "Issue/Execute/Writeback delay") + issueToExecuteDelay = Param.Unsigned("Issue to execute delay (internal " + "to the IEW stage)") + issueWidth = Param.Unsigned("Issue width") + executeWidth = 
Param.Unsigned("Execute width") + executeIntWidth = Param.Unsigned("Integer execute width") + executeFloatWidth = Param.Unsigned("Floating point execute width") + executeBranchWidth = Param.Unsigned("Branch execute width") + executeMemoryWidth = Param.Unsigned("Memory execute width") + + iewToCommitDelay = Param.Unsigned("Issue/Execute/Writeback to commit " + "delay") + renameToROBDelay = Param.Unsigned("Rename to reorder buffer delay") + commitWidth = Param.Unsigned("Commit width") + squashWidth = Param.Unsigned("Squash width") + + localPredictorSize = Param.Unsigned("Size of local predictor") + localCtrBits = Param.Unsigned("Bits per counter") + localHistoryTableSize = Param.Unsigned("Size of local history table") + localHistoryBits = Param.Unsigned("Bits for the local history") + globalPredictorSize = Param.Unsigned("Size of global predictor") + globalCtrBits = Param.Unsigned("Bits per counter") + globalHistoryBits = Param.Unsigned("Bits of history") + choicePredictorSize = Param.Unsigned("Size of choice predictor") + choiceCtrBits = Param.Unsigned("Bits of choice counters") + + BTBEntries = Param.Unsigned("Number of BTB entries") + BTBTagSize = Param.Unsigned("Size of the BTB tags, in bits") + + RASSize = Param.Unsigned("RAS size") + + LQEntries = Param.Unsigned("Number of load queue entries") + SQEntries = Param.Unsigned("Number of store queue entries") + LFSTSize = Param.Unsigned("Last fetched store table size") + SSITSize = Param.Unsigned("Store set ID table size") + + numPhysIntRegs = Param.Unsigned("Number of physical integer registers") + numPhysFloatRegs = Param.Unsigned("Number of physical floating point " + "registers") + numIQEntries = Param.Unsigned("Number of instruction queue entries") + numROBEntries = Param.Unsigned("Number of reorder buffer entries") + + instShiftAmt = Param.Unsigned("Number of bits to shift instructions by") + + function_trace = Param.Bool(False, "Enable function trace") + function_trace_start = Param.Tick(0, "Cycle to start 
function trace") diff --git a/python/m5/objects/SimpleOzoneCPU.py b/python/m5/objects/SimpleOzoneCPU.py new file mode 100644 index 000000000..0d6403383 --- /dev/null +++ b/python/m5/objects/SimpleOzoneCPU.py @@ -0,0 +1,86 @@ +from m5 import * +from BaseCPU import BaseCPU + +class SimpleOzoneCPU(BaseCPU): + type = 'SimpleOzoneCPU' + + numThreads = Param.Unsigned("number of HW thread contexts") + + if not build_env['FULL_SYSTEM']: + mem = Param.FunctionalMemory(NULL, "memory") + + width = Param.Unsigned("Width") + frontEndWidth = Param.Unsigned("Front end width") + backEndWidth = Param.Unsigned("Back end width") + backEndSquashLatency = Param.Unsigned("Back end squash latency") + backEndLatency = Param.Unsigned("Back end latency") + maxInstBufferSize = Param.Unsigned("Maximum instruction buffer size") + decodeToFetchDelay = Param.Unsigned("Decode to fetch delay") + renameToFetchDelay = Param.Unsigned("Rename to fetch delay") + iewToFetchDelay = Param.Unsigned("Issue/Execute/Writeback to fetch " + "delay") + commitToFetchDelay = Param.Unsigned("Commit to fetch delay") + fetchWidth = Param.Unsigned("Fetch width") + + renameToDecodeDelay = Param.Unsigned("Rename to decode delay") + iewToDecodeDelay = Param.Unsigned("Issue/Execute/Writeback to decode " + "delay") + commitToDecodeDelay = Param.Unsigned("Commit to decode delay") + fetchToDecodeDelay = Param.Unsigned("Fetch to decode delay") + decodeWidth = Param.Unsigned("Decode width") + + iewToRenameDelay = Param.Unsigned("Issue/Execute/Writeback to rename " + "delay") + commitToRenameDelay = Param.Unsigned("Commit to rename delay") + decodeToRenameDelay = Param.Unsigned("Decode to rename delay") + renameWidth = Param.Unsigned("Rename width") + + commitToIEWDelay = Param.Unsigned("Commit to " + "Issue/Execute/Writeback delay") + renameToIEWDelay = Param.Unsigned("Rename to " + "Issue/Execute/Writeback delay") + issueToExecuteDelay = Param.Unsigned("Issue to execute delay (internal " + "to the IEW stage)") + issueWidth = 
Param.Unsigned("Issue width") + executeWidth = Param.Unsigned("Execute width") + executeIntWidth = Param.Unsigned("Integer execute width") + executeFloatWidth = Param.Unsigned("Floating point execute width") + executeBranchWidth = Param.Unsigned("Branch execute width") + executeMemoryWidth = Param.Unsigned("Memory execute width") + + iewToCommitDelay = Param.Unsigned("Issue/Execute/Writeback to commit " + "delay") + renameToROBDelay = Param.Unsigned("Rename to reorder buffer delay") + commitWidth = Param.Unsigned("Commit width") + squashWidth = Param.Unsigned("Squash width") + + localPredictorSize = Param.Unsigned("Size of local predictor") + localCtrBits = Param.Unsigned("Bits per counter") + localHistoryTableSize = Param.Unsigned("Size of local history table") + localHistoryBits = Param.Unsigned("Bits for the local history") + globalPredictorSize = Param.Unsigned("Size of global predictor") + globalCtrBits = Param.Unsigned("Bits per counter") + globalHistoryBits = Param.Unsigned("Bits of history") + choicePredictorSize = Param.Unsigned("Size of choice predictor") + choiceCtrBits = Param.Unsigned("Bits of choice counters") + + BTBEntries = Param.Unsigned("Number of BTB entries") + BTBTagSize = Param.Unsigned("Size of the BTB tags, in bits") + + RASSize = Param.Unsigned("RAS size") + + LQEntries = Param.Unsigned("Number of load queue entries") + SQEntries = Param.Unsigned("Number of store queue entries") + LFSTSize = Param.Unsigned("Last fetched store table size") + SSITSize = Param.Unsigned("Store set ID table size") + + numPhysIntRegs = Param.Unsigned("Number of physical integer registers") + numPhysFloatRegs = Param.Unsigned("Number of physical floating point " + "registers") + numIQEntries = Param.Unsigned("Number of instruction queue entries") + numROBEntries = Param.Unsigned("Number of reorder buffer entries") + + instShiftAmt = Param.Unsigned("Number of bits to shift instructions by") + + function_trace = Param.Bool(False, "Enable function trace") + 
function_trace_start = Param.Tick(0, "Cycle to start function trace") From f0baf0ec999e87b338e7107f2b2fba4e39cd3941 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Sat, 22 Apr 2006 18:47:07 -0400 Subject: [PATCH 06/50] Update the python file for the CPU. --HG-- extra : convert_revision : be899403d893f5ab6c11ae5a4334c0e36bd6ff61 --- python/m5/objects/AlphaFullCPU.py | 33 ++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/python/m5/objects/AlphaFullCPU.py b/python/m5/objects/AlphaFullCPU.py index 48989d057..284398b0e 100644 --- a/python/m5/objects/AlphaFullCPU.py +++ b/python/m5/objects/AlphaFullCPU.py @@ -9,6 +9,8 @@ class DerivAlphaFullCPU(BaseCPU): if not build_env['FULL_SYSTEM']: mem = Param.FunctionalMemory(NULL, "memory") + cachePorts = Param.Unsigned("Cache Ports") + decodeToFetchDelay = Param.Unsigned("Decode to fetch delay") renameToFetchDelay = Param.Unsigned("Rename to fetch delay") iewToFetchDelay = Param.Unsigned("Issue/Execute/Writeback to fetch " @@ -41,6 +43,7 @@ class DerivAlphaFullCPU(BaseCPU): executeFloatWidth = Param.Unsigned("Floating point execute width") executeBranchWidth = Param.Unsigned("Branch execute width") executeMemoryWidth = Param.Unsigned("Memory execute width") + fuPool = Param.FUPool(NULL, "Functional Unit pool") iewToCommitDelay = Param.Unsigned("Issue/Execute/Writeback to commit " "delay") @@ -48,15 +51,15 @@ class DerivAlphaFullCPU(BaseCPU): commitWidth = Param.Unsigned("Commit width") squashWidth = Param.Unsigned("Squash width") - local_predictor_size = Param.Unsigned("Size of local predictor") - local_ctr_bits = Param.Unsigned("Bits per counter") - local_history_table_size = Param.Unsigned("Size of local history table") - local_history_bits = Param.Unsigned("Bits for the local history") - global_predictor_size = Param.Unsigned("Size of global predictor") - global_ctr_bits = Param.Unsigned("Bits per counter") - global_history_bits = Param.Unsigned("Bits of history") - 
choice_predictor_size = Param.Unsigned("Size of choice predictor") - choice_ctr_bits = Param.Unsigned("Bits of choice counters") + localPredictorSize = Param.Unsigned("Size of local predictor") + localCtrBits = Param.Unsigned("Bits per counter") + localHistoryTableSize = Param.Unsigned("Size of local history table") + localHistoryBits = Param.Unsigned("Bits for the local history") + globalPredictorSize = Param.Unsigned("Size of global predictor") + globalCtrBits = Param.Unsigned("Bits per counter") + globalHistoryBits = Param.Unsigned("Bits of history") + choicePredictorSize = Param.Unsigned("Size of choice predictor") + choiceCtrBits = Param.Unsigned("Bits of choice counters") BTBEntries = Param.Unsigned("Number of BTB entries") BTBTagSize = Param.Unsigned("Size of the BTB tags, in bits") @@ -68,6 +71,8 @@ class DerivAlphaFullCPU(BaseCPU): LFSTSize = Param.Unsigned("Last fetched store table size") SSITSize = Param.Unsigned("Store set ID table size") + numRobs = Param.Unsigned("Number of Reorder Buffers"); + numPhysIntRegs = Param.Unsigned("Number of physical integer registers") numPhysFloatRegs = Param.Unsigned("Number of physical floating point " "registers") @@ -78,3 +83,13 @@ class DerivAlphaFullCPU(BaseCPU): function_trace = Param.Bool(False, "Enable function trace") function_trace_start = Param.Tick(0, "Cycle to start function trace") + + smtNumFetchingThreads = Param.Unsigned("SMT Number of Fetching Threads") + smtFetchPolicy = Param.String("SMT Fetch policy") + smtLSQPolicy = Param.String("SMT LSQ Sharing Policy") + smtLSQThreshold = Param.String("SMT LSQ Threshold Sharing Parameter") + smtIQPolicy = Param.String("SMT IQ Sharing Policy") + smtIQThreshold = Param.String("SMT IQ Threshold Sharing Parameter") + smtROBPolicy = Param.String("SMT ROB Sharing Policy") + smtROBThreshold = Param.String("SMT ROB Threshold Sharing Parameter") + smtCommitPolicy = Param.String("SMT Commit Policy") From bfc507e44ecc08c4ded037cf589e968f2ba42705 Mon Sep 17 00:00:00 2001 
From: Kevin Lim Date: Sat, 22 Apr 2006 18:49:52 -0400 Subject: [PATCH 07/50] Remove unnecessary functions. cpu/exec_context.hh: Remove functions that shouldn't be accessible to anything outside of the CPU. --HG-- extra : convert_revision : 9793c3ceb6d5404484bafc7a75d75ed71815d9eb --- cpu/exec_context.hh | 47 ++------------------------------------------- 1 file changed, 2 insertions(+), 45 deletions(-) diff --git a/cpu/exec_context.hh b/cpu/exec_context.hh index 2b6c41bd7..039b04527 100644 --- a/cpu/exec_context.hh +++ b/cpu/exec_context.hh @@ -143,17 +143,6 @@ class ExecContext virtual int getThreadNum() = 0; - virtual bool validInstAddr(Addr addr) = 0; - virtual bool validDataAddr(Addr addr) = 0; - virtual int getInstAsid() = 0; - virtual int getDataAsid() = 0; - - virtual Fault translateInstReq(MemReqPtr &req) = 0; - - virtual Fault translateDataReadReq(MemReqPtr &req) = 0; - - virtual Fault translateDataWriteReq(MemReqPtr &req) = 0; - // Also somewhat obnoxious. Really only used for the TLB fault. // However, may be quite useful in SPARC. virtual TheISA::MachInst getInst() = 0; @@ -204,11 +193,7 @@ class ExecContext virtual void setStCondFailures(unsigned sc_failures) = 0; #if FULL_SYSTEM - virtual int readIntrFlag() = 0; - virtual void setIntrFlag(int val) = 0; - virtual Fault hwrei() = 0; virtual bool inPalMode() = 0; - virtual bool simPalCheck(int palFunc) = 0; #endif // Only really makes sense for old CPU model. Still could be useful though. @@ -222,12 +207,10 @@ class ExecContext virtual void setSyscallReturn(SyscallReturn return_value) = 0; - virtual void syscall() = 0; +// virtual void syscall() = 0; // Same with st cond failures. 
virtual Counter readFuncExeInst() = 0; - - virtual void setFuncExeInst(Counter new_val) = 0; #endif }; @@ -305,20 +288,6 @@ class ProxyExecContext : public ExecContext int getThreadNum() { return actualXC->getThreadNum(); } - bool validInstAddr(Addr addr) { return actualXC->validInstAddr(addr); } - bool validDataAddr(Addr addr) { return actualXC->validDataAddr(addr); } - int getInstAsid() { return actualXC->getInstAsid(); } - int getDataAsid() { return actualXC->getDataAsid(); } - - Fault translateInstReq(MemReqPtr &req) - { return actualXC->translateInstReq(req); } - - Fault translateDataReadReq(MemReqPtr &req) - { return actualXC->translateDataReadReq(req); } - - Fault translateDataWriteReq(MemReqPtr &req) - { return actualXC->translateDataWriteReq(req); } - // @todo: Do I need this? MachInst getInst() { return actualXC->getInst(); } @@ -379,17 +348,8 @@ class ProxyExecContext : public ExecContext void setStCondFailures(unsigned sc_failures) { actualXC->setStCondFailures(sc_failures); } - #if FULL_SYSTEM - int readIntrFlag() { return actualXC->readIntrFlag(); } - - void setIntrFlag(int val) { actualXC->setIntrFlag(val); } - - Fault hwrei() { return actualXC->hwrei(); } - bool inPalMode() { return actualXC->inPalMode(); } - - bool simPalCheck(int palFunc) { return actualXC->simPalCheck(palFunc); } #endif // @todo: Fix this! @@ -405,12 +365,9 @@ class ProxyExecContext : public ExecContext void setSyscallReturn(SyscallReturn return_value) { actualXC->setSyscallReturn(return_value); } - void syscall() { actualXC->syscall(); } +// void syscall() { actualXC->syscall(); } Counter readFuncExeInst() { return actualXC->readFuncExeInst(); } - - void setFuncExeInst(Counter new_val) - { return actualXC->setFuncExeInst(new_val); } #endif }; From 6b4396111ba26fd16c7cf0047c4cb3e13036c298 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Sat, 22 Apr 2006 19:10:39 -0400 Subject: [PATCH 08/50] Updates for OzoneCPU. 
cpu/static_inst.hh: Updates for new CPU, also include a classification of quiesce instructions. --HG-- extra : convert_revision : a34cd56da88fe57d7de24674fbb375bbf13f887f --- cpu/ozone/back_end.cc | 2 +- cpu/ozone/back_end.hh | 7 + cpu/ozone/back_end_impl.hh | 85 +- cpu/ozone/cpu.hh | 20 +- cpu/ozone/cpu_builder.cc | 12 +- cpu/ozone/cpu_impl.hh | 54 +- cpu/ozone/front_end.hh | 4 + cpu/ozone/front_end_impl.hh | 13 +- cpu/ozone/inorder_back_end.hh | 1 + cpu/ozone/lsq_unit.hh | 9 +- cpu/ozone/lsq_unit_impl.hh | 4 +- cpu/ozone/lw_back_end.cc | 5 + cpu/ozone/lw_back_end.hh | 503 +++++++++++ cpu/ozone/lw_back_end_impl.hh | 1486 +++++++++++++++++++++++++++++++++ cpu/ozone/lw_lsq.cc | 34 + cpu/ozone/lw_lsq.hh | 649 ++++++++++++++ cpu/ozone/lw_lsq_impl.hh | 766 +++++++++++++++++ cpu/ozone/ozone_impl.hh | 6 +- cpu/static_inst.hh | 7 + 19 files changed, 3619 insertions(+), 48 deletions(-) create mode 100644 cpu/ozone/lw_back_end.cc create mode 100644 cpu/ozone/lw_back_end.hh create mode 100644 cpu/ozone/lw_back_end_impl.hh create mode 100644 cpu/ozone/lw_lsq.cc create mode 100644 cpu/ozone/lw_lsq.hh create mode 100644 cpu/ozone/lw_lsq_impl.hh diff --git a/cpu/ozone/back_end.cc b/cpu/ozone/back_end.cc index dbab5435e..cb014e4cc 100644 --- a/cpu/ozone/back_end.cc +++ b/cpu/ozone/back_end.cc @@ -2,4 +2,4 @@ #include "cpu/ozone/back_end_impl.hh" #include "cpu/ozone/ozone_impl.hh" -template class BackEnd; +//template class BackEnd; diff --git a/cpu/ozone/back_end.hh b/cpu/ozone/back_end.hh index 0713a0143..14b011ab8 100644 --- a/cpu/ozone/back_end.hh +++ b/cpu/ozone/back_end.hh @@ -125,6 +125,7 @@ class BackEnd InstList nonSpec; InstList replayList; ReadyInstQueue readyQueue; + public: int size; int numInsts; int width; @@ -321,6 +322,12 @@ class BackEnd int numROBEntries; int numInsts; + bool squashPending; + InstSeqNum squashSeqNum; + Addr squashNextPC; + + Fault faultFromFetch; + private: typedef typename std::list::iterator InstListIt; diff --git a/cpu/ozone/back_end_impl.hh 
b/cpu/ozone/back_end_impl.hh index 807afaf2e..0b0f04f59 100644 --- a/cpu/ozone/back_end_impl.hh +++ b/cpu/ozone/back_end_impl.hh @@ -100,6 +100,7 @@ BackEnd::InstQueue::insert(DynInstPtr &inst) numInsts++; inst_count[0]++; if (!inst->isNonSpeculative()) { + DPRINTF(BE, "Instruction [sn:%lli] added to IQ\n", inst->seqNum); if (inst->readyToIssue()) { toBeScheduled.push_front(inst); inst->iqIt = toBeScheduled.begin(); @@ -110,6 +111,7 @@ BackEnd::InstQueue::insert(DynInstPtr &inst) inst->iqItValid = true; } } else { + DPRINTF(BE, "Nonspeculative instruction [sn:%lli] added to IQ\n", inst->seqNum); nonSpec.push_front(inst); inst->iqIt = nonSpec.begin(); inst->iqItValid = true; @@ -159,6 +161,8 @@ BackEnd::InstQueue::scheduleNonSpec(const InstSeqNum &sn) */ DynInstPtr inst = nonSpec.back(); + DPRINTF(BE, "Nonspeculative instruction [sn:%lli] scheduled\n", inst->seqNum); + assert(inst->seqNum == sn); assert(find(NonSpec, inst->iqIt)); @@ -193,6 +197,7 @@ BackEnd::InstQueue::squash(const InstSeqNum &sn) InstListIt iq_end_it = iq.end(); while (iq_it != iq_end_it && (*iq_it)->seqNum > sn) { + DPRINTF(BE, "Instruction [sn:%lli] removed from IQ\n", (*iq_it)->seqNum); (*iq_it)->iqItValid = false; iq.erase(iq_it++); --numInsts; @@ -202,6 +207,7 @@ BackEnd::InstQueue::squash(const InstSeqNum &sn) iq_end_it = nonSpec.end(); while (iq_it != iq_end_it && (*iq_it)->seqNum > sn) { + DPRINTF(BE, "Instruction [sn:%lli] removed from IQ\n", (*iq_it)->seqNum); (*iq_it)->iqItValid = false; nonSpec.erase(iq_it++); --numInsts; @@ -212,6 +218,7 @@ BackEnd::InstQueue::squash(const InstSeqNum &sn) while (iq_it != iq_end_it) { if ((*iq_it)->seqNum > sn) { + DPRINTF(BE, "Instruction [sn:%lli] removed from IQ\n", (*iq_it)->seqNum); (*iq_it)->iqItValid = false; replayList.erase(iq_it++); --numInsts; @@ -243,20 +250,24 @@ BackEnd::InstQueue::wakeDependents(DynInstPtr &inst) std::vector &dependents = inst->getDependents(); int num_outputs = dependents.size(); + DPRINTF(BE, "Waking instruction 
[sn:%lli] dependents in IQ\n", inst->seqNum); + for (int i = 0; i < num_outputs; i++) { - DynInstPtr inst = dependents[i]; - inst->markSrcRegReady(); - if (inst->readyToIssue() && inst->iqItValid) { - if (inst->isNonSpeculative()) { - assert(find(NonSpec, inst->iqIt)); - nonSpec.erase(inst->iqIt); + DynInstPtr dep_inst = dependents[i]; + dep_inst->markSrcRegReady(); + DPRINTF(BE, "Marking source reg ready [sn:%lli] in IQ\n", dep_inst->seqNum); + + if (dep_inst->readyToIssue() && dep_inst->iqItValid) { + if (dep_inst->isNonSpeculative()) { + assert(find(NonSpec, dep_inst->iqIt)); + nonSpec.erase(dep_inst->iqIt); } else { - assert(find(IQ, inst->iqIt)); - iq.erase(inst->iqIt); + assert(find(IQ, dep_inst->iqIt)); + iq.erase(dep_inst->iqIt); } - toBeScheduled.push_front(inst); - inst->iqIt = toBeScheduled.begin(); + toBeScheduled.push_front(dep_inst); + dep_inst->iqIt = toBeScheduled.begin(); } } return num_outputs; @@ -266,6 +277,7 @@ template void BackEnd::InstQueue::rescheduleMemInst(DynInstPtr &inst) { + DPRINTF(BE, "Rescheduling memory instruction [sn:%lli]\n", inst->seqNum); assert(!inst->iqItValid); replayList.push_front(inst); inst->iqIt = replayList.begin(); @@ -277,11 +289,14 @@ template void BackEnd::InstQueue::replayMemInst(DynInstPtr &inst) { + DPRINTF(BE, "Replaying memory instruction [sn:%lli]\n", inst->seqNum); assert(find(ReplayList, inst->iqIt)); InstListIt iq_it = --replayList.end(); InstListIt iq_end_it = replayList.end(); while (iq_it != iq_end_it) { DynInstPtr rescheduled_inst = (*iq_it); + + DPRINTF(BE, "Memory instruction [sn:%lli] also replayed\n", inst->seqNum); replayList.erase(iq_it--); toBeScheduled.push_front(rescheduled_inst); rescheduled_inst->iqIt = toBeScheduled.begin(); @@ -952,6 +967,9 @@ BackEnd::tick() commitInsts(); + DPRINTF(BE, "IQ entries in use: %i, ROB entries in use: %i, LSQ loads: %i, LSQ stores: %i\n", + IQ.numInsts, numInsts, LSQ.numLoads(), LSQ.numStores()); + assert(numInsts == instList.size()); } @@ -1034,11 +1052,11 
@@ BackEnd::dispatchInsts() // Get instruction from front of time buffer DynInstPtr inst = dispatch.front(); dispatch.pop_front(); + --dispatchSize; if (inst->isSquashed()) continue; - --dispatchSize; ++numInsts; instList.push_back(inst); @@ -1118,6 +1136,7 @@ template void BackEnd::checkDispatchStatus() { + DPRINTF(BE, "Checking dispatch status\n"); assert(dispatchStatus == Blocked); if (!IQ.isFull() && !LSQ.isFull() && !isFull()) { DPRINTF(BE, "Dispatch no longer blocked\n"); @@ -1526,6 +1545,24 @@ BackEnd::commitInst(int inst_num) // Write the done sequence number here. toIEW->doneSeqNum = inst->seqNum; +#if FULL_SYSTEM + int count = 0; + Addr oldpc; + do { + if (count == 0) + assert(!thread->inSyscall && !thread->trapPending); + oldpc = thread->readPC(); + cpu->system->pcEventQueue.service( + thread->getXCProxy()); + count++; + } while (oldpc != thread->readPC()); + if (count > 1) { + DPRINTF(BE, "PC skip function event, stopping commit\n"); +// completed_last_inst = false; +// squashPending = true; + return false; + } +#endif return true; } @@ -1566,7 +1603,11 @@ BackEnd::squash(const InstSeqNum &sn) while (insts_it != dispatch_end && (*insts_it)->seqNum > sn) { - DPRINTF(BE, "Squashing instruction PC %#x, [sn:%lli].\n", + if ((*insts_it)->isSquashed()) { + --insts_it; + continue; + } + DPRINTF(BE, "Squashing instruction on dispatch list PC %#x, [sn:%lli].\n", (*insts_it)->readPC(), (*insts_it)->seqNum); @@ -1576,9 +1617,12 @@ BackEnd::squash(const InstSeqNum &sn) (*insts_it)->setCanCommit(); + // Be careful with IPRs and such here for (int i = 0; i < (*insts_it)->numDestRegs(); ++i) { - renameTable[(*insts_it)->destRegIdx(i)] = - (*insts_it)->getPrevDestInst(i); + DynInstPtr prev_dest = (*insts_it)->getPrevDestInst(i); + DPRINTF(BE, "Commit rename map setting register %i to [sn:%lli]\n", + (int)(*insts_it)->destRegIdx(i), prev_dest); + renameTable[(*insts_it)->destRegIdx(i)] = prev_dest; ++freed_regs; } @@ -1592,7 +1636,11 @@ BackEnd::squash(const InstSeqNum 
&sn) while (!instList.empty() && (*insts_it)->seqNum > sn) { - DPRINTF(BE, "Squashing instruction PC %#x, [sn:%lli].\n", + if ((*insts_it)->isSquashed()) { + --insts_it; + continue; + } + DPRINTF(BE, "Squashing instruction on inst list PC %#x, [sn:%lli].\n", (*insts_it)->readPC(), (*insts_it)->seqNum); @@ -1603,8 +1651,10 @@ BackEnd::squash(const InstSeqNum &sn) (*insts_it)->setCanCommit(); for (int i = 0; i < (*insts_it)->numDestRegs(); ++i) { - renameTable[(*insts_it)->destRegIdx(i)] = - (*insts_it)->getPrevDestInst(i); + DynInstPtr prev_dest = (*insts_it)->getPrevDestInst(i); + DPRINTF(BE, "Commit rename map setting register %i to [sn:%lli]\n", + (int)(*insts_it)->destRegIdx(i), prev_dest); + renameTable[(*insts_it)->destRegIdx(i)] = prev_dest; ++freed_regs; } @@ -1649,6 +1699,7 @@ template void BackEnd::fetchFault(Fault &fault) { + faultFromFetch = fault; } template diff --git a/cpu/ozone/cpu.hh b/cpu/ozone/cpu.hh index 200ced265..17e0f5c42 100644 --- a/cpu/ozone/cpu.hh +++ b/cpu/ozone/cpu.hh @@ -42,6 +42,7 @@ #include "cpu/pc_event.hh" #include "cpu/static_inst.hh" #include "mem/mem_interface.hh" +#include "mem/page_table.hh" #include "sim/eventq.hh" // forward declarations @@ -427,34 +428,22 @@ class OzoneCPU : public BaseCPU int getInstAsid() { return thread.asid; } int getDataAsid() { return thread.asid; } - Fault dummyTranslation(MemReqPtr &req) - { -#if 0 - assert((req->vaddr >> 48 & 0xffff) == 0); -#endif - - // put the asid in the upper 16 bits of the paddr - req->paddr = req->vaddr & ~((Addr)0xffff << sizeof(Addr) * 8 - 16); - req->paddr = req->paddr | (Addr)req->asid << sizeof(Addr) * 8 - 16; - return NoFault; - } - /** Translates instruction requestion in syscall emulation mode. */ Fault translateInstReq(MemReqPtr &req) { - return dummyTranslation(req); + return this->pTable->translate(req); } /** Translates data read request in syscall emulation mode. 
*/ Fault translateDataReadReq(MemReqPtr &req) { - return dummyTranslation(req); + return this->pTable->translate(req); } /** Translates data write request in syscall emulation mode. */ Fault translateDataWriteReq(MemReqPtr &req) { - return dummyTranslation(req); + return this->pTable->translate(req); } #endif /** CPU read function, forwards read to LSQ. */ @@ -500,6 +489,7 @@ class OzoneCPU : public BaseCPU bool inPalMode() { return AlphaISA::PcPAL(thread.PC); } bool inPalMode(Addr pc) { return AlphaISA::PcPAL(pc); } bool simPalCheck(int palFunc); + void processInterrupts(); #else void syscall(); void setSyscallReturn(SyscallReturn return_value, int tid); diff --git a/cpu/ozone/cpu_builder.cc b/cpu/ozone/cpu_builder.cc index 0146dd1bd..8ac6858b0 100644 --- a/cpu/ozone/cpu_builder.cc +++ b/cpu/ozone/cpu_builder.cc @@ -45,7 +45,7 @@ SimObjectParam itb; SimObjectParam dtb; #else SimObjectVectorParam workload; -//SimObjectParam page_table; +SimObjectParam page_table; #endif // FULL_SYSTEM SimObjectParam mem; @@ -159,7 +159,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivOzoneCPU) INIT_PARAM(dtb, "Data translation buffer"), #else INIT_PARAM(workload, "Processes to run"), -// INIT_PARAM(page_table, "Page table"), + INIT_PARAM(page_table, "Page table"), #endif // FULL_SYSTEM INIT_PARAM_DFLT(mem, "Memory", NULL), @@ -310,7 +310,7 @@ CREATE_SIM_OBJECT(DerivOzoneCPU) params->dtb = dtb; #else params->workload = workload; -// params->pTable = page_table; + params->pTable = page_table; #endif // FULL_SYSTEM params->mem = mem; @@ -440,7 +440,7 @@ SimObjectParam itb; SimObjectParam dtb; #else SimObjectVectorParam workload; -//SimObjectParam page_table; +SimObjectParam page_table; #endif // FULL_SYSTEM SimObjectParam mem; @@ -554,7 +554,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(SimpleOzoneCPU) INIT_PARAM(dtb, "Data translation buffer"), #else INIT_PARAM(workload, "Processes to run"), -// INIT_PARAM(page_table, "Page table"), + INIT_PARAM(page_table, "Page table"), #endif // FULL_SYSTEM 
INIT_PARAM_DFLT(mem, "Memory", NULL), @@ -705,7 +705,7 @@ CREATE_SIM_OBJECT(SimpleOzoneCPU) params->dtb = dtb; #else params->workload = workload; -// params->pTable = page_table; + params->pTable = page_table; #endif // FULL_SYSTEM params->mem = mem; diff --git a/cpu/ozone/cpu_impl.hh b/cpu/ozone/cpu_impl.hh index 36ec30b2c..c205ad319 100644 --- a/cpu/ozone/cpu_impl.hh +++ b/cpu/ozone/cpu_impl.hh @@ -765,7 +765,7 @@ void OzoneCPU::squashFromXC() { thread.inSyscall = true; - backEnd->squashFromXC(); + backEnd->generateXCEvent(); } #if !FULL_SYSTEM @@ -832,6 +832,58 @@ OzoneCPU::hwrei() return NoFault; } +template +void +OzoneCPU::processInterrupts() +{ + // Check for interrupts here. For now can copy the code that + // exists within isa_fullsys_traits.hh. Also assume that thread 0 + // is the one that handles the interrupts. + + // Check if there are any outstanding interrupts + //Handle the interrupts + int ipl = 0; + int summary = 0; + + checkInterrupts = false; + + if (thread.readMiscReg(IPR_ASTRR)) + panic("asynchronous traps not implemented\n"); + + if (thread.readMiscReg(IPR_SIRR)) { + for (int i = INTLEVEL_SOFTWARE_MIN; + i < INTLEVEL_SOFTWARE_MAX; i++) { + if (thread.readMiscReg(IPR_SIRR) & (ULL(1) << i)) { + // See table 4-19 of the 21164 hardware reference + ipl = (i - INTLEVEL_SOFTWARE_MIN) + 1; + summary |= (ULL(1) << i); + } + } + } + + uint64_t interrupts = intr_status(); + + if (interrupts) { + for (int i = INTLEVEL_EXTERNAL_MIN; + i < INTLEVEL_EXTERNAL_MAX; i++) { + if (interrupts & (ULL(1) << i)) { + // See table 4-19 of the 21164 hardware reference + ipl = i; + summary |= (ULL(1) << i); + } + } + } + + if (ipl && ipl > thread.readMiscReg(IPR_IPLR)) { + thread.setMiscReg(IPR_ISR, summary); + thread.setMiscReg(IPR_INTID, ipl); + Fault fault = new InterruptFault; + fault->invoke(thread.getXCProxy()); + DPRINTF(Flow, "Interrupt! 
IPLR=%d ipl=%d summary=%x\n", + thread.readMiscReg(IPR_IPLR), ipl, summary); + } +} + template bool OzoneCPU::simPalCheck(int palFunc) diff --git a/cpu/ozone/front_end.hh b/cpu/ozone/front_end.hh index 5e257b506..251f4200c 100644 --- a/cpu/ozone/front_end.hh +++ b/cpu/ozone/front_end.hh @@ -76,6 +76,10 @@ class FrontEnd bool processBarriers(DynInstPtr &inst); void handleFault(Fault &fault); + public: + Fault getFault() { return fetchFault; } + private: + Fault fetchFault; // Align an address (typically a PC) to the start of an I-cache block. // We fold in the PISA 64- to 32-bit conversion here as well. diff --git a/cpu/ozone/front_end_impl.hh b/cpu/ozone/front_end_impl.hh index 0136d0ef0..af452fe95 100644 --- a/cpu/ozone/front_end_impl.hh +++ b/cpu/ozone/front_end_impl.hh @@ -48,6 +48,7 @@ FrontEnd::FrontEnd(Params *params) #if !FULL_SYSTEM pTable = params->pTable; #endif + fetchFault = NoFault; } template @@ -273,6 +274,7 @@ FrontEnd::tick() Fault fault = fetchCacheLine(); if (fault != NoFault) { handleFault(fault); + fetchFault = fault; return; } fetchCacheLineNextCycle = false; @@ -349,7 +351,7 @@ FrontEnd::fetchCacheLine() // Read a cache line, based on the current PC. #if FULL_SYSTEM // Flag to say whether or not address is physical addr. - unsigned flags = cpu->inPalMode() ? PHYSICAL : 0; + unsigned flags = cpu->inPalMode(PC) ? 
PHYSICAL : 0; #else unsigned flags = 0; #endif // FULL_SYSTEM @@ -503,6 +505,9 @@ FrontEnd::squash(const InstSeqNum &squash_num, const Addr &next_PC, DPRINTF(FE, "Squashing from [sn:%lli], setting PC to %#x\n", squash_num, next_PC); + if (fetchFault != NoFault) + fetchFault = NoFault; + while (!instBuffer.empty() && instBuffer.back()->seqNum > squash_num) { DynInstPtr inst = instBuffer.back(); @@ -604,9 +609,13 @@ FrontEnd::addFreeRegs(int num_freed) status = Running; } + DPRINTF(FE, "Adding %i freed registers\n", num_freed); + freeRegs+= num_freed; - assert(freeRegs <= numPhysRegs); +// assert(freeRegs <= numPhysRegs); + if (freeRegs > numPhysRegs) + freeRegs = numPhysRegs; } template diff --git a/cpu/ozone/inorder_back_end.hh b/cpu/ozone/inorder_back_end.hh index e621f6c01..6519b79e5 100644 --- a/cpu/ozone/inorder_back_end.hh +++ b/cpu/ozone/inorder_back_end.hh @@ -54,6 +54,7 @@ class InorderBackEnd void squash(const InstSeqNum &squash_num, const Addr &next_PC); void squashFromXC(); + void generateXCEvent() { } bool robEmpty() { return instList.empty(); } diff --git a/cpu/ozone/lsq_unit.hh b/cpu/ozone/lsq_unit.hh index 3c3e3988c..4b600af67 100644 --- a/cpu/ozone/lsq_unit.hh +++ b/cpu/ozone/lsq_unit.hh @@ -567,8 +567,11 @@ OzoneLSQ::read(MemReqPtr &req, T &data, int load_idx) req->data = new uint8_t[64]; assert(!req->completionEvent); - req->completionEvent = - new typename BackEnd::LdWritebackEvent(loadQueue[load_idx], be); + typedef typename BackEnd::LdWritebackEvent LdWritebackEvent; + + LdWritebackEvent *wb = new LdWritebackEvent(loadQueue[load_idx], be); + + req->completionEvent = wb; // Do Cache Access MemAccessResult result = dcacheInterface->access(req); @@ -586,6 +589,8 @@ OzoneLSQ::read(MemReqPtr &req, T &data, int load_idx) _status = DcacheMissStall; + wb->setDcacheMiss(); + } else { // DPRINTF(Activity, "Activity: ld accessing mem hit [sn:%lli]\n", // inst->seqNum); diff --git a/cpu/ozone/lsq_unit_impl.hh b/cpu/ozone/lsq_unit_impl.hh index 
6c7977250..726348d76 100644 --- a/cpu/ozone/lsq_unit_impl.hh +++ b/cpu/ozone/lsq_unit_impl.hh @@ -698,7 +698,7 @@ OzoneLSQ::squash(const InstSeqNum &squashed_num) stallingLoadIdx = 0; } - loadQueue[load_idx]->squashed = true; +// loadQueue[load_idx]->squashed = true; loadQueue[load_idx] = NULL; --loads; @@ -728,7 +728,7 @@ OzoneLSQ::squash(const InstSeqNum &squashed_num) stallingStoreIsn = 0; } - storeQueue[store_idx].inst->squashed = true; +// storeQueue[store_idx].inst->squashed = true; storeQueue[store_idx].inst = NULL; storeQueue[store_idx].canWB = 0; diff --git a/cpu/ozone/lw_back_end.cc b/cpu/ozone/lw_back_end.cc new file mode 100644 index 000000000..8e9a56ef5 --- /dev/null +++ b/cpu/ozone/lw_back_end.cc @@ -0,0 +1,5 @@ + +#include "cpu/ozone/lw_back_end_impl.hh" +#include "cpu/ozone/ozone_impl.hh" + +template class LWBackEnd; diff --git a/cpu/ozone/lw_back_end.hh b/cpu/ozone/lw_back_end.hh new file mode 100644 index 000000000..b89957aad --- /dev/null +++ b/cpu/ozone/lw_back_end.hh @@ -0,0 +1,503 @@ + +#ifndef __CPU_OZONE_LW_BACK_END_HH__ +#define __CPU_OZONE_LW_BACK_END_HH__ + +#include +#include +#include +#include + +#include "arch/faults.hh" +#include "base/timebuf.hh" +#include "cpu/inst_seq.hh" +#include "cpu/ozone/rename_table.hh" +#include "cpu/ozone/thread_state.hh" +#include "mem/functional/functional.hh" +#include "mem/mem_interface.hh" +#include "mem/mem_req.hh" +#include "sim/eventq.hh" + +class ExecContext; + +template +class OzoneThreadState; + +template +class LWBackEnd +{ + public: + typedef OzoneThreadState Thread; + + typedef typename Impl::Params Params; + typedef typename Impl::DynInst DynInst; + typedef typename Impl::DynInstPtr DynInstPtr; + typedef typename Impl::FullCPU FullCPU; + typedef typename Impl::FrontEnd FrontEnd; + typedef typename Impl::FullCPU::CommStruct CommStruct; + + struct SizeStruct { + int size; + }; + + typedef SizeStruct DispatchToIssue; + typedef SizeStruct IssueToExec; + typedef SizeStruct ExecToCommit; + typedef 
SizeStruct Writeback; + + TimeBuffer d2i; + typename TimeBuffer::wire instsToDispatch; + TimeBuffer i2e; + typename TimeBuffer::wire instsToExecute; + TimeBuffer e2c; + TimeBuffer numInstsToWB; + + TimeBuffer *comm; + typename TimeBuffer::wire toIEW; + typename TimeBuffer::wire fromCommit; + + class TrapEvent : public Event { + private: + LWBackEnd *be; + + public: + TrapEvent(LWBackEnd *_be); + + void process(); + const char *description(); + }; + + /** LdWriteback event for a load completion. */ + class LdWritebackEvent : public Event { + private: + /** Instruction that is writing back data to the register file. */ + DynInstPtr inst; + /** Pointer to IEW stage. */ + LWBackEnd *be; + + bool dcacheMiss; + + public: + /** Constructs a load writeback event. */ + LdWritebackEvent(DynInstPtr &_inst, LWBackEnd *be); + + /** Processes writeback event. */ + virtual void process(); + /** Returns the description of the writeback event. */ + virtual const char *description(); + + void setDcacheMiss() { dcacheMiss = true; be->addDcacheMiss(inst); } + }; + + LWBackEnd(Params *params); + + std::string name() const; + + void regStats(); + + void setCPU(FullCPU *cpu_ptr) + { cpu = cpu_ptr; } + + void setFrontEnd(FrontEnd *front_end_ptr) + { frontEnd = front_end_ptr; } + + void setXC(ExecContext *xc_ptr) + { xc = xc_ptr; } + + void setThreadState(Thread *thread_ptr) + { thread = thread_ptr; } + + void setCommBuffer(TimeBuffer *_comm); + + void tick(); + void squash(); + void generateXCEvent() { xcSquash = true; } + void squashFromXC(); + void squashFromTrap(); + void checkInterrupts(); + bool trapSquash; + bool xcSquash; + + template + Fault read(MemReqPtr &req, T &data, int load_idx); + + template + Fault write(MemReqPtr &req, T &data, int store_idx); + + Addr readCommitPC() { return commitPC; } + + Addr commitPC; + + bool robEmpty() { return instList.empty(); } + + bool isFull() { return numInsts >= numROBEntries; } + bool isBlocked() { return status == Blocked || dispatchStatus 
== Blocked; } + + void fetchFault(Fault &fault); + + int wakeDependents(DynInstPtr &inst); + + /** Tells memory dependence unit that a memory instruction needs to be + * rescheduled. It will re-execute once replayMemInst() is called. + */ + void rescheduleMemInst(DynInstPtr &inst); + + /** Re-executes all rescheduled memory instructions. */ + void replayMemInst(DynInstPtr &inst); + + /** Completes memory instruction. */ + void completeMemInst(DynInstPtr &inst) { } + + void addDcacheMiss(DynInstPtr &inst) + { + waitingMemOps.insert(inst->seqNum); + numWaitingMemOps++; + DPRINTF(BE, "Adding a Dcache miss mem op [sn:%lli], total %i\n", + inst->seqNum, numWaitingMemOps); + } + + void removeDcacheMiss(DynInstPtr &inst) + { + assert(waitingMemOps.find(inst->seqNum) != waitingMemOps.end()); + waitingMemOps.erase(inst->seqNum); + numWaitingMemOps--; + DPRINTF(BE, "Removing a Dcache miss mem op [sn:%lli], total %i\n", + inst->seqNum, numWaitingMemOps); + } + + void addWaitingMemOp(DynInstPtr &inst) + { + waitingMemOps.insert(inst->seqNum); + numWaitingMemOps++; + DPRINTF(BE, "Adding a waiting mem op [sn:%lli], total %i\n", + inst->seqNum, numWaitingMemOps); + } + + void removeWaitingMemOp(DynInstPtr &inst) + { + assert(waitingMemOps.find(inst->seqNum) != waitingMemOps.end()); + waitingMemOps.erase(inst->seqNum); + numWaitingMemOps--; + DPRINTF(BE, "Removing a waiting mem op [sn:%lli], total %i\n", + inst->seqNum, numWaitingMemOps); + } + + void instToCommit(DynInstPtr &inst); + + private: + void generateTrapEvent(Tick latency = 0); + void handleFault(Fault &fault, Tick latency = 0); + void updateStructures(); + void dispatchInsts(); + void dispatchStall(); + void checkDispatchStatus(); + void executeInsts(); + void commitInsts(); + void addToLSQ(DynInstPtr &inst); + void writebackInsts(); + bool commitInst(int inst_num); + void squash(const InstSeqNum &sn); + void squashDueToBranch(DynInstPtr &inst); + void squashDueToMemViolation(DynInstPtr &inst); + void 
squashDueToMemBlocked(DynInstPtr &inst); + void updateExeInstStats(DynInstPtr &inst); + void updateComInstStats(DynInstPtr &inst); + + public: + FullCPU *cpu; + + FrontEnd *frontEnd; + + ExecContext *xc; + + Thread *thread; + + enum Status { + Running, + Idle, + DcacheMissStall, + DcacheMissComplete, + Blocked, + TrapPending + }; + + Status status; + + Status dispatchStatus; + + Status commitStatus; + + Counter funcExeInst; + + private: +// typedef typename Impl::InstQueue InstQueue; + +// InstQueue IQ; + + typedef typename Impl::LdstQueue LdstQueue; + + LdstQueue LSQ; + public: + RenameTable commitRenameTable; + + RenameTable renameTable; + private: + class DCacheCompletionEvent : public Event + { + private: + LWBackEnd *be; + + public: + DCacheCompletionEvent(LWBackEnd *_be); + + virtual void process(); + virtual const char *description(); + }; + + friend class DCacheCompletionEvent; + + DCacheCompletionEvent cacheCompletionEvent; + + MemInterface *dcacheInterface; + + MemReqPtr memReq; + + // General back end width. Used if the more specific isn't given. + int width; + + // Dispatch width. + int dispatchWidth; + int numDispatchEntries; + int dispatchSize; + + int waitingInsts; + + int issueWidth; + + // Writeback width + int wbWidth; + + // Commit width + int commitWidth; + + /** Index into queue of instructions being written back. */ + unsigned wbNumInst; + + /** Cycle number within the queue of instructions being written + * back. Used in case there are too many instructions writing + * back at the current cycle and writesbacks need to be scheduled + * for the future. See comments in instToCommit(). 
+ */ + unsigned wbCycle; + + int numROBEntries; + int numInsts; + + std::set waitingMemOps; + typedef std::set::iterator MemIt; + int numWaitingMemOps; + unsigned maxOutstandingMemOps; + + bool squashPending; + InstSeqNum squashSeqNum; + Addr squashNextPC; + + Fault faultFromFetch; + bool fetchHasFault; + + private: + struct pqCompare { + bool operator() (const DynInstPtr &lhs, const DynInstPtr &rhs) const + { + return lhs->seqNum > rhs->seqNum; + } + }; + + typedef typename std::priority_queue, pqCompare> ReadyInstQueue; + ReadyInstQueue exeList; + + typedef typename std::list::iterator InstListIt; + + std::list instList; + std::list waitingList; + std::list replayList; + std::list writeback; + + int latency; + + int squashLatency; + + bool exactFullStall; + + bool fetchRedirect[Impl::MaxThreads]; + + // number of cycles stalled for D-cache misses +/* Stats::Scalar<> dcacheStallCycles; + Counter lastDcacheStall; +*/ + Stats::Vector<> rob_cap_events; + Stats::Vector<> rob_cap_inst_count; + Stats::Vector<> iq_cap_events; + Stats::Vector<> iq_cap_inst_count; + // total number of instructions executed + Stats::Vector<> exe_inst; + Stats::Vector<> exe_swp; + Stats::Vector<> exe_nop; + Stats::Vector<> exe_refs; + Stats::Vector<> exe_loads; + Stats::Vector<> exe_branches; + + Stats::Vector<> issued_ops; + + // total number of loads forwaded from LSQ stores + Stats::Vector<> lsq_forw_loads; + + // total number of loads ignored due to invalid addresses + Stats::Vector<> inv_addr_loads; + + // total number of software prefetches ignored due to invalid addresses + Stats::Vector<> inv_addr_swpfs; + // ready loads blocked due to memory disambiguation + Stats::Vector<> lsq_blocked_loads; + + Stats::Scalar<> lsqInversion; + + Stats::Vector<> n_issued_dist; + Stats::VectorDistribution<> issue_delay_dist; + + Stats::VectorDistribution<> queue_res_dist; +/* + Stats::Vector<> stat_fu_busy; + Stats::Vector2d<> stat_fuBusy; + Stats::Vector<> dist_unissued; + Stats::Vector2d<> 
stat_issued_inst_type; + + Stats::Formula misspec_cnt; + Stats::Formula misspec_ipc; + Stats::Formula issue_rate; + Stats::Formula issue_stores; + Stats::Formula issue_op_rate; + Stats::Formula fu_busy_rate; + Stats::Formula commit_stores; + Stats::Formula commit_ipc; + Stats::Formula commit_ipb; + Stats::Formula lsq_inv_rate; +*/ + Stats::Vector<> writeback_count; + Stats::Vector<> producer_inst; + Stats::Vector<> consumer_inst; + Stats::Vector<> wb_penalized; + + Stats::Formula wb_rate; + Stats::Formula wb_fanout; + Stats::Formula wb_penalized_rate; + + // total number of instructions committed + Stats::Vector<> stat_com_inst; + Stats::Vector<> stat_com_swp; + Stats::Vector<> stat_com_refs; + Stats::Vector<> stat_com_loads; + Stats::Vector<> stat_com_membars; + Stats::Vector<> stat_com_branches; + + Stats::Distribution<> n_committed_dist; + + Stats::Scalar<> commit_eligible_samples; + Stats::Vector<> commit_eligible; + + Stats::Scalar<> ROB_fcount; + Stats::Formula ROB_full_rate; + + Stats::Vector<> ROB_count; // cumulative ROB occupancy + Stats::Formula ROB_occ_rate; + Stats::VectorDistribution<> ROB_occ_dist; + public: + void dumpInsts(); +}; + +template +template +Fault +LWBackEnd::read(MemReqPtr &req, T &data, int load_idx) +{ +/* memReq->reset(addr, sizeof(T), flags); + + // translate to physical address + Fault fault = cpu->translateDataReadReq(memReq); + + // if we have a cache, do cache access too + if (fault == NoFault && dcacheInterface) { + memReq->cmd = Read; + memReq->completionEvent = NULL; + memReq->time = curTick; + memReq->flags &= ~INST_READ; + MemAccessResult result = dcacheInterface->access(memReq); + + // Ugly hack to get an event scheduled *only* if the access is + // a miss. We really should add first-class support for this + // at some point. + if (result != MA_HIT && dcacheInterface->doEvents()) { + // Fix this hack for keeping funcExeInst correct with loads that + // are executed twice. 
+ --funcExeInst; + + memReq->completionEvent = &cacheCompletionEvent; + lastDcacheStall = curTick; +// unscheduleTickEvent(); +// status = DcacheMissStall; + DPRINTF(OzoneCPU, "Dcache miss stall!\n"); + } else { + // do functional access + fault = thread->mem->read(memReq, data); + + } + } +*/ +/* + if (!dcacheInterface && (memReq->flags & UNCACHEABLE)) + recordEvent("Uncached Read"); +*/ + return LSQ.read(req, data, load_idx); +} + +template +template +Fault +LWBackEnd::write(MemReqPtr &req, T &data, int store_idx) +{ +/* + memReq->reset(addr, sizeof(T), flags); + + // translate to physical address + Fault fault = cpu->translateDataWriteReq(memReq); + + if (fault == NoFault && dcacheInterface) { + memReq->cmd = Write; + memcpy(memReq->data,(uint8_t *)&data,memReq->size); + memReq->completionEvent = NULL; + memReq->time = curTick; + memReq->flags &= ~INST_READ; + MemAccessResult result = dcacheInterface->access(memReq); + + // Ugly hack to get an event scheduled *only* if the access is + // a miss. We really should add first-class support for this + // at some point. 
+ if (result != MA_HIT && dcacheInterface->doEvents()) { + memReq->completionEvent = &cacheCompletionEvent; + lastDcacheStall = curTick; +// unscheduleTickEvent(); +// status = DcacheMissStall; + DPRINTF(OzoneCPU, "Dcache miss stall!\n"); + } + } + + if (res && (fault == NoFault)) + *res = memReq->result; + */ +/* + if (!dcacheInterface && (memReq->flags & UNCACHEABLE)) + recordEvent("Uncached Write"); +*/ + return LSQ.write(req, data, store_idx); +} + +#endif // __CPU_OZONE_LW_BACK_END_HH__ diff --git a/cpu/ozone/lw_back_end_impl.hh b/cpu/ozone/lw_back_end_impl.hh new file mode 100644 index 000000000..115821787 --- /dev/null +++ b/cpu/ozone/lw_back_end_impl.hh @@ -0,0 +1,1486 @@ + +#include "encumbered/cpu/full/op_class.hh" +#include "cpu/ozone/lw_back_end.hh" + +template +void +LWBackEnd::generateTrapEvent(Tick latency) +{ + DPRINTF(BE, "Generating trap event\n"); + + TrapEvent *trap = new TrapEvent(this); + + trap->schedule(curTick + latency); + + thread->trapPending = true; +} + +template +int +LWBackEnd::wakeDependents(DynInstPtr &inst) +{ + assert(!inst->isSquashed()); + std::vector &dependents = inst->getDependents(); + int num_outputs = dependents.size(); + + DPRINTF(BE, "Waking instruction [sn:%lli] dependents in IQ\n", inst->seqNum); + + for (int i = 0; i < num_outputs; i++) { + DynInstPtr dep_inst = dependents[i]; + dep_inst->markSrcRegReady(); + DPRINTF(BE, "Marking source reg ready [sn:%lli] in IQ\n", dep_inst->seqNum); + + if (dep_inst->readyToIssue() && dep_inst->isInROB() && + !dep_inst->isNonSpeculative()) { + DPRINTF(BE, "Adding instruction to exeList [sn:%lli]\n", + dep_inst->seqNum); + exeList.push(dep_inst); + if (dep_inst->iqItValid) { + DPRINTF(BE, "Removing instruction from waiting list\n"); + waitingList.erase(dep_inst->iqIt); + waitingInsts--; + dep_inst->iqItValid = false; + assert(waitingInsts >= 0); + } + if (dep_inst->isMemRef()) { + removeWaitingMemOp(dep_inst); + DPRINTF(BE, "Issued a waiting mem op [sn:%lli]\n", + dep_inst->seqNum); 
+ } + } + } + return num_outputs; +} + +template +void +LWBackEnd::rescheduleMemInst(DynInstPtr &inst) +{ + replayList.push_front(inst); +} + +template +LWBackEnd::TrapEvent::TrapEvent(LWBackEnd *_be) + : Event(&mainEventQueue, CPU_Tick_Pri), be(_be) +{ + this->setFlags(Event::AutoDelete); +} + +template +void +LWBackEnd::TrapEvent::process() +{ + be->trapSquash = true; +} + +template +const char * +LWBackEnd::TrapEvent::description() +{ + return "Trap event"; +} + +template +void +LWBackEnd::replayMemInst(DynInstPtr &inst) +{ + bool found_inst = false; + while (!replayList.empty()) { + exeList.push(replayList.front()); + if (replayList.front() == inst) { + found_inst = true; + } + replayList.pop_front(); + } + assert(found_inst); +} + +template +LWBackEnd::LdWritebackEvent::LdWritebackEvent(DynInstPtr &_inst, + LWBackEnd *_be) + : Event(&mainEventQueue), inst(_inst), be(_be), dcacheMiss(false) +{ + this->setFlags(Event::AutoDelete); +} + +template +void +LWBackEnd::LdWritebackEvent::process() +{ + DPRINTF(BE, "Load writeback event [sn:%lli]\n", inst->seqNum); +// DPRINTF(Activity, "Activity: Ld Writeback event [sn:%lli]\n", inst->seqNum); + + //iewStage->ldstQueue.removeMSHR(inst->threadNumber,inst->seqNum); + +// iewStage->wakeCPU(); + + if (dcacheMiss) { + be->removeDcacheMiss(inst); + } + + if (inst->isSquashed()) { + inst = NULL; + return; + } + + if (!inst->isExecuted()) { + inst->setExecuted(); + + // Execute again to copy data to proper place. 
+ inst->completeAcc(); + } + + // Need to insert instruction into queue to commit + be->instToCommit(inst); + + //wroteToTimeBuffer = true; +// iewStage->activityThisCycle(); + + inst = NULL; +} + +template +const char * +LWBackEnd::LdWritebackEvent::description() +{ + return "Load writeback event"; +} + + +template +LWBackEnd::DCacheCompletionEvent::DCacheCompletionEvent(LWBackEnd *_be) + : Event(&mainEventQueue, CPU_Tick_Pri), be(_be) +{ +} + +template +void +LWBackEnd::DCacheCompletionEvent::process() +{ +} + +template +const char * +LWBackEnd::DCacheCompletionEvent::description() +{ + return "Cache completion event"; +} + +template +LWBackEnd::LWBackEnd(Params *params) + : d2i(5, 5), i2e(5, 5), e2c(5, 5), numInstsToWB(5, 5), + xcSquash(false), cacheCompletionEvent(this), + dcacheInterface(params->dcacheInterface), width(params->backEndWidth), + exactFullStall(true) +{ + numROBEntries = params->numROBEntries; + numInsts = 0; + numDispatchEntries = 32; + maxOutstandingMemOps = 4; + numWaitingMemOps = 0; + waitingInsts = 0; +// IQ.setBE(this); + LSQ.setBE(this); + + // Setup IQ and LSQ with their parameters here. + instsToDispatch = d2i.getWire(-1); + + instsToExecute = i2e.getWire(-1); + +// IQ.setIssueExecQueue(&i2e); + + dispatchWidth = params->dispatchWidth ? params->dispatchWidth : width; + issueWidth = params->issueWidth ? params->issueWidth : width; + wbWidth = params->wbWidth ? params->wbWidth : width; + commitWidth = params->commitWidth ? 
params->commitWidth : width; + + LSQ.init(params, params->LQEntries, params->SQEntries, 0); + + dispatchStatus = Running; +} + +template +std::string +LWBackEnd::name() const +{ + return cpu->name() + ".backend"; +} + +template +void +LWBackEnd::regStats() +{ + using namespace Stats; + rob_cap_events + .init(cpu->number_of_threads) + .name(name() + ".ROB:cap_events") + .desc("number of cycles where ROB cap was active") + .flags(total) + ; + + rob_cap_inst_count + .init(cpu->number_of_threads) + .name(name() + ".ROB:cap_inst") + .desc("number of instructions held up by ROB cap") + .flags(total) + ; + + iq_cap_events + .init(cpu->number_of_threads) + .name(name() +".IQ:cap_events" ) + .desc("number of cycles where IQ cap was active") + .flags(total) + ; + + iq_cap_inst_count + .init(cpu->number_of_threads) + .name(name() + ".IQ:cap_inst") + .desc("number of instructions held up by IQ cap") + .flags(total) + ; + + + exe_inst + .init(cpu->number_of_threads) + .name(name() + ".ISSUE:count") + .desc("number of insts issued") + .flags(total) + ; + + exe_swp + .init(cpu->number_of_threads) + .name(name() + ".ISSUE:swp") + .desc("number of swp insts issued") + .flags(total) + ; + + exe_nop + .init(cpu->number_of_threads) + .name(name() + ".ISSUE:nop") + .desc("number of nop insts issued") + .flags(total) + ; + + exe_refs + .init(cpu->number_of_threads) + .name(name() + ".ISSUE:refs") + .desc("number of memory reference insts issued") + .flags(total) + ; + + exe_loads + .init(cpu->number_of_threads) + .name(name() + ".ISSUE:loads") + .desc("number of load insts issued") + .flags(total) + ; + + exe_branches + .init(cpu->number_of_threads) + .name(name() + ".ISSUE:branches") + .desc("Number of branches issued") + .flags(total) + ; + + issued_ops + .init(cpu->number_of_threads) + .name(name() + ".ISSUE:op_count") + .desc("number of insts issued") + .flags(total) + ; + +/* + for (int i=0; inumber_of_threads) + .name(name() + ".LSQ:forw_loads") + .desc("number of loads forwarded 
via LSQ") + .flags(total) + ; + + inv_addr_loads + .init(cpu->number_of_threads) + .name(name() + ".ISSUE:addr_loads") + .desc("number of invalid-address loads") + .flags(total) + ; + + inv_addr_swpfs + .init(cpu->number_of_threads) + .name(name() + ".ISSUE:addr_swpfs") + .desc("number of invalid-address SW prefetches") + .flags(total) + ; + + lsq_blocked_loads + .init(cpu->number_of_threads) + .name(name() + ".LSQ:blocked_loads") + .desc("number of ready loads not issued due to memory disambiguation") + .flags(total) + ; + + lsqInversion + .name(name() + ".ISSUE:lsq_invert") + .desc("Number of times LSQ instruction issued early") + ; + + n_issued_dist + .init(issueWidth + 1) + .name(name() + ".ISSUE:issued_per_cycle") + .desc("Number of insts issued each cycle") + .flags(total | pdf | dist) + ; + issue_delay_dist + .init(Num_OpClasses,0,99,2) + .name(name() + ".ISSUE:") + .desc("cycles from operands ready to issue") + .flags(pdf | cdf) + ; + + queue_res_dist + .init(Num_OpClasses, 0, 99, 2) + .name(name() + ".IQ:residence:") + .desc("cycles from dispatch to issue") + .flags(total | pdf | cdf ) + ; + for (int i = 0; i < Num_OpClasses; ++i) { + queue_res_dist.subname(i, opClassStrings[i]); + } + + writeback_count + .init(cpu->number_of_threads) + .name(name() + ".WB:count") + .desc("cumulative count of insts written-back") + .flags(total) + ; + + producer_inst + .init(cpu->number_of_threads) + .name(name() + ".WB:producers") + .desc("num instructions producing a value") + .flags(total) + ; + + consumer_inst + .init(cpu->number_of_threads) + .name(name() + ".WB:consumers") + .desc("num instructions consuming a value") + .flags(total) + ; + + wb_penalized + .init(cpu->number_of_threads) + .name(name() + ".WB:penalized") + .desc("number of instrctions required to write to 'other' IQ") + .flags(total) + ; + + + wb_penalized_rate + .name(name() + ".WB:penalized_rate") + .desc ("fraction of instructions written-back that wrote to 'other' IQ") + .flags(total) + ; + + 
wb_penalized_rate = wb_penalized / writeback_count; + + wb_fanout + .name(name() + ".WB:fanout") + .desc("average fanout of values written-back") + .flags(total) + ; + + wb_fanout = producer_inst / consumer_inst; + + wb_rate + .name(name() + ".WB:rate") + .desc("insts written-back per cycle") + .flags(total) + ; + wb_rate = writeback_count / cpu->numCycles; + + stat_com_inst + .init(cpu->number_of_threads) + .name(name() + ".COM:count") + .desc("Number of instructions committed") + .flags(total) + ; + + stat_com_swp + .init(cpu->number_of_threads) + .name(name() + ".COM:swp_count") + .desc("Number of s/w prefetches committed") + .flags(total) + ; + + stat_com_refs + .init(cpu->number_of_threads) + .name(name() + ".COM:refs") + .desc("Number of memory references committed") + .flags(total) + ; + + stat_com_loads + .init(cpu->number_of_threads) + .name(name() + ".COM:loads") + .desc("Number of loads committed") + .flags(total) + ; + + stat_com_membars + .init(cpu->number_of_threads) + .name(name() + ".COM:membars") + .desc("Number of memory barriers committed") + .flags(total) + ; + + stat_com_branches + .init(cpu->number_of_threads) + .name(name() + ".COM:branches") + .desc("Number of branches committed") + .flags(total) + ; + n_committed_dist + .init(0,commitWidth,1) + .name(name() + ".COM:committed_per_cycle") + .desc("Number of insts commited each cycle") + .flags(pdf) + ; + + // + // Commit-Eligible instructions... + // + // -> The number of instructions eligible to commit in those + // cycles where we reached our commit BW limit (less the number + // actually committed) + // + // -> The average value is computed over ALL CYCLES... 
not just + // the BW limited cycles + // + // -> The standard deviation is computed only over cycles where + // we reached the BW limit + // + commit_eligible + .init(cpu->number_of_threads) + .name(name() + ".COM:bw_limited") + .desc("number of insts not committed due to BW limits") + .flags(total) + ; + + commit_eligible_samples + .name(name() + ".COM:bw_lim_events") + .desc("number cycles where commit BW limit reached") + ; + + ROB_fcount + .name(name() + ".ROB:full_count") + .desc("number of cycles where ROB was full") + ; + + ROB_count + .init(cpu->number_of_threads) + .name(name() + ".ROB:occupancy") + .desc(name() + ".ROB occupancy (cumulative)") + .flags(total) + ; + + ROB_full_rate + .name(name() + ".ROB:full_rate") + .desc("ROB full per cycle") + ; + ROB_full_rate = ROB_fcount / cpu->numCycles; + + ROB_occ_rate + .name(name() + ".ROB:occ_rate") + .desc("ROB occupancy rate") + .flags(total) + ; + ROB_occ_rate = ROB_count / cpu->numCycles; + + ROB_occ_dist + .init(cpu->number_of_threads,0,numROBEntries,2) + .name(name() + ".ROB:occ_dist") + .desc("ROB Occupancy per cycle") + .flags(total | cdf) + ; + +// IQ.regStats(); +} + +template +void +LWBackEnd::setCommBuffer(TimeBuffer *_comm) +{ + comm = _comm; + toIEW = comm->getWire(0); + fromCommit = comm->getWire(-1); +} + +#if FULL_SYSTEM +template +void +LWBackEnd::checkInterrupts() +{ + if (cpu->checkInterrupts && + cpu->check_interrupts() && + !cpu->inPalMode(thread->readPC()) && + !trapSquash && + !xcSquash) { + // Will need to squash all instructions currently in flight and have + // the interrupt handler restart at the last non-committed inst. + // Most of that can be handled through the trap() function. The + // processInterrupts() function really just checks for interrupts + // and then calls trap() if there is an interrupt present. + + // Not sure which thread should be the one to interrupt. For now + // always do thread 0. 
+ assert(!thread->inSyscall); + thread->inSyscall = true; + + // CPU will handle implementation of the interrupt. + cpu->processInterrupts(); + + // Now squash or record that I need to squash this cycle. + commitStatus = TrapPending; + + // Exit state update mode to avoid accidental updating. + thread->inSyscall = false; + + // Generate trap squash event. + generateTrapEvent(); + + DPRINTF(BE, "Interrupt detected.\n"); + } +} + +template +void +LWBackEnd::handleFault(Fault &fault, Tick latency) +{ + DPRINTF(BE, "Handling fault!"); + + assert(!thread->inSyscall); + + thread->inSyscall = true; + + // Consider holding onto the trap and waiting until the trap event + // happens for this to be executed. + fault->invoke(thread->getXCProxy()); + + // Exit state update mode to avoid accidental updating. + thread->inSyscall = false; + + commitStatus = TrapPending; + + // Generate trap squash event. + generateTrapEvent(latency); +} +#endif + +template +void +LWBackEnd::tick() +{ + DPRINTF(BE, "Ticking back end\n"); + + ROB_count[0]+= numInsts; + + wbCycle = 0; + +#if FULL_SYSTEM + checkInterrupts(); + + if (trapSquash) { + assert(!xcSquash); + squashFromTrap(); + } else if (xcSquash) { + squashFromXC(); + } else if (fetchHasFault && robEmpty() && frontEnd->isEmpty()) { + DPRINTF(BE, "ROB and front end empty, handling fetch fault\n"); + Fault fetch_fault = frontEnd->getFault(); + if (fetch_fault == NoFault) { + DPRINTF(BE, "Fetch no longer has a fault, cancelling out.\n"); + fetchHasFault = false; + } else { + handleFault(fetch_fault); + fetchHasFault = false; + } + } +#endif + + // Read in any done instruction information and update the IQ or LSQ. 
+ updateStructures(); + + if (dispatchStatus != Blocked) { + dispatchInsts(); + } else { + checkDispatchStatus(); + } + + if (commitStatus != TrapPending) { + executeInsts(); + + commitInsts(); + } + + LSQ.writebackStores(); + + DPRINTF(BE, "Waiting insts: %i, mem ops: %i, ROB entries in use: %i, " + "LSQ loads: %i, LSQ stores: %i\n", + waitingInsts, numWaitingMemOps, numInsts, + LSQ.numLoads(), LSQ.numStores()); + +#ifdef DEBUG + assert(numInsts == instList.size()); + assert(waitingInsts == waitingList.size()); + assert(numWaitingMemOps == waitingMemOps.size()); +#endif +} + +template +void +LWBackEnd::updateStructures() +{ + if (fromCommit->doneSeqNum) { + LSQ.commitLoads(fromCommit->doneSeqNum); + LSQ.commitStores(fromCommit->doneSeqNum); + } + + if (fromCommit->nonSpecSeqNum) { + if (fromCommit->uncached) { +// LSQ.executeLoad(fromCommit->lqIdx); + } else { +// IQ.scheduleNonSpec( +// fromCommit->nonSpecSeqNum); + } + } +} + +template +void +LWBackEnd::addToLSQ(DynInstPtr &inst) +{ + // Do anything LSQ specific here? 
+ LSQ.insert(inst); +} + +template +void +LWBackEnd::dispatchInsts() +{ + DPRINTF(BE, "Trying to dispatch instructions.\n"); + + while (numInsts < numROBEntries && + numWaitingMemOps < maxOutstandingMemOps) { + // Get instruction from front of time buffer + DynInstPtr inst = frontEnd->getInst(); + if (!inst) { + break; + } else if (inst->isSquashed()) { + continue; + } + + ++numInsts; + instList.push_front(inst); + + inst->setInROB(); + + DPRINTF(BE, "Dispatching instruction [sn:%lli] PC:%#x\n", + inst->seqNum, inst->readPC()); + + for (int i = 0; i < inst->numDestRegs(); ++i) + renameTable[inst->destRegIdx(i)] = inst; + + if (inst->readyToIssue() && !inst->isNonSpeculative()) { + DPRINTF(BE, "Instruction [sn:%lli] ready, addding to exeList.\n", + inst->seqNum); + exeList.push(inst); + if (inst->isMemRef()) { + LSQ.insert(inst); + } + } else { + if (inst->isNonSpeculative()) { + inst->setCanCommit(); + DPRINTF(BE, "Adding non speculative instruction\n"); + } + + if (inst->isMemRef()) { + addWaitingMemOp(inst); + LSQ.insert(inst); + } + + DPRINTF(BE, "Instruction [sn:%lli] not ready, addding to " + "waitingList.\n", + inst->seqNum); + waitingList.push_front(inst); + inst->iqIt = waitingList.begin(); + inst->iqItValid = true; + waitingInsts++; + } + } + + // Check if IQ or LSQ is full. If so we'll need to break and stop + // removing instructions. Also update the number of insts to remove + // from the queue. Check here if we don't care about exact stall + // conditions. 
+/* + bool stall = false; + if (IQ.isFull()) { + DPRINTF(BE, "IQ is full!\n"); + stall = true; + } else if (LSQ.isFull()) { + DPRINTF(BE, "LSQ is full!\n"); + stall = true; + } else if (isFull()) { + DPRINTF(BE, "ROB is full!\n"); + stall = true; + ROB_fcount++; + } + if (stall) { + d2i.advance(); + dispatchStall(); + return; + } +*/ +} + +template +void +LWBackEnd::dispatchStall() +{ + dispatchStatus = Blocked; + if (!cpu->decoupledFrontEnd) { + // Tell front end to stall here through a timebuffer, or just tell + // it directly. + } +} + +template +void +LWBackEnd::checkDispatchStatus() +{ + DPRINTF(BE, "Checking dispatch status\n"); + assert(dispatchStatus == Blocked); + if (!LSQ.isFull() && !isFull()) { + DPRINTF(BE, "Dispatch no longer blocked\n"); + dispatchStatus = Running; + dispatchInsts(); + } +} + +template +void +LWBackEnd::executeInsts() +{ + DPRINTF(BE, "Trying to execute instructions\n"); + + int num_executed = 0; + while (!exeList.empty() && num_executed < issueWidth) { + DynInstPtr inst = exeList.top(); + + DPRINTF(BE, "Executing inst [sn:%lli] PC: %#x\n", + inst->seqNum, inst->readPC()); + + // Check if the instruction is squashed; if so then skip it + // and don't count it towards the FU usage. + if (inst->isSquashed()) { + DPRINTF(BE, "Execute: Instruction was squashed.\n"); + + // Not sure how to handle this plus the method of sending # of + // instructions to use. Probably will just have to count it + // towards the bandwidth usage, but not the FU usage. + ++num_executed; + + // Consider this instruction executed so that commit can go + // ahead and retire the instruction. + inst->setExecuted(); + + // Not sure if I should set this here or just let commit try to + // commit any squashed instructions. I like the latter a bit more. + inst->setCanCommit(); + +// ++iewExecSquashedInsts; + exeList.pop(); + + continue; + } + + Fault fault = NoFault; + + // Execute instruction. 
+ // Note that if the instruction faults, it will be handled + // at the commit stage. + if (inst->isMemRef() && + (!inst->isDataPrefetch() && !inst->isInstPrefetch())) { + if (dcacheInterface->isBlocked()) { + // Should I move the instruction aside? + DPRINTF(BE, "Execute: dcache is blocked\n"); + break; + } + DPRINTF(BE, "Execute: Initiating access for memory " + "reference.\n"); + + if (inst->isLoad()) { + LSQ.executeLoad(inst); + } else if (inst->isStore()) { + LSQ.executeStore(inst); + if (inst->req && !(inst->req->flags & LOCKED)) { + inst->setExecuted(); + + instToCommit(inst); + } + } else { + panic("Unknown mem type!"); + } + } else { + inst->execute(); + + inst->setExecuted(); + + instToCommit(inst); + } + + updateExeInstStats(inst); + + ++funcExeInst; + ++num_executed; + // keep an instruction count + thread->numInst++; + thread->numInsts++; + + exeList.pop(); + + if (inst->mispredicted()) { + squashDueToBranch(inst); + break; + } else if (LSQ.violation()) { + // Get the DynInst that caused the violation. Note that this + // clears the violation signal. + DynInstPtr violator; + violator = LSQ.getMemDepViolator(); + + DPRINTF(BE, "LDSTQ detected a violation. Violator PC: " + "%#x, inst PC: %#x. Addr is: %#x.\n", + violator->readPC(), inst->readPC(), inst->physEffAddr); + + // Squash. 
+ squashDueToMemViolation(inst); + } + } + + issued_ops[0]+= num_executed; + n_issued_dist[num_executed]++; +} + +template +void +LWBackEnd::instToCommit(DynInstPtr &inst) +{ + + DPRINTF(BE, "Sending instructions to commit [sn:%lli] PC %#x.\n", + inst->seqNum, inst->readPC()); + + if (!inst->isSquashed()) { + DPRINTF(BE, "Writing back instruction [sn:%lli] PC %#x.\n", + inst->seqNum, inst->readPC()); + + inst->setCanCommit(); + + if (inst->isExecuted()) { + inst->setCompleted(); + int dependents = wakeDependents(inst); + if (dependents) { + producer_inst[0]++; + consumer_inst[0]+= dependents; + } + } + } + + writeback_count[0]++; +} + +template +void +LWBackEnd::writebackInsts() +{ + int wb_width = wbWidth; + // Using this method I'm not quite sure how to prevent an + // instruction from waking its own dependents multiple times, + // without the guarantee that commit always has enough bandwidth + // to accept all instructions being written back. This guarantee + // might not be too unrealistic. + InstListIt wb_inst_it = writeback.begin(); + InstListIt wb_end_it = writeback.end(); + int inst_num = 0; + int consumer_insts = 0; + + for (; inst_num < wb_width && + wb_inst_it != wb_end_it; inst_num++) { + DynInstPtr inst = (*wb_inst_it); + + // Some instructions will be sent to commit without having + // executed because they need commit to handle them. + // E.g. Uncached loads have not actually executed when they + // are first sent to commit. Instead commit must tell the LSQ + // when it's ready to execute the uncached load. 
+ if (!inst->isSquashed()) { + DPRINTF(BE, "Writing back instruction [sn:%lli] PC %#x.\n", + inst->seqNum, inst->readPC()); + + inst->setCanCommit(); + inst->setCompleted(); + + if (inst->isExecuted()) { + int dependents = wakeDependents(inst); + if (dependents) { + producer_inst[0]++; + consumer_insts+= dependents; + } + } + } + + writeback.erase(wb_inst_it++); + } + LSQ.writebackStores(); + consumer_inst[0]+= consumer_insts; + writeback_count[0]+= inst_num; +} + +template +bool +LWBackEnd::commitInst(int inst_num) +{ + // Read instruction from the head of the ROB + DynInstPtr inst = instList.back(); + + // Make sure instruction is valid + assert(inst); + + if (!inst->readyToCommit()) + return false; + + DPRINTF(BE, "Trying to commit instruction [sn:%lli] PC:%#x\n", + inst->seqNum, inst->readPC()); + + thread->setPC(inst->readPC()); + thread->setNextPC(inst->readNextPC()); + inst->reachedCommit = true; + + // If the instruction is not executed yet, then it is a non-speculative + // or store inst. Signal backwards that it should be executed. + if (!inst->isExecuted()) { + if (inst->isNonSpeculative()) { +#if !FULL_SYSTEM + // Hack to make sure syscalls aren't executed until all stores + // write back their data. This direct communication shouldn't + // be used for anything other than this. + if (inst_num > 0 || LSQ.hasStoresToWB()) +#else + if ((inst->isMemBarrier() || inst->isWriteBarrier() || + inst->isQuiesce()) && + LSQ.hasStoresToWB()) +#endif + { + DPRINTF(BE, "Waiting for all stores to writeback.\n"); + return false; + } + + DPRINTF(BE, "Encountered a store or non-speculative " + "instruction at the head of the ROB, PC %#x.\n", + inst->readPC()); + + // Send back the non-speculative instruction's sequence number. 
+ if (inst->iqItValid) { + DPRINTF(BE, "Removing instruction from waiting list\n"); + waitingList.erase(inst->iqIt); + inst->iqItValid = false; + waitingInsts--; + assert(waitingInsts >= 0); + if (inst->isStore()) + removeWaitingMemOp(inst); + } + + exeList.push(inst); + + // Change the instruction so it won't try to commit again until + // it is executed. + inst->clearCanCommit(); + +// ++commitNonSpecStalls; + + return false; + } else if (inst->isLoad()) { + DPRINTF(BE, "[sn:%lli]: Uncached load, PC %#x.\n", + inst->seqNum, inst->readPC()); + + // Send back the non-speculative instruction's sequence + // number. Maybe just tell the lsq to re-execute the load. + + // Send back the non-speculative instruction's sequence number. + if (inst->iqItValid) { + DPRINTF(BE, "Removing instruction from waiting list\n"); + waitingList.erase(inst->iqIt); + inst->iqItValid = false; + waitingInsts--; + assert(waitingInsts >= 0); + removeWaitingMemOp(inst); + } + replayMemInst(inst); + + inst->clearCanCommit(); + + return false; + } else { + panic("Trying to commit un-executed instruction " + "of unknown type!\n"); + } + } + + // Now check if it's one of the special trap or barrier or + // serializing instructions. + if (inst->isThreadSync()) + { + // Not handled for now. + panic("Thread sync instructions are not handled yet.\n"); + } + + // Check if the instruction caused a fault. If so, trap. 
+ Fault inst_fault = inst->getFault(); + + if (inst_fault != NoFault) { + if (!inst->isNop()) { + DPRINTF(BE, "Inst [sn:%lli] PC %#x has a fault\n", + inst->seqNum, inst->readPC()); + thread->setInst( + static_cast(inst->staticInst->machInst)); +#if FULL_SYSTEM + handleFault(inst_fault); + return false; +#else // !FULL_SYSTEM + panic("fault (%d) detected @ PC %08p", inst_fault, + inst->PC); +#endif // FULL_SYSTEM + } + } + + if (inst->isControl()) { +// ++commitCommittedBranches; + } + + int freed_regs = 0; + + for (int i = 0; i < inst->numDestRegs(); ++i) { + DPRINTF(BE, "Commit rename map setting reg %i to [sn:%lli]\n", + (int)inst->destRegIdx(i), inst->seqNum); + thread->renameTable[inst->destRegIdx(i)] = inst; + ++freed_regs; + } + + if (inst->traceData) { + inst->traceData->finalize(); + inst->traceData = NULL; + } + + inst->clearDependents(); + + frontEnd->addFreeRegs(freed_regs); + + instList.pop_back(); + + --numInsts; + cpu->numInst++; + thread->numInsts++; + ++thread->funcExeInst; + // Maybe move this to where teh fault is handled; if the fault is handled, + // don't try to set this myself as the fault will set it. If not, then + // I set thread->PC = thread->nextPC and thread->nextPC = thread->nextPC + 4. + thread->setPC(thread->readNextPC()); + updateComInstStats(inst); + + // Write the done sequence number here. +// LSQ.commitLoads(inst->seqNum); +// LSQ.commitStores(inst->seqNum); + toIEW->doneSeqNum = inst->seqNum; + +#if FULL_SYSTEM + int count = 0; + Addr oldpc; + do { + if (count == 0) + assert(!thread->inSyscall && !thread->trapPending); + oldpc = thread->readPC(); + cpu->system->pcEventQueue.service( + thread->getXCProxy()); + count++; + } while (oldpc != thread->readPC()); + if (count > 1) { + DPRINTF(BE, "PC skip function event, stopping commit\n"); + xcSquash = true; + return false; + } +#endif + return true; +} + +template +void +LWBackEnd::commitInsts() +{ + int commit_width = commitWidth ? 
commitWidth : width; + + // Not sure this should be a loop or not. + int inst_num = 0; + while (!instList.empty() && inst_num < commit_width) { + if (instList.back()->isSquashed()) { + instList.back()->clearDependents(); + instList.pop_back(); + --numInsts; + continue; + } + + if (!commitInst(inst_num++)) { + DPRINTF(BE, "Can't commit, Instruction [sn:%lli] PC " + "%#x is head of ROB and not ready\n", + instList.back()->seqNum, instList.back()->readPC()); + break; + } + } + n_committed_dist.sample(inst_num); +} + +template +void +LWBackEnd::squash(const InstSeqNum &sn) +{ + LSQ.squash(sn); + + int freed_regs = 0; + InstListIt waiting_list_end = waitingList.end(); + InstListIt insts_it = waitingList.begin(); + + while (insts_it != waiting_list_end && (*insts_it)->seqNum > sn) + { + if ((*insts_it)->isSquashed()) { + ++insts_it; + continue; + } + DPRINTF(BE, "Squashing instruction on waitingList PC %#x, [sn:%lli].\n", + (*insts_it)->readPC(), + (*insts_it)->seqNum); + + if ((*insts_it)->isMemRef()) { + DPRINTF(BE, "Squashing a waiting mem op [sn:%lli]\n", + (*insts_it)->seqNum); + removeWaitingMemOp((*insts_it)); + } + + waitingList.erase(insts_it++); + waitingInsts--; + } + assert(waitingInsts >= 0); + + insts_it = instList.begin(); + + while (!instList.empty() && (*insts_it)->seqNum > sn) + { + if ((*insts_it)->isSquashed()) { + ++insts_it; + continue; + } + DPRINTF(BE, "Squashing instruction on inst list PC %#x, [sn:%lli].\n", + (*insts_it)->readPC(), + (*insts_it)->seqNum); + + // Mark the instruction as squashed, and ready to commit so that + // it can drain out of the pipeline. 
+ (*insts_it)->setSquashed(); + + (*insts_it)->setCanCommit(); + + (*insts_it)->removeInROB(); + + for (int i = 0; i < (*insts_it)->numDestRegs(); ++i) { + DynInstPtr prev_dest = (*insts_it)->getPrevDestInst(i); + DPRINTF(BE, "Commit rename map setting reg %i to [sn:%lli]\n", + (int)(*insts_it)->destRegIdx(i), prev_dest->seqNum); + renameTable[(*insts_it)->destRegIdx(i)] = prev_dest; + ++freed_regs; + } + + (*insts_it)->clearDependents(); + + instList.erase(insts_it++); + --numInsts; + } + + insts_it = waitingList.begin(); + while (!waitingList.empty() && insts_it != waitingList.end()) { + if ((*insts_it)->seqNum < sn) { + ++insts_it; + continue; + } + assert((*insts_it)->isSquashed()); + + waitingList.erase(insts_it++); + waitingInsts--; + } + + frontEnd->addFreeRegs(freed_regs); +} + +template +void +LWBackEnd::squashFromXC() +{ + InstSeqNum squashed_inst = robEmpty() ? 0 : instList.back()->seqNum - 1; + squash(squashed_inst); + frontEnd->squash(squashed_inst, thread->readPC(), + false, false); + + thread->trapPending = false; + thread->inSyscall = false; + xcSquash = false; + commitStatus = Running; +} + +template +void +LWBackEnd::squashFromTrap() +{ + InstSeqNum squashed_inst = robEmpty() ? 
0 : instList.back()->seqNum - 1; + squash(squashed_inst); + frontEnd->squash(squashed_inst, thread->readPC(), + false, false); + + thread->trapPending = false; + thread->inSyscall = false; + trapSquash = false; + commitStatus = Running; +} + +template +void +LWBackEnd::squashDueToBranch(DynInstPtr &inst) +{ + // Update the branch predictor state I guess + DPRINTF(BE, "Squashing due to branch [sn:%lli], will restart at PC %#x\n", + inst->seqNum, inst->readNextPC()); + squash(inst->seqNum); + frontEnd->squash(inst->seqNum, inst->readNextPC(), + true, inst->mispredicted()); +} + +template +void +LWBackEnd::squashDueToMemViolation(DynInstPtr &inst) +{ + // Update the branch predictor state I guess + DPRINTF(BE, "Squashing due to violation [sn:%lli], will restart at PC %#x\n", + inst->seqNum, inst->readNextPC()); + squash(inst->seqNum); + frontEnd->squash(inst->seqNum, inst->readNextPC(), + false, inst->mispredicted()); +} + +template +void +LWBackEnd::squashDueToMemBlocked(DynInstPtr &inst) +{ + DPRINTF(IEW, "Memory blocked, squashing load and younger insts, " + "PC: %#x [sn:%i].\n", inst->readPC(), inst->seqNum); + + squash(inst->seqNum - 1); + frontEnd->squash(inst->seqNum - 1, inst->readPC()); +} + +template +void +LWBackEnd::fetchFault(Fault &fault) +{ + faultFromFetch = fault; + fetchHasFault = true; +} + +template +void +LWBackEnd::updateExeInstStats(DynInstPtr &inst) +{ + int thread_number = inst->threadNumber; + + // + // Pick off the software prefetches + // +#ifdef TARGET_ALPHA + if (inst->isDataPrefetch()) + exe_swp[thread_number]++; + else + exe_inst[thread_number]++; +#else + exe_inst[thread_number]++; +#endif + + // + // Control operations + // + if (inst->isControl()) + exe_branches[thread_number]++; + + // + // Memory operations + // + if (inst->isMemRef()) { + exe_refs[thread_number]++; + + if (inst->isLoad()) + exe_loads[thread_number]++; + } +} + +template +void +LWBackEnd::updateComInstStats(DynInstPtr &inst) +{ + unsigned thread = 
inst->threadNumber; + + // + // Pick off the software prefetches + // +#ifdef TARGET_ALPHA + if (inst->isDataPrefetch()) { + stat_com_swp[thread]++; + } else { + stat_com_inst[thread]++; + } +#else + stat_com_inst[thread]++; +#endif + + // + // Control Instructions + // + if (inst->isControl()) + stat_com_branches[thread]++; + + // + // Memory references + // + if (inst->isMemRef()) { + stat_com_refs[thread]++; + + if (inst->isLoad()) { + stat_com_loads[thread]++; + } + } + + if (inst->isMemBarrier()) { + stat_com_membars[thread]++; + } +} + +template +void +LWBackEnd::dumpInsts() +{ + int num = 0; + int valid_num = 0; + + InstListIt inst_list_it = --(instList.end()); + + cprintf("ExeList size: %i\n", exeList.size()); + + cprintf("Inst list size: %i\n", instList.size()); + + while (inst_list_it != instList.end()) + { + cprintf("Instruction:%i\n", + num); + if (!(*inst_list_it)->isSquashed()) { + if (!(*inst_list_it)->isIssued()) { + ++valid_num; + cprintf("Count:%i\n", valid_num); + } else if ((*inst_list_it)->isMemRef() && + !(*inst_list_it)->memOpDone) { + // Loads that have not been marked as executed still count + // towards the total instructions. 
+ ++valid_num; + cprintf("Count:%i\n", valid_num); + } + } + + cprintf("PC:%#x\n[sn:%lli]\n[tid:%i]\n" + "Issued:%i\nSquashed:%i\n", + (*inst_list_it)->readPC(), + (*inst_list_it)->seqNum, + (*inst_list_it)->threadNumber, + (*inst_list_it)->isIssued(), + (*inst_list_it)->isSquashed()); + + if ((*inst_list_it)->isMemRef()) { + cprintf("MemOpDone:%i\n", (*inst_list_it)->memOpDone); + } + + cprintf("\n"); + + inst_list_it--; + ++num; + } + + cprintf("Waiting list size: %i\n", waitingList.size()); + + inst_list_it = --(waitingList.end()); + + while (inst_list_it != waitingList.end()) + { + cprintf("Instruction:%i\n", + num); + if (!(*inst_list_it)->isSquashed()) { + if (!(*inst_list_it)->isIssued()) { + ++valid_num; + cprintf("Count:%i\n", valid_num); + } else if ((*inst_list_it)->isMemRef() && + !(*inst_list_it)->memOpDone) { + // Loads that have not been marked as executed still count + // towards the total instructions. + ++valid_num; + cprintf("Count:%i\n", valid_num); + } + } + + cprintf("PC:%#x\n[sn:%lli]\n[tid:%i]\n" + "Issued:%i\nSquashed:%i\n", + (*inst_list_it)->readPC(), + (*inst_list_it)->seqNum, + (*inst_list_it)->threadNumber, + (*inst_list_it)->isIssued(), + (*inst_list_it)->isSquashed()); + + if ((*inst_list_it)->isMemRef()) { + cprintf("MemOpDone:%i\n", (*inst_list_it)->memOpDone); + } + + cprintf("\n"); + + inst_list_it--; + ++num; + } + + cprintf("waitingMemOps list size: %i\n", waitingMemOps.size()); + + MemIt waiting_it = waitingMemOps.begin(); + + while (waiting_it != waitingMemOps.end()) + { + cprintf("[sn:%lli] ", (*waiting_it)); + waiting_it++; + ++num; + } + cprintf("\n"); +} diff --git a/cpu/ozone/lw_lsq.cc b/cpu/ozone/lw_lsq.cc new file mode 100644 index 000000000..922228b09 --- /dev/null +++ b/cpu/ozone/lw_lsq.cc @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2004-2005 The Regents of The University of Michigan + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "cpu/ozone/ozone_impl.hh" +#include "cpu/ozone/lw_lsq_impl.hh" + +// Force the instantiation of LDSTQ for all the implementations we care about. +template class OzoneLWLSQ; + diff --git a/cpu/ozone/lw_lsq.hh b/cpu/ozone/lw_lsq.hh new file mode 100644 index 000000000..2b2c25b58 --- /dev/null +++ b/cpu/ozone/lw_lsq.hh @@ -0,0 +1,649 @@ +/* + * Copyright (c) 2004-2005 The Regents of The University of Michigan + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __CPU_OZONE_LW_LSQ_HH__ +#define __CPU_OZONE_LW_LSQ_HH__ + +#include +#include +#include +#include + +#include "arch/faults.hh" +#include "arch/isa_traits.hh" +#include "config/full_system.hh" +#include "base/hashmap.hh" +#include "cpu/inst_seq.hh" +#include "mem/mem_interface.hh" +//#include "mem/page_table.hh" +#include "sim/sim_object.hh" + +class PageTable; + +/** + * Class that implements the actual LQ and SQ for each specific thread. 
+ * Both are circular queues; load entries are freed upon committing, while + * store entries are freed once they writeback. The LSQUnit tracks if there + * are memory ordering violations, and also detects partial load to store + * forwarding cases (a store only has part of a load's data) that requires + * the load to wait until the store writes back. In the former case it + * holds onto the instruction until the dependence unit looks at it, and + * in the latter it stalls the LSQ until the store writes back. At that + * point the load is replayed. + */ +template +class OzoneLWLSQ { + public: + typedef typename Impl::Params Params; + typedef typename Impl::FullCPU FullCPU; + typedef typename Impl::BackEnd BackEnd; + typedef typename Impl::DynInstPtr DynInstPtr; + typedef typename Impl::IssueStruct IssueStruct; + + typedef TheISA::IntReg IntReg; + + typedef typename std::map::iterator LdMapIt; + + private: + class StoreCompletionEvent : public Event { + public: + /** Constructs a store completion event. */ + StoreCompletionEvent(DynInstPtr &inst, BackEnd *be, + Event *wb_event, OzoneLWLSQ *lsq_ptr); + + /** Processes the store completion event. */ + void process(); + + /** Returns the description of this event. */ + const char *description(); + + private: + /** The store index of the store being written back. */ + DynInstPtr inst; + + BackEnd *be; + /** The writeback event for the store. Needed for store + * conditionals. + */ + Event *wbEvent; + /** The pointer to the LSQ unit that issued the store. */ + OzoneLWLSQ *lsqPtr; + }; + + public: + /** Constructs an LSQ unit. init() must be called prior to use. */ + OzoneLWLSQ(); + + /** Initializes the LSQ unit with the specified number of entries. */ + void init(Params *params, unsigned maxLQEntries, + unsigned maxSQEntries, unsigned id); + + /** Returns the name of the LSQ unit. */ + std::string name() const; + + /** Sets the CPU pointer. 
*/ + void setCPU(FullCPU *cpu_ptr) + { cpu = cpu_ptr; } + + /** Sets the back-end stage pointer. */ + void setBE(BackEnd *be_ptr) + { be = be_ptr; } + + /** Sets the page table pointer. */ + void setPageTable(PageTable *pt_ptr); + + /** Ticks the LSQ unit, which in this case only resets the number of + * used cache ports. + * @todo: Move the number of used ports up to the LSQ level so it can + * be shared by all LSQ units. + */ + void tick() { usedPorts = 0; } + + /** Inserts an instruction. */ + void insert(DynInstPtr &inst); + /** Inserts a load instruction. */ + void insertLoad(DynInstPtr &load_inst); + /** Inserts a store instruction. */ + void insertStore(DynInstPtr &store_inst); + + /** Executes a load instruction. */ + Fault executeLoad(DynInstPtr &inst); + +// Fault executeLoad(int lq_idx); + /** Executes a store instruction. */ + Fault executeStore(DynInstPtr &inst); + + /** Commits the head load. */ + void commitLoad(); + /** Commits loads older than a specific sequence number. */ + void commitLoads(InstSeqNum &youngest_inst); + + /** Commits stores older than a specific sequence number. */ + void commitStores(InstSeqNum &youngest_inst); + + /** Writes back stores. */ + void writebackStores(); + + // @todo: Include stats in the LSQ unit. + //void regStats(); + + /** Clears all the entries in the LQ. */ + void clearLQ(); + + /** Clears all the entries in the SQ. */ + void clearSQ(); + + /** Resizes the LQ to a given size. */ + void resizeLQ(unsigned size); + + /** Resizes the SQ to a given size. */ + void resizeSQ(unsigned size); + + /** Squashes all instructions younger than a specific sequence number. */ + void squash(const InstSeqNum &squashed_num); + + /** Returns if there is a memory ordering violation. Value is reset upon + * call to getMemDepViolator(). + */ + bool violation() { return memDepViolator; } + + /** Returns the memory ordering violator. 
*/ + DynInstPtr getMemDepViolator(); + + /** Returns if a load became blocked due to the memory system. It clears + * the bool's value upon this being called. + */ + bool loadBlocked() + { return isLoadBlocked; } + + void clearLoadBlocked() + { isLoadBlocked = false; } + + bool isLoadBlockedHandled() + { return loadBlockedHandled; } + + void setLoadBlockedHandled() + { loadBlockedHandled = true; } + + /** Returns the number of free entries (min of free LQ and SQ entries). */ + unsigned numFreeEntries(); + + /** Returns the number of loads ready to execute. */ + int numLoadsReady(); + + /** Returns the number of loads in the LQ. */ + int numLoads() { return loads; } + + /** Returns the number of stores in the SQ. */ + int numStores() { return stores; } + + /** Returns if either the LQ or SQ is full. */ + bool isFull() { return lqFull() || sqFull(); } + + /** Returns if the LQ is full. */ + bool lqFull() { return loads >= (LQEntries - 1); } + + /** Returns if the SQ is full. */ + bool sqFull() { return stores >= (SQEntries - 1); } + + /** Debugging function to dump instructions in the LSQ. */ + void dumpInsts(); + + /** Returns the number of instructions in the LSQ. */ + unsigned getCount() { return loads + stores; } + + /** Returns if there are any stores to writeback. */ + bool hasStoresToWB() { return storesToWB; } + + /** Returns the number of stores to writeback. */ + int numStoresToWB() { return storesToWB; } + + /** Returns if the LSQ unit will writeback on this cycle. */ + bool willWB() { return storeQueue.back().canWB && + !storeQueue.back().completed && + !dcacheInterface->isBlocked(); } + + private: + /** Completes the store at the specified index. */ + void completeStore(int store_idx); + + private: + /** Pointer to the CPU. */ + FullCPU *cpu; + + /** Pointer to the back-end stage. */ + BackEnd *be; + + /** Pointer to the D-cache. */ + MemInterface *dcacheInterface; + + /** Pointer to the page table. 
*/ + PageTable *pTable; + + public: + struct SQEntry { + /** Constructs an empty store queue entry. */ + SQEntry() + : inst(NULL), req(NULL), size(0), data(0), + canWB(0), committed(0), completed(0), lqIt(NULL) + { } + + /** Constructs a store queue entry for a given instruction. */ + SQEntry(DynInstPtr &_inst) + : inst(_inst), req(NULL), size(0), data(0), + canWB(0), committed(0), completed(0), lqIt(NULL) + { } + + /** The store instruction. */ + DynInstPtr inst; + /** The memory request for the store. */ + MemReqPtr req; + /** The size of the store. */ + int size; + /** The store data. */ + IntReg data; + /** Whether or not the store can writeback. */ + bool canWB; + /** Whether or not the store is committed. */ + bool committed; + /** Whether or not the store is completed. */ + bool completed; + + typename std::list::iterator lqIt; + }; + + enum Status { + Running, + Idle, + DcacheMissStall, + DcacheMissSwitch + }; + + private: + /** The OzoneLWLSQ thread id. */ + unsigned lsqID; + + /** The status of the LSQ unit. */ + Status _status; + + /** The store queue. */ +// std::vector storeQueue; + std::list storeQueue; + /** The load queue. */ +// std::vector loadQueue; + std::list loadQueue; + + typedef typename std::list::iterator SQIt; + typedef typename std::list::iterator LQIt; + + + struct HashFn { + size_t operator() (const int a) const + { + unsigned hash = (((a >> 14) ^ ((a >> 2) & 0xffff))) & 0x7FFFFFFF; + + return hash; + } + }; + + m5::hash_map SQItHash; + std::queue SQIndices; + m5::hash_map LQItHash; + std::queue LQIndices; + + typedef typename m5::hash_map::iterator LQHashIt; + typedef typename m5::hash_map::iterator SQHashIt; + // Consider making these 16 bits + /** The number of LQ entries. */ + unsigned LQEntries; + /** The number of SQ entries. */ + unsigned SQEntries; + + /** The number of load instructions in the LQ. */ + int loads; + /** The number of store instructions in the SQ (excludes those waiting to + * writeback). 
+ */ + int stores; + + int storesToWB; + + /// @todo Consider moving to a more advanced model with write vs read ports + /** The number of cache ports available each cycle. */ + int cachePorts; + + /** The number of used cache ports in this cycle. */ + int usedPorts; + + //list mshrSeqNums; + + //Stats::Scalar<> dcacheStallCycles; + Counter lastDcacheStall; + + // Make these per thread? + /** Whether or not the LSQ is stalled. */ + bool stalled; + /** The store that causes the stall due to partial store to load + * forwarding. + */ + InstSeqNum stallingStoreIsn; + /** The index of the above store. */ +// int stallingLoadIdx; + LQIt stallingLoad; + + /** Whether or not a load is blocked due to the memory system. It is + * cleared when this value is checked via loadBlocked(). + */ + bool isLoadBlocked; + + bool loadBlockedHandled; + + InstSeqNum blockedLoadSeqNum; + + /** The oldest faulting load instruction. */ + DynInstPtr loadFaultInst; + /** The oldest faulting store instruction. */ + DynInstPtr storeFaultInst; + + /** The oldest load that caused a memory ordering violation. */ + DynInstPtr memDepViolator; + + // Will also need how many read/write ports the Dcache has. Or keep track + // of that in stage that is one level up, and only call executeLoad/Store + // the appropriate number of times. + + public: + /** Executes the load at the given index. */ + template + Fault read(MemReqPtr &req, T &data, int load_idx); + + /** Executes the store at the given index. */ + template + Fault write(MemReqPtr &req, T &data, int store_idx); + + /** Returns the index of the head load instruction. */ +// int getLoadHead() { return loadHead; } + /** Returns the sequence number of the head load instruction. */ + InstSeqNum getLoadHeadSeqNum() + { + if (!loadQueue.empty()) { + return loadQueue.back()->seqNum; + } else { + return 0; + } + + } + + /** Returns the index of the head store instruction. 
*/ +// int getStoreHead() { return storeHead; } + /** Returns the sequence number of the head store instruction. */ + InstSeqNum getStoreHeadSeqNum() + { + if (!storeQueue.empty()) { + return storeQueue.back().inst->seqNum; + } else { + return 0; + } + + } + + /** Returns whether or not the LSQ unit is stalled. */ + bool isStalled() { return stalled; } +}; + +template +template +Fault +OzoneLWLSQ::read(MemReqPtr &req, T &data, int load_idx) +{ + //Depending on issue2execute delay a squashed load could + //execute if it is found to be squashed in the same + //cycle it is scheduled to execute + typename m5::hash_map::iterator + lq_hash_it = LQItHash.find(load_idx); + assert(lq_hash_it != LQItHash.end()); + DynInstPtr inst = (*(*lq_hash_it).second); + + if (inst->isExecuted()) { + panic("Should not reach this point with split ops!"); + + memcpy(&data,req->data,req->size); + + return NoFault; + } + + // Make sure this isn't an uncacheable access + // A bit of a hackish way to get uncached accesses to work only if they're + // at the head of the LSQ and are ready to commit (at the head of the ROB + // too). + // @todo: Fix uncached accesses. 
+ if (req->flags & UNCACHEABLE && + (inst != loadQueue.back() || !inst->reachedCommit)) { + DPRINTF(OzoneLSQ, "[sn:%lli] Uncached load and not head of " + "commit/LSQ!\n", + inst->seqNum); + be->rescheduleMemInst(inst); + return TheISA::genMachineCheckFault(); + } + + // Check the SQ for any previous stores that might lead to forwarding + SQIt sq_it = storeQueue.begin(); + int store_size = 0; + + DPRINTF(OzoneLSQ, "Read called, load idx: %i addr: %#x\n", + load_idx, req->paddr); + + while (sq_it != storeQueue.end() && (*sq_it).inst->seqNum > inst->seqNum) + ++sq_it; + + while (1) { + // End once we've reached the top of the LSQ + if (sq_it == storeQueue.end()) { + break; + } + + assert((*sq_it).inst); + + store_size = (*sq_it).size; + + if (store_size == 0) { + sq_it++; + continue; + } + + // Check if the store data is within the lower and upper bounds of + // addresses that the request needs. + bool store_has_lower_limit = + req->vaddr >= (*sq_it).inst->effAddr; + bool store_has_upper_limit = + (req->vaddr + req->size) <= ((*sq_it).inst->effAddr + + store_size); + bool lower_load_has_store_part = + req->vaddr < ((*sq_it).inst->effAddr + + store_size); + bool upper_load_has_store_part = + (req->vaddr + req->size) > (*sq_it).inst->effAddr; + + // If the store's data has all of the data needed, we can forward. + if (store_has_lower_limit && store_has_upper_limit) { + + int shift_amt = req->vaddr & (store_size - 1); + // Assumes byte addressing + shift_amt = shift_amt << 3; + + // Cast this to type T? 
+ data = (*sq_it).data >> shift_amt; + + req->cmd = Read; + assert(!req->completionEvent); + req->completionEvent = NULL; + req->time = curTick; + assert(!req->data); + req->data = new uint8_t[64]; + + memcpy(req->data, &data, req->size); + + DPRINTF(OzoneLSQ, "Forwarding from store [sn:%lli] to load to " + "[sn:%lli] addr %#x, data %#x\n", + (*sq_it).inst->seqNum, inst->seqNum, req->vaddr, *(req->data)); + + typename BackEnd::LdWritebackEvent *wb = + new typename BackEnd::LdWritebackEvent(inst, + be); + + // We'll say this has a 1 cycle load-store forwarding latency + // for now. + // FIXME - Need to make this a parameter. + wb->schedule(curTick); + + // Should keep track of stat for forwarded data + return NoFault; + } else if ((store_has_lower_limit && lower_load_has_store_part) || + (store_has_upper_limit && upper_load_has_store_part) || + (lower_load_has_store_part && upper_load_has_store_part)) { + // This is the partial store-load forwarding case where a store + // has only part of the load's data. + + // If it's already been written back, then don't worry about + // stalling on it. + if ((*sq_it).completed) { + sq_it++; + break; + } + + // Must stall load and force it to retry, so long as it's the oldest + // load that needs to do so. + if (!stalled || + (stalled && + inst->seqNum < + (*stallingLoad)->seqNum)) { + stalled = true; + stallingStoreIsn = (*sq_it).inst->seqNum; + stallingLoad = (*lq_hash_it).second; + } + + // Tell IQ/mem dep unit that this instruction will need to be + // rescheduled eventually + be->rescheduleMemInst(inst); + + DPRINTF(OzoneLSQ, "Load-store forwarding mis-match. " + "Store [sn:%lli] to load addr %#x\n", + (*sq_it).inst->seqNum, req->vaddr); + + return NoFault; + } + sq_it++; + } + + + // If there's no forwarding case, then go access memory + ++usedPorts; + + // if we have a cache, do cache access too + if (dcacheInterface) { + if (dcacheInterface->isBlocked()) { + // There's an older load that's already going to squash. 
+ if (isLoadBlocked && blockedLoadSeqNum < inst->seqNum) + return NoFault; + + isLoadBlocked = true; + loadBlockedHandled = false; + blockedLoadSeqNum = inst->seqNum; + // No fault occurred, even though the interface is blocked. + return NoFault; + } + + DPRINTF(OzoneLSQ, "D-cache: PC:%#x reading from paddr:%#x " + "vaddr:%#x flags:%i\n", + inst->readPC(), req->paddr, req->vaddr, req->flags); + + // Setup MemReq pointer + req->cmd = Read; + req->completionEvent = NULL; + req->time = curTick; + assert(!req->data); + req->data = new uint8_t[64]; + + assert(!req->completionEvent); + req->completionEvent = + new typename BackEnd::LdWritebackEvent(inst, be); + + // Do Cache Access + MemAccessResult result = dcacheInterface->access(req); + + // Ugly hack to get an event scheduled *only* if the access is + // a miss. We really should add first-class support for this + // at some point. + // @todo: Probably should support having no events + if (result != MA_HIT) { + DPRINTF(OzoneLSQ, "D-cache miss!\n"); + DPRINTF(Activity, "Activity: ld accessing mem miss [sn:%lli]\n", + inst->seqNum); + + lastDcacheStall = curTick; + + _status = DcacheMissStall; + + } else { +// DPRINTF(Activity, "Activity: ld accessing mem hit [sn:%lli]\n", +// inst->seqNum); + + DPRINTF(OzoneLSQ, "D-cache hit!\n"); + } + } else { + fatal("Must use D-cache with new memory system"); + } + + return NoFault; +} + +template +template +Fault +OzoneLWLSQ::write(MemReqPtr &req, T &data, int store_idx) +{ + SQHashIt sq_hash_it = SQItHash.find(store_idx); + assert(sq_hash_it != SQItHash.end()); + + SQIt sq_it = (*sq_hash_it).second; + assert((*sq_it).inst); + + DPRINTF(OzoneLSQ, "Doing write to store idx %i, addr %#x data %#x" + " | [sn:%lli]\n", + store_idx, req->paddr, data, (*sq_it).inst->seqNum); + + (*sq_it).req = req; + (*sq_it).size = sizeof(T); + (*sq_it).data = data; + + // This function only writes the data to the store queue, so no fault + // can happen here. 
+ return NoFault; +} + +#endif // __CPU_OZONE_LW_LSQ_HH__ diff --git a/cpu/ozone/lw_lsq_impl.hh b/cpu/ozone/lw_lsq_impl.hh new file mode 100644 index 000000000..54d7ead6c --- /dev/null +++ b/cpu/ozone/lw_lsq_impl.hh @@ -0,0 +1,766 @@ +/* + * Copyright (c) 2004-2005 The Regents of The University of Michigan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "arch/isa_traits.hh" +#include "base/str.hh" +#include "cpu/ozone/lw_lsq.hh" + +template +OzoneLWLSQ::StoreCompletionEvent::StoreCompletionEvent(DynInstPtr &_inst, + BackEnd *_be, + Event *wb_event, + OzoneLWLSQ *lsq_ptr) + : Event(&mainEventQueue), + inst(_inst), + be(_be), + wbEvent(wb_event), + lsqPtr(lsq_ptr) +{ + this->setFlags(Event::AutoDelete); +} + +template +void +OzoneLWLSQ::StoreCompletionEvent::process() +{ + DPRINTF(OzoneLSQ, "Cache miss complete for store [sn:%lli]\n", + inst->seqNum); + + //lsqPtr->removeMSHR(lsqPtr->storeQueue[storeIdx].inst->seqNum); + +// lsqPtr->cpu->wakeCPU(); + if (wbEvent) { + wbEvent->process(); + delete wbEvent; + } + + lsqPtr->completeStore(inst->sqIdx); + be->removeDcacheMiss(inst); +} + +template +const char * +OzoneLWLSQ::StoreCompletionEvent::description() +{ + return "LSQ store completion event"; +} + +template +OzoneLWLSQ::OzoneLWLSQ() + : loads(0), stores(0), storesToWB(0), stalled(false), isLoadBlocked(false), + loadBlockedHandled(false) +{ +} + +template +void +OzoneLWLSQ::init(Params *params, unsigned maxLQEntries, + unsigned maxSQEntries, unsigned id) + +{ + DPRINTF(OzoneLSQ, "Creating OzoneLWLSQ%i object.\n",id); + + lsqID = id; + + LQEntries = maxLQEntries; + SQEntries = maxSQEntries; + + for (int i = 0; i < LQEntries * 10; i++) { + LQIndices.push(i); + SQIndices.push(i); + } + + // May want to initialize these entries to NULL + +// loadHead = loadTail = 0; + +// storeHead = storeWBIdx = storeTail = 0; + + usedPorts = 0; + cachePorts = params->cachePorts; + + dcacheInterface = params->dcacheInterface; + + loadFaultInst = storeFaultInst = memDepViolator = NULL; + + blockedLoadSeqNum = 0; +} + +template +std::string +OzoneLWLSQ::name() const +{ + return "lsqunit"; +} + +template +void +OzoneLWLSQ::clearLQ() +{ + loadQueue.clear(); +} + +template +void +OzoneLWLSQ::clearSQ() +{ + storeQueue.clear(); +} + +template +void +OzoneLWLSQ::setPageTable(PageTable *pt_ptr) +{ + DPRINTF(OzoneLSQ, "Setting 
the page table pointer.\n"); + pTable = pt_ptr; +} + +template +void +OzoneLWLSQ::resizeLQ(unsigned size) +{ + assert( size >= LQEntries); + + if (size > LQEntries) { + while (size > loadQueue.size()) { + DynInstPtr dummy; + loadQueue.push_back(dummy); + LQEntries++; + } + } else { + LQEntries = size; + } + +} + +template +void +OzoneLWLSQ::resizeSQ(unsigned size) +{ + if (size > SQEntries) { + while (size > storeQueue.size()) { + SQEntry dummy; + storeQueue.push_back(dummy); + SQEntries++; + } + } else { + SQEntries = size; + } +} + +template +void +OzoneLWLSQ::insert(DynInstPtr &inst) +{ + // Make sure we really have a memory reference. + assert(inst->isMemRef()); + + // Make sure it's one of the two classes of memory references. + assert(inst->isLoad() || inst->isStore()); + + if (inst->isLoad()) { + insertLoad(inst); + } else { + insertStore(inst); + } + +// inst->setInLSQ(); +} + +template +void +OzoneLWLSQ::insertLoad(DynInstPtr &load_inst) +{ + assert(!LQIndices.empty()); + int load_index = LQIndices.front(); + LQIndices.pop(); + + DPRINTF(OzoneLSQ, "Inserting load PC %#x, idx:%i [sn:%lli]\n", + load_inst->readPC(), load_index, load_inst->seqNum); + + load_inst->lqIdx = load_index; + + loadQueue.push_front(load_inst); + LQItHash[load_index] = loadQueue.begin(); + + ++loads; +} + +template +void +OzoneLWLSQ::insertStore(DynInstPtr &store_inst) +{ + // Make sure it is not full before inserting an instruction. 
+ assert(stores - storesToWB < SQEntries); + + assert(!SQIndices.empty()); + int store_index = SQIndices.front(); + SQIndices.pop(); + + DPRINTF(OzoneLSQ, "Inserting store PC %#x, idx:%i [sn:%lli]\n", + store_inst->readPC(), store_index, store_inst->seqNum); + + store_inst->sqIdx = store_index; + SQEntry entry(store_inst); + if (loadQueue.empty()) { + entry.lqIt = loadQueue.end(); + } else { + entry.lqIt = loadQueue.begin(); + } + storeQueue.push_front(entry); + + SQItHash[store_index] = storeQueue.begin(); + + ++stores; +} + +template +typename Impl::DynInstPtr +OzoneLWLSQ::getMemDepViolator() +{ + DynInstPtr temp = memDepViolator; + + memDepViolator = NULL; + + return temp; +} + +template +unsigned +OzoneLWLSQ::numFreeEntries() +{ + unsigned free_lq_entries = LQEntries - loads; + unsigned free_sq_entries = SQEntries - stores; + + // Both the LQ and SQ entries have an extra dummy entry to differentiate + // empty/full conditions. Subtract 1 from the free entries. + if (free_lq_entries < free_sq_entries) { + return free_lq_entries - 1; + } else { + return free_sq_entries - 1; + } +} + +template +int +OzoneLWLSQ::numLoadsReady() +{ + int retval = 0; + LQIt lq_it = loadQueue.begin(); + LQIt end_it = loadQueue.end(); + + while (lq_it != end_it) { + if ((*lq_it)->readyToIssue()) { + ++retval; + } + } + + return retval; +} + +template +Fault +OzoneLWLSQ::executeLoad(DynInstPtr &inst) +{ + // Execute a specific load. + Fault load_fault = NoFault; + + DPRINTF(OzoneLSQ, "Executing load PC %#x, [sn:%lli]\n", + inst->readPC(),inst->seqNum); + + // Make sure it's really in the list. + // Normally it should always be in the list. However, + /* due to a syscall it may not be the list. 
+#ifdef DEBUG + int i = loadHead; + while (1) { + if (i == loadTail && !find(inst)) { + assert(0 && "Load not in the queue!"); + } else if (loadQueue[i] == inst) { + break; + } + + i = i + 1; + if (i >= LQEntries) { + i = 0; + } + } +#endif // DEBUG*/ + + load_fault = inst->initiateAcc(); + + // Might want to make sure that I'm not overwriting a previously faulting + // instruction that hasn't been checked yet. + // Actually probably want the oldest faulting load + if (load_fault != NoFault) { + DPRINTF(OzoneLSQ, "Load [sn:%lli] has a fault\n", inst->seqNum); + // Maybe just set it as can commit here, although that might cause + // some other problems with sending traps to the ROB too quickly. + be->instToCommit(inst); +// iewStage->activityThisCycle(); + } + + return load_fault; +} + +template +Fault +OzoneLWLSQ::executeStore(DynInstPtr &store_inst) +{ + // Make sure that a store exists. + assert(stores != 0); + + int store_idx = store_inst->sqIdx; + SQHashIt sq_hash_it = SQItHash.find(store_idx); + assert(sq_hash_it != SQItHash.end()); + DPRINTF(OzoneLSQ, "Executing store PC %#x [sn:%lli]\n", + store_inst->readPC(), store_inst->seqNum); + + SQIt sq_it = (*sq_hash_it).second; + + Fault store_fault = store_inst->initiateAcc(); + + // Store size should now be available. Use it to get proper offset for + // addr comparisons. + int size = (*sq_it).size; + + if (size == 0) { + DPRINTF(OzoneLSQ,"Fault on Store PC %#x, [sn:%lli],Size = 0\n", + store_inst->readPC(),store_inst->seqNum); + + return store_fault; + } + + assert(store_fault == NoFault); + + if (!storeFaultInst) { + if (store_fault != NoFault) { + panic("Fault in a store instruction!"); + storeFaultInst = store_inst; + } else if (store_inst->isNonSpeculative()) { + // Nonspeculative accesses (namely store conditionals) + // need to set themselves as able to writeback if we + // haven't had a fault by here. + (*sq_it).canWB = true; + + ++storesToWB; + DPRINTF(OzoneLSQ, "Nonspeculative store! 
storesToWB:%i\n", + storesToWB); + } + } + + LQIt lq_it = --(loadQueue.end()); + + if (!memDepViolator) { + while (lq_it != loadQueue.end()) { + if ((*lq_it)->seqNum < store_inst->seqNum) { + lq_it--; + continue; + } + // Actually should only check loads that have actually executed + // Might be safe because effAddr is set to InvalAddr when the + // dyn inst is created. + + // Must actually check all addrs in the proper size range + // Which is more correct than needs to be. What if for now we just + // assume all loads are quad-word loads, and do the addr based + // on that. + // @todo: Fix this, magic number being used here + if (((*lq_it)->effAddr >> 8) == + (store_inst->effAddr >> 8)) { + // A load incorrectly passed this store. Squash and refetch. + // For now return a fault to show that it was unsuccessful. + memDepViolator = (*lq_it); + + return TheISA::genMachineCheckFault(); + } + + lq_it--; + } + + // If we've reached this point, there was no violation. + memDepViolator = NULL; + } + + return store_fault; +} + +template +void +OzoneLWLSQ::commitLoad() +{ + assert(!loadQueue.empty()); + + DPRINTF(OzoneLSQ, "[sn:%lli] Committing head load instruction, PC %#x\n", + loadQueue.back()->seqNum, loadQueue.back()->readPC()); + + LQIndices.push(loadQueue.back()->lqIdx); + LQItHash.erase(loadQueue.back()->lqIdx); + + loadQueue.pop_back(); + + --loads; +} + +template +void +OzoneLWLSQ::commitLoads(InstSeqNum &youngest_inst) +{ + assert(loads == 0 || !loadQueue.empty()); + + while (loads != 0 && + loadQueue.back()->seqNum <= youngest_inst) { + commitLoad(); + } +} + +template +void +OzoneLWLSQ::commitStores(InstSeqNum &youngest_inst) +{ + assert(stores == 0 || !storeQueue.empty()); + + SQIt sq_it = --(storeQueue.end()); + while (!storeQueue.empty() && sq_it != storeQueue.end()) { + assert((*sq_it).inst); + if (!(*sq_it).canWB) { + if ((*sq_it).inst->seqNum > youngest_inst) { + break; + } + ++storesToWB; + + DPRINTF(OzoneLSQ, "Marking store as able to write back, PC " 
+ "%#x [sn:%lli], storesToWB:%i\n", + (*sq_it).inst->readPC(), + (*sq_it).inst->seqNum, + storesToWB); + + (*sq_it).canWB = true; + } + + sq_it--; + } +} + +template +void +OzoneLWLSQ::writebackStores() +{ + SQIt sq_it = --(storeQueue.end()); + while (storesToWB > 0 && + sq_it != storeQueue.end() && + (*sq_it).inst && + (*sq_it).canWB && + usedPorts < cachePorts) { + + DynInstPtr inst = (*sq_it).inst; + + if ((*sq_it).size == 0 && !(*sq_it).completed) { + sq_it--; + completeStore(inst->sqIdx); + + continue; + } + + if (inst->isDataPrefetch() || (*sq_it).committed) { + sq_it--; + continue; + } + + if (dcacheInterface && dcacheInterface->isBlocked()) { + DPRINTF(OzoneLSQ, "Unable to write back any more stores, cache" + " is blocked!\n"); + break; + } + + ++usedPorts; + + assert((*sq_it).req); + assert(!(*sq_it).committed); + + MemReqPtr req = (*sq_it).req; + (*sq_it).committed = true; + + req->cmd = Write; + req->completionEvent = NULL; + req->time = curTick; + assert(!req->data); + req->data = new uint8_t[64]; + memcpy(req->data, (uint8_t *)&(*sq_it).data, req->size); + + DPRINTF(OzoneLSQ, "D-Cache: Writing back store idx:%i PC:%#x " + "to Addr:%#x, data:%#x [sn:%lli]\n", + inst->sqIdx,inst->readPC(), + req->paddr, *(req->data), + inst->seqNum); + + if (dcacheInterface) { + MemAccessResult result = dcacheInterface->access(req); + + if (isStalled() && + inst->seqNum == stallingStoreIsn) { + DPRINTF(OzoneLSQ, "Unstalling, stalling store [sn:%lli] " + "load [sn:%lli]\n", + stallingStoreIsn, (*stallingLoad)->seqNum); + stalled = false; + stallingStoreIsn = 0; + be->replayMemInst((*stallingLoad)); + } + + if (result != MA_HIT && dcacheInterface->doEvents()) { +// Event *wb = NULL; + + typename BackEnd::LdWritebackEvent *wb = NULL; + if (req->flags & LOCKED) { + // Stx_C does not generate a system port transaction. 
+ req->result=1; + wb = new typename BackEnd::LdWritebackEvent(inst, + be); + } + + DPRINTF(OzoneLSQ,"D-Cache Write Miss!\n"); + +// DPRINTF(Activity, "Active st accessing mem miss [sn:%lli]\n", +// inst->seqNum); + + // Will stores need their own kind of writeback events? + // Do stores even need writeback events? + assert(!req->completionEvent); + req->completionEvent = new + StoreCompletionEvent(inst, be, wb, this); + be->addDcacheMiss(inst); + + lastDcacheStall = curTick; + + _status = DcacheMissStall; + + // Increment stat here or something + + sq_it--; + } else { + DPRINTF(OzoneLSQ,"D-Cache: Write Hit on idx:%i !\n", + inst->sqIdx); + +// DPRINTF(Activity, "Active st accessing mem hit [sn:%lli]\n", +// inst->seqNum); + + if (req->flags & LOCKED) { + // Stx_C does not generate a system port transaction. + if (req->flags & UNCACHEABLE) { + req->result = 2; + } else { + req->result = 1; + } + + typename BackEnd::LdWritebackEvent *wb = + new typename BackEnd::LdWritebackEvent(inst, + be); + wb->schedule(curTick); + } + sq_it--; + completeStore(inst->sqIdx); + } + } else { + panic("Must HAVE DCACHE!!!!!\n"); + } + } + + // Not sure this should set it to 0. + usedPorts = 0; + + assert(stores >= 0 && storesToWB >= 0); +} + +template +void +OzoneLWLSQ::squash(const InstSeqNum &squashed_num) +{ + DPRINTF(OzoneLSQ, "Squashing until [sn:%lli]!" + "(Loads:%i Stores:%i)\n",squashed_num,loads,stores); + + + LQIt lq_it = loadQueue.begin(); + + while (loads != 0 && (*lq_it)->seqNum > squashed_num) { + assert(!loadQueue.empty()); + // Clear the smart pointer to make sure it is decremented. + DPRINTF(OzoneLSQ,"Load Instruction PC %#x squashed, " + "[sn:%lli]\n", + (*lq_it)->readPC(), + (*lq_it)->seqNum); + + if (isStalled() && lq_it == stallingLoad) { + stalled = false; + stallingStoreIsn = 0; + stallingLoad = NULL; + } + + --loads; + + // Inefficient! 
+ LQHashIt lq_hash_it = LQItHash.find((*lq_it)->lqIdx); + assert(lq_hash_it != LQItHash.end()); + LQItHash.erase(lq_hash_it); + LQIndices.push((*lq_it)->lqIdx); + loadQueue.erase(lq_it++); + } + + if (isLoadBlocked) { + if (squashed_num < blockedLoadSeqNum) { + isLoadBlocked = false; + loadBlockedHandled = false; + blockedLoadSeqNum = 0; + } + } + + SQIt sq_it = storeQueue.begin(); + + while (stores != 0 && (*sq_it).inst->seqNum > squashed_num) { + assert(!storeQueue.empty()); + // Clear the smart pointer to make sure it is decremented. + DPRINTF(OzoneLSQ,"Store Instruction PC %#x idx:%i squashed [sn:%lli]\n", + (*sq_it).inst->readPC(), (*sq_it).inst->sqIdx, + (*sq_it).inst->seqNum); + + // I don't think this can happen. It should have been cleared by the + // stalling load. + if (isStalled() && + (*sq_it).inst->seqNum == stallingStoreIsn) { + panic("Is stalled should have been cleared by stalling load!\n"); + stalled = false; + stallingStoreIsn = 0; + } + + SQHashIt sq_hash_it = SQItHash.find((*sq_it).inst->sqIdx); + assert(sq_hash_it != SQItHash.end()); + SQItHash.erase(sq_hash_it); + SQIndices.push((*sq_it).inst->sqIdx); + (*sq_it).inst = NULL; + (*sq_it).canWB = 0; + + if ((*sq_it).req) { + assert(!(*sq_it).req->completionEvent); + } + (*sq_it).req = NULL; + --stores; + storeQueue.erase(sq_it++); + } +} + +template +void +OzoneLWLSQ::dumpInsts() +{ + cprintf("Load store queue: Dumping instructions.\n"); + cprintf("Load queue size: %i\n", loads); + cprintf("Load queue: "); + + LQIt lq_it = --(loadQueue.end()); + + while (lq_it != loadQueue.end() && (*lq_it)) { + cprintf("[sn:%lli] %#x ", (*lq_it)->seqNum, + (*lq_it)->readPC()); + + lq_it--; + } + + cprintf("\nStore queue size: %i\n", stores); + cprintf("Store queue: "); + + SQIt sq_it = --(storeQueue.end()); + + while (sq_it != storeQueue.end() && (*sq_it).inst) { + cprintf("[sn:%lli]\nPC:%#x\nSize:%i\nCommitted:%i\nCompleted:%i\ncanWB:%i\n", + (*sq_it).inst->seqNum, + (*sq_it).inst->readPC(), + (*sq_it).size, + 
(*sq_it).committed, + (*sq_it).completed, + (*sq_it).canWB); + + sq_it--; + } + + cprintf("\n"); +} + +template +void +OzoneLWLSQ::completeStore(int store_idx) +{ + SQHashIt sq_hash_it = SQItHash.find(store_idx); + assert(sq_hash_it != SQItHash.end()); + SQIt sq_it = (*sq_hash_it).second; + + assert((*sq_it).inst); + (*sq_it).completed = true; + DynInstPtr inst = (*sq_it).inst; + + --storesToWB; + + if (isStalled() && + inst->seqNum == stallingStoreIsn) { + DPRINTF(OzoneLSQ, "Unstalling, stalling store [sn:%lli] " + "load [sn:%lli]\n", + stallingStoreIsn, (*stallingLoad)->seqNum); + stalled = false; + stallingStoreIsn = 0; + be->replayMemInst((*stallingLoad)); + } + + DPRINTF(OzoneLSQ, "Completing store idx:%i [sn:%lli], storesToWB:%i\n", + inst->sqIdx, inst->seqNum, storesToWB); + + // A bit conservative because a store completion may not free up entries, + // but hopefully avoids two store completions in one cycle from making + // the CPU tick twice. +// cpu->activityThisCycle(); + assert(!storeQueue.empty()); + SQItHash.erase(sq_hash_it); + SQIndices.push(inst->sqIdx); + storeQueue.erase(sq_it); + --stores; +/* + SQIt oldest_store_it = --(storeQueue.end()); + if (sq_it == oldest_store_it) { + do { + inst = (*oldest_store_it).inst; + sq_hash_it = SQItHash.find(inst->sqIdx); + assert(sq_hash_it != SQItHash.end()); + SQItHash.erase(sq_hash_it); + SQIndices.push(inst->sqIdx); + storeQueue.erase(oldest_store_it--); + + --stores; + } while ((*oldest_store_it).completed && + oldest_store_it != storeQueue.end()); + +// be->updateLSQNextCycle = true; + } +*/ +} diff --git a/cpu/ozone/ozone_impl.hh b/cpu/ozone/ozone_impl.hh index a2c706c60..1f543ec6e 100644 --- a/cpu/ozone/ozone_impl.hh +++ b/cpu/ozone/ozone_impl.hh @@ -35,6 +35,8 @@ #include "cpu/ozone/front_end.hh" #include "cpu/ozone/inst_queue.hh" #include "cpu/ozone/lsq_unit.hh" +#include "cpu/ozone/lw_lsq.hh" +#include "cpu/ozone/lw_back_end.hh" #include "cpu/ozone/null_predictor.hh" #include "cpu/ozone/dyn_inst.hh" 
#include "cpu/ozone/simple_params.hh" @@ -55,10 +57,10 @@ struct OzoneImpl { typedef TwobitBPredUnit BranchPred; typedef FrontEnd FrontEnd; // Will need IQ, LSQ eventually - typedef BackEnd BackEnd; + typedef LWBackEnd BackEnd; typedef InstQueue InstQueue; - typedef OzoneLSQ LdstQueue; + typedef OzoneLWLSQ LdstQueue; typedef OzoneDynInst DynInst; typedef RefCountingPtr DynInstPtr; diff --git a/cpu/static_inst.hh b/cpu/static_inst.hh index 20116554e..550609ed7 100644 --- a/cpu/static_inst.hh +++ b/cpu/static_inst.hh @@ -40,12 +40,17 @@ // forward declarations struct AlphaSimpleImpl; +struct OzoneImpl; +struct SimpleImpl; class ExecContext; class DynInst; template class AlphaDynInst; +template +class OzoneDynInst; + class FastCPU; class SimpleCPU; class InorderCPU; @@ -121,6 +126,7 @@ class StaticInstBase : public RefCounted IsWriteBarrier, ///< Is a write barrier IsNonSpeculative, ///< Should not be executed speculatively + IsQuiesce, NumFlags }; @@ -208,6 +214,7 @@ class StaticInstBase : public RefCounted bool isMemBarrier() const { return flags[IsMemBarrier]; } bool isWriteBarrier() const { return flags[IsWriteBarrier]; } bool isNonSpeculative() const { return flags[IsNonSpeculative]; } + bool isQuiesce() const { return flags[IsQuiesce]; } //@} /// Operation class. Used to select appropriate function unit in issue. 
From 7bab47ac3a70c0f2ad3f38ee1c67fd1364eff042 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Sat, 22 Apr 2006 19:17:05 -0400 Subject: [PATCH 09/50] Include new OzoneCPU files --HG-- extra : convert_revision : f8c8751aab62df5d57c6491c5ce9b90b5a176e86 --- cpu/SConscript | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpu/SConscript b/cpu/SConscript index 888dbdc22..c34971acf 100644 --- a/cpu/SConscript +++ b/cpu/SConscript @@ -146,6 +146,8 @@ if 'OzoneCPU' in env['CPU_MODELS']: sources += Split(''' ozone/back_end.cc ozone/lsq_unit.cc + ozone/lw_back_end.cc + ozone/lw_lsq.cc ''') # FullCPU sources are included from m5/SConscript since they're not From 55db48891c6d60d50f4f96c2e31f5281c23ed41f Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Mon, 24 Apr 2006 16:55:31 -0400 Subject: [PATCH 10/50] Use dwarf-2 debugging symbols (they work much better). --HG-- extra : convert_revision : 669e4c32f2bc2c035a4199d6152a638b75a25148 --- SConscript | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SConscript b/SConscript index 062661557..5546e6f71 100644 --- a/SConscript +++ b/SConscript @@ -381,7 +381,7 @@ env.Append(CPPPATH='./libelf') # Debug binary debugEnv = env.Copy(OBJSUFFIX='.do') debugEnv.Label = 'debug' -debugEnv.Append(CCFLAGS=Split('-g -gstabs+ -O0')) +debugEnv.Append(CCFLAGS=Split('-g -gdwarf-2 -O0')) debugEnv.Append(CPPDEFINES='DEBUG') tlist = debugEnv.Program(target = 'm5.debug', source = make_objs(sources, debugEnv)) From b363a3703da7f9773f4afe2469c0206e14de1813 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Mon, 24 Apr 2006 16:56:24 -0400 Subject: [PATCH 11/50] Allow the switching on and off of PC symbols for tracing. 
--HG-- extra : convert_revision : a2422e30ace9874ba1be44cd0e1d3024cabbf1ed --- cpu/exetrace.cc | 6 +++++- cpu/exetrace.hh | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/cpu/exetrace.cc b/cpu/exetrace.cc index 84b5eacf7..d5eacd839 100644 --- a/cpu/exetrace.cc +++ b/cpu/exetrace.cc @@ -84,7 +84,8 @@ Trace::InstRecord::dump(ostream &outs) std::string sym_str; Addr sym_addr; if (debugSymbolTable - && debugSymbolTable->findNearestSymbol(PC, sym_str, sym_addr)) { + && debugSymbolTable->findNearestSymbol(PC, sym_str, sym_addr) + && flags[PC_SYMBOL]) { if (PC != sym_addr) sym_str += csprintf("+%d", PC - sym_addr); outs << "@" << sym_str << " : "; @@ -191,6 +192,8 @@ Param exe_trace_print_fetchseq(&exeTraceParams, "print_fetchseq", "print fetch sequence number", false); Param exe_trace_print_cp_seq(&exeTraceParams, "print_cpseq", "print correct-path sequence number", false); +Param exe_trace_pc_symbol(&exeTraceParams, "pc_symbol", + "Use symbols for the PC if available", true); Param exe_trace_intel_format(&exeTraceParams, "intel_format", "print trace in intel compatible format", false); Param exe_trace_system(&exeTraceParams, "trace_system", @@ -215,6 +218,7 @@ Trace::InstRecord::setParams() flags[PRINT_INT_REGS] = exe_trace_print_iregs; flags[PRINT_FETCH_SEQ] = exe_trace_print_fetchseq; flags[PRINT_CP_SEQ] = exe_trace_print_cp_seq; + flags[PC_SYMBOL] = exe_trace_pc_symbol; flags[INTEL_FORMAT] = exe_trace_intel_format; trace_system = exe_trace_system; } diff --git a/cpu/exetrace.hh b/cpu/exetrace.hh index 67d042ec8..2f70e26e7 100644 --- a/cpu/exetrace.hh +++ b/cpu/exetrace.hh @@ -144,6 +144,7 @@ class InstRecord : public Record PRINT_INT_REGS, PRINT_FETCH_SEQ, PRINT_CP_SEQ, + PC_SYMBOL, INTEL_FORMAT, NUM_BITS }; From b14bf0321947419603610f07ed4f14b51a2192a3 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Mon, 24 Apr 2006 16:59:50 -0400 Subject: [PATCH 12/50] Fixes for ll/sc for the O3 model. 
cpu/o3/alpha_cpu.hh: Store conditionals should not write their data to memory if they failed. cpu/o3/lsq_unit.hh: Setup request parameters when they're needed. --HG-- extra : convert_revision : d75cd7deda03584b7e25cb567e4d79032cac7118 --- cpu/o3/alpha_cpu.hh | 3 ++- cpu/o3/lsq_unit.hh | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cpu/o3/alpha_cpu.hh b/cpu/o3/alpha_cpu.hh index 68e149e77..dfdf092ed 100644 --- a/cpu/o3/alpha_cpu.hh +++ b/cpu/o3/alpha_cpu.hh @@ -425,9 +425,10 @@ class AlphaFullCPU : public FullO3CPU req->result = 2; } else { if (this->lockFlag/* && this->lockAddr == req->paddr*/) { - req->result=1; + req->result = 1; } else { req->result = 0; + return NoFault; } } } diff --git a/cpu/o3/lsq_unit.hh b/cpu/o3/lsq_unit.hh index 73c485ce9..ba8b1d2e2 100644 --- a/cpu/o3/lsq_unit.hh +++ b/cpu/o3/lsq_unit.hh @@ -566,6 +566,9 @@ LSQUnit::read(MemReqPtr &req, T &data, int load_idx) DPRINTF(LSQUnit, "Doing functional access for inst PC %#x\n", loadQueue[load_idx]->readPC()); assert(!req->data); + req->cmd = Read; + req->completionEvent = NULL; + req->time = curTick; req->data = new uint8_t[64]; Fault fault = cpu->read(req, data); memcpy(req->data, &data, sizeof(T)); @@ -587,9 +590,6 @@ LSQUnit::read(MemReqPtr &req, T &data, int load_idx) } DPRINTF(LSQUnit, "Doing timing access for inst PC %#x\n", loadQueue[load_idx]->readPC()); - req->cmd = Read; - req->completionEvent = NULL; - req->time = curTick; assert(!req->completionEvent); req->completionEvent = From 676afbe2c729575f3468d4ae0aad31c5ac382ab8 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Mon, 24 Apr 2006 17:06:00 -0400 Subject: [PATCH 13/50] New stats added to O3 model. 
--HG-- extra : convert_revision : 7abb491e89e3e1a331cd19aa05ddce5184abf9e0 --- cpu/o3/commit.hh | 19 ++++- cpu/o3/commit_impl.hh | 115 ++++++++++++++++++++++++- cpu/o3/fetch.hh | 3 + cpu/o3/fetch_impl.hh | 29 ++++++- cpu/o3/iew.hh | 33 ++++++- cpu/o3/iew_impl.hh | 175 ++++++++++++++++++++++++++++++++++++-- cpu/o3/inst_queue.hh | 17 +++- cpu/o3/inst_queue_impl.hh | 128 ++++++++++++++++++++++++++-- cpu/o3/rename.hh | 12 +-- cpu/o3/rename_impl.hh | 100 ++++++++++++---------- 10 files changed, 555 insertions(+), 76 deletions(-) diff --git a/cpu/o3/commit.hh b/cpu/o3/commit.hh index 93b74ebb0..f374b8fb7 100644 --- a/cpu/o3/commit.hh +++ b/cpu/o3/commit.hh @@ -369,6 +369,8 @@ class DefaultCommit /** Rename map interface. */ RenameMap *renameMap[Impl::MaxThreads]; + void updateComInstStats(DynInstPtr &inst); + /** Stat for the total number of committed instructions. */ Stats::Scalar<> commitCommittedInsts; /** Stat for the total number of squashed instructions discarded by commit. @@ -383,15 +385,26 @@ class DefaultCommit */ Stats::Scalar<> commitNonSpecStalls; /** Stat for the total number of committed branches. */ - Stats::Scalar<> commitCommittedBranches; +// Stats::Scalar<> commitCommittedBranches; /** Stat for the total number of committed loads. */ - Stats::Scalar<> commitCommittedLoads; +// Stats::Scalar<> commitCommittedLoads; /** Stat for the total number of committed memory references. */ - Stats::Scalar<> commitCommittedMemRefs; +// Stats::Scalar<> commitCommittedMemRefs; /** Stat for the total number of branch mispredicts that caused a squash. */ Stats::Scalar<> branchMispredicts; /** Distribution of the number of committed instructions each cycle. 
*/ Stats::Distribution<> numCommittedDist; + + // total number of instructions committed + Stats::Vector<> stat_com_inst; + Stats::Vector<> stat_com_swp; + Stats::Vector<> stat_com_refs; + Stats::Vector<> stat_com_loads; + Stats::Vector<> stat_com_membars; + Stats::Vector<> stat_com_branches; + + Stats::Scalar<> commit_eligible_samples; + Stats::Vector<> commit_eligible; }; #endif // __CPU_O3_COMMIT_HH__ diff --git a/cpu/o3/commit_impl.hh b/cpu/o3/commit_impl.hh index ef1ba9282..157e688c7 100644 --- a/cpu/o3/commit_impl.hh +++ b/cpu/o3/commit_impl.hh @@ -133,6 +133,7 @@ template void DefaultCommit::regStats() { + using namespace Stats; commitCommittedInsts .name(name() + ".commitCommittedInsts") .desc("The number of committed instructions") @@ -150,6 +151,7 @@ DefaultCommit::regStats() .desc("The number of times commit has been forced to stall to " "communicate backwards") .prereq(commitNonSpecStalls); +/* commitCommittedBranches .name(name() + ".commitCommittedBranches") .desc("The number of committed branches") @@ -162,6 +164,7 @@ DefaultCommit::regStats() .name(name() + ".commitCommittedMemRefs") .desc("The number of committed memory references") .prereq(commitCommittedMemRefs); +*/ branchMispredicts .name(name() + ".branchMispredicts") .desc("The number of times a branch was mispredicted") @@ -172,6 +175,73 @@ DefaultCommit::regStats() .desc("Number of insts commited each cycle") .flags(Stats::pdf) ; + + stat_com_inst + .init(cpu->number_of_threads) + .name(name() + ".COM:count") + .desc("Number of instructions committed") + .flags(total) + ; + + stat_com_swp + .init(cpu->number_of_threads) + .name(name() + ".COM:swp_count") + .desc("Number of s/w prefetches committed") + .flags(total) + ; + + stat_com_refs + .init(cpu->number_of_threads) + .name(name() + ".COM:refs") + .desc("Number of memory references committed") + .flags(total) + ; + + stat_com_loads + .init(cpu->number_of_threads) + .name(name() + ".COM:loads") + .desc("Number of loads committed") + 
.flags(total) + ; + + stat_com_membars + .init(cpu->number_of_threads) + .name(name() + ".COM:membars") + .desc("Number of memory barriers committed") + .flags(total) + ; + + stat_com_branches + .init(cpu->number_of_threads) + .name(name() + ".COM:branches") + .desc("Number of branches committed") + .flags(total) + ; + + // + // Commit-Eligible instructions... + // + // -> The number of instructions eligible to commit in those + // cycles where we reached our commit BW limit (less the number + // actually committed) + // + // -> The average value is computed over ALL CYCLES... not just + // the BW limited cycles + // + // -> The standard deviation is computed only over cycles where + // we reached the BW limit + // + commit_eligible + .init(cpu->number_of_threads) + .name(name() + ".COM:bw_limited") + .desc("number of insts not committed due to BW limits") + .flags(total) + ; + + commit_eligible_samples + .name(name() + ".COM:bw_lim_events") + .desc("number cycles where commit BW limit reached") + ; } template @@ -1060,9 +1130,7 @@ head_inst->isWriteBarrier())*/ return false; } - if (head_inst->isControl()) { - ++commitCommittedBranches; - } + updateComInstStats(head_inst); // Now that the instruction is going to be committed, finalize its // trace data. 
@@ -1186,6 +1254,47 @@ DefaultCommit::robDoneSquashing() return true; } +template +void +DefaultCommit::updateComInstStats(DynInstPtr &inst) +{ + unsigned thread = inst->threadNumber; + + // + // Pick off the software prefetches + // +#ifdef TARGET_ALPHA + if (inst->isDataPrefetch()) { + stat_com_swp[thread]++; + } else { + stat_com_inst[thread]++; + } +#else + stat_com_inst[thread]++; +#endif + + // + // Control Instructions + // + if (inst->isControl()) + stat_com_branches[thread]++; + + // + // Memory references + // + if (inst->isMemRef()) { + stat_com_refs[thread]++; + + if (inst->isLoad()) { + stat_com_loads[thread]++; + } + } + + if (inst->isMemBarrier()) { + stat_com_membars[thread]++; + } +} + //////////////////////////////////////// // // // SMT COMMIT POLICY MAITAINED HERE // diff --git a/cpu/o3/fetch.hh b/cpu/o3/fetch.hh index f0f3f2745..f0b15cb86 100644 --- a/cpu/o3/fetch.hh +++ b/cpu/o3/fetch.hh @@ -370,6 +370,7 @@ class DefaultFetch Stats::Scalar<> icacheStallCycles; /** Stat for total number of fetched instructions. */ Stats::Scalar<> fetchedInsts; + Stats::Scalar<> fetchedBranches; /** Stat for total number of predicted branches. */ Stats::Scalar<> predictedBranches; /** Stat for total number of cycles spent fetching. */ @@ -383,6 +384,8 @@ class DefaultFetch Stats::Scalar<> fetchBlockedCycles; /** Stat for total number of fetched cache lines. */ Stats::Scalar<> fetchedCacheLines; + + Stats::Scalar<> fetchIcacheSquashes; /** Distribution of number of instructions fetched each cycle. 
*/ Stats::Distribution<> fetchNisnDist; Stats::Formula idleRate; diff --git a/cpu/o3/fetch_impl.hh b/cpu/o3/fetch_impl.hh index 7abc5733f..563a767df 100644 --- a/cpu/o3/fetch_impl.hh +++ b/cpu/o3/fetch_impl.hh @@ -178,6 +178,11 @@ DefaultFetch::regStats() .desc("Number of instructions fetch has processed") .prereq(fetchedInsts); + fetchedBranches + .name(name() + ".fetchedBranches") + .desc("Number of branches that fetch encountered") + .prereq(fetchedBranches); + predictedBranches .name(name() + ".predictedBranches") .desc("Number of branches that fetch has predicted taken") @@ -209,6 +214,11 @@ DefaultFetch::regStats() .desc("Number of cache lines fetched") .prereq(fetchedCacheLines); + fetchIcacheSquashes + .name(name() + ".fetchIcacheSquashes") + .desc("Number of outstanding Icache misses that were squashed") + .prereq(fetchIcacheSquashes); + fetchNisnDist .init(/* base value */ 0, /* last value */ fetchWidth, @@ -322,8 +332,10 @@ DefaultFetch::processCacheCompletion(MemReqPtr &req) // Can keep track of how many cache accesses go unused due to // misspeculation here. if (fetchStatus[tid] != IcacheMissStall || - req != memReq[tid]) + req != memReq[tid]) { + ++fetchIcacheSquashes; return; + } // Wake up the CPU (if it went to sleep and was waiting on this completion // event). @@ -400,6 +412,8 @@ DefaultFetch::lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC) predict_taken = branchPred.predict(inst, next_PC, inst->threadNumber); + ++fetchedBranches; + if (predict_taken) { ++predictedBranches; } @@ -457,6 +471,7 @@ DefaultFetch::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid // If translation was successful, attempt to read the first // instruction. 
if (fault == NoFault) { +#if FULL_SYSTEM if (cpu->system->memctrl->badaddr(memReq[tid]->paddr)) { DPRINTF(Fetch, "Fetch: Bad address %#x (hopefully on a " "misspeculating path!", @@ -464,6 +479,7 @@ DefaultFetch::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid ret_fault = TheISA::genMachineCheckFault(); return false; } +#endif DPRINTF(Fetch, "Fetch: Doing instruction read.\n"); fault = cpu->mem->read(memReq[tid], cacheData[tid]); @@ -480,6 +496,8 @@ DefaultFetch::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid MemAccessResult result = icacheInterface->access(memReq[tid]); + fetchedCacheLines++; + // If the cache missed, then schedule an event to wake // up this stage once the cache miss completes. // @todo: Possibly allow for longer than 1 cycle cache hits. @@ -499,8 +517,6 @@ DefaultFetch::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid "read.\n", tid); // memcpy(cacheData[tid], memReq[tid]->data, memReq[tid]->size); - - fetchedCacheLines++; } } else { DPRINTF(Fetch, "[tid:%i] Out of MSHRs!\n", tid); @@ -889,10 +905,14 @@ DefaultFetch::fetch(bool &status_change) if (!fetch_success) return; } else { - if (fetchStatus[tid] == Blocked) { + if (fetchStatus[tid] == Idle) { + ++fetchIdleCycles; + } else if (fetchStatus[tid] == Blocked) { ++fetchBlockedCycles; } else if (fetchStatus[tid] == Squashing) { ++fetchSquashCycles; + } else if (fetchStatus[tid] == IcacheMissStall) { + ++icacheStallCycles; } // Status is Idle, Squashing, Blocked, or IcacheMissStall, so @@ -904,6 +924,7 @@ DefaultFetch::fetch(bool &status_change) // If we had a stall due to an icache miss, then return. 
if (fetchStatus[tid] == IcacheMissStall) { + ++icacheStallCycles; status_change = true; return; } diff --git a/cpu/o3/iew.hh b/cpu/o3/iew.hh index e55837812..58cd68b21 100644 --- a/cpu/o3/iew.hh +++ b/cpu/o3/iew.hh @@ -278,6 +278,8 @@ class DefaultIEW void tick(); private: + void updateExeInstStats(DynInstPtr &inst); + /** Pointer to main time buffer used for backwards communication. */ TimeBuffer *timeBuffer; @@ -443,9 +445,9 @@ class DefaultIEW /** Stat for total number of executed instructions. */ Stats::Scalar<> iewExecutedInsts; /** Stat for total number of executed load instructions. */ - Stats::Scalar<> iewExecLoadInsts; + Stats::Vector<> iewExecLoadInsts; /** Stat for total number of executed store instructions. */ - Stats::Scalar<> iewExecStoreInsts; +// Stats::Scalar<> iewExecStoreInsts; /** Stat for total number of squashed instructions skipped at execute. */ Stats::Scalar<> iewExecSquashedInsts; /** Stat for total number of memory ordering violation events. */ @@ -456,6 +458,33 @@ class DefaultIEW Stats::Scalar<> predictedNotTakenIncorrect; /** Stat for total number of mispredicted branches detected at execute. 
*/ Stats::Formula branchMispredicts; + + Stats::Vector<> exe_swp; + Stats::Vector<> exe_nop; + Stats::Vector<> exe_refs; + Stats::Vector<> exe_branches; + +// Stats::Vector<> issued_ops; +/* + Stats::Vector<> stat_fu_busy; + Stats::Vector2d<> stat_fuBusy; + Stats::Vector<> dist_unissued; + Stats::Vector2d<> stat_issued_inst_type; +*/ + Stats::Formula issue_rate; + Stats::Formula iewExecStoreInsts; +// Stats::Formula issue_op_rate; +// Stats::Formula fu_busy_rate; + + Stats::Vector<> iewInstsToCommit; + Stats::Vector<> writeback_count; + Stats::Vector<> producer_inst; + Stats::Vector<> consumer_inst; + Stats::Vector<> wb_penalized; + + Stats::Formula wb_rate; + Stats::Formula wb_fanout; + Stats::Formula wb_penalized_rate; }; #endif // __CPU_O3_IEW_HH__ diff --git a/cpu/o3/iew_impl.hh b/cpu/o3/iew_impl.hh index 21eb7dcf8..2ae2e1361 100644 --- a/cpu/o3/iew_impl.hh +++ b/cpu/o3/iew_impl.hh @@ -140,6 +140,8 @@ template void DefaultIEW::regStats() { + using namespace Stats; + instQueue.regStats(); //ldstQueue.regStats(); @@ -195,13 +197,15 @@ DefaultIEW::regStats() .desc("Number of executed instructions"); iewExecLoadInsts + .init(cpu->number_of_threads) .name(name() + ".iewExecLoadInsts") - .desc("Number of load instructions executed"); - + .desc("Number of load instructions executed") + .flags(total); +/* iewExecStoreInsts .name(name() + ".iewExecStoreInsts") .desc("Number of store instructions executed"); - +*/ iewExecSquashedInsts .name(name() + ".iewExecSquashedInsts") .desc("Number of squashed instructions skipped in execute"); @@ -223,6 +227,116 @@ DefaultIEW::regStats() .desc("Number of branch mispredicts detected at execute"); branchMispredicts = predictedTakenIncorrect + predictedNotTakenIncorrect; + + exe_swp + .init(cpu->number_of_threads) + .name(name() + ".EXEC:swp") + .desc("number of swp insts executed") + .flags(total) + ; + + exe_nop + .init(cpu->number_of_threads) + .name(name() + ".EXEC:nop") + .desc("number of nop insts executed") + .flags(total) + ; 
+ + exe_refs + .init(cpu->number_of_threads) + .name(name() + ".EXEC:refs") + .desc("number of memory reference insts executed") + .flags(total) + ; + + exe_branches + .init(cpu->number_of_threads) + .name(name() + ".EXEC:branches") + .desc("Number of branches executed") + .flags(total) + ; + + issue_rate + .name(name() + ".EXEC:rate") + .desc("Inst execution rate") + .flags(total) + ; + issue_rate = iewExecutedInsts / cpu->numCycles; + + iewExecStoreInsts + .name(name() + ".EXEC:stores") + .desc("Number of stores executed") + .flags(total) + ; + iewExecStoreInsts = exe_refs - iewExecLoadInsts; +/* + for (int i=0; inumber_of_threads) + .name(name() + ".WB:sent") + .desc("cumulative count of insts sent to commit") + .flags(total) + ; + + writeback_count + .init(cpu->number_of_threads) + .name(name() + ".WB:count") + .desc("cumulative count of insts written-back") + .flags(total) + ; + + producer_inst + .init(cpu->number_of_threads) + .name(name() + ".WB:producers") + .desc("num instructions producing a value") + .flags(total) + ; + + consumer_inst + .init(cpu->number_of_threads) + .name(name() + ".WB:consumers") + .desc("num instructions consuming a value") + .flags(total) + ; + + wb_penalized + .init(cpu->number_of_threads) + .name(name() + ".WB:penalized") + .desc("number of instrctions required to write to 'other' IQ") + .flags(total) + ; + + wb_penalized_rate + .name(name() + ".WB:penalized_rate") + .desc ("fraction of instructions written-back that wrote to 'other' IQ") + .flags(total) + ; + + wb_penalized_rate = wb_penalized / writeback_count; + + wb_fanout + .name(name() + ".WB:fanout") + .desc("average fanout of values written-back") + .flags(total) + ; + + wb_fanout = producer_inst / consumer_inst; + + wb_rate + .name(name() + ".WB:rate") + .desc("insts written-back per cycle") + .flags(total) + ; + wb_rate = writeback_count / cpu->numCycles; } template @@ -990,6 +1104,8 @@ DefaultIEW::dispatchInsts(unsigned tid) instQueue.advanceTail(inst); + 
exe_nop[tid]++; + add_to_iq = false; } else if (inst->isExecuted()) { assert(0 && "Instruction shouldn't be executed.\n"); @@ -1124,11 +1240,11 @@ DefaultIEW::executeInsts() // event adds the instruction to the queue to commit fault = ldstQueue.executeLoad(inst); - ++iewExecLoadInsts; +// ++iewExecLoadInsts; } else if (inst->isStore()) { ldstQueue.executeStore(inst); - ++iewExecStoreInsts; +// ++iewExecStoreInsts; // If the store had a fault then it may not have a mem req if (inst->req && !(inst->req->flags & LOCKED)) { @@ -1146,13 +1262,13 @@ DefaultIEW::executeInsts() } else { inst->execute(); - ++iewExecutedInsts; - inst->setExecuted(); instToCommit(inst); } + updateExeInstStats(inst); + // Check if branch was correct. This check happens after the // instruction is added to the queue because even if the branch // is mispredicted, the branch instruction itself is still valid. @@ -1243,17 +1359,20 @@ DefaultIEW::writebackInsts() for (int inst_num = 0; inst_num < issueWidth && toCommit->insts[inst_num]; inst_num++) { DynInstPtr inst = toCommit->insts[inst_num]; + int tid = inst->threadNumber; DPRINTF(IEW, "Sending instructions to commit, PC %#x.\n", inst->readPC()); + iewInstsToCommit[tid]++; + // Some instructions will be sent to commit without having // executed because they need commit to handle them. // E.g. Uncached loads have not actually executed when they // are first sent to commit. Instead commit must tell the LSQ // when it's ready to execute the uncached load. 
if (!inst->isSquashed() && inst->isExecuted()) { - instQueue.wakeDependents(inst); + int dependents = instQueue.wakeDependents(inst); for (int i = 0; i < inst->numDestRegs(); i++) { //mark as Ready @@ -1261,6 +1380,10 @@ DefaultIEW::writebackInsts() inst->renamedDestRegIdx(i)); scoreboard->setReg(inst->renamedDestRegIdx(i)); } + + producer_inst[tid]++; + consumer_inst[tid]+= dependents; + writeback_count[tid]++; } } } @@ -1390,3 +1513,39 @@ DefaultIEW::tick() cpu->activityThisCycle(); } } + +template +void +DefaultIEW::updateExeInstStats(DynInstPtr &inst) +{ + int thread_number = inst->threadNumber; + + // + // Pick off the software prefetches + // +#ifdef TARGET_ALPHA + if (inst->isDataPrefetch()) + exe_swp[thread_number]++; + else + iewExecutedInsts++; +#else + iewExecutedInsts[thread_number]++; +#endif + + // + // Control operations + // + if (inst->isControl()) + exe_branches[thread_number]++; + + // + // Memory operations + // + if (inst->isMemRef()) { + exe_refs[thread_number]++; + + if (inst->isLoad()) { + iewExecLoadInsts[thread_number]++; + } + } +} diff --git a/cpu/o3/inst_queue.hh b/cpu/o3/inst_queue.hh index 283bbdc22..06d9937f2 100644 --- a/cpu/o3/inst_queue.hh +++ b/cpu/o3/inst_queue.hh @@ -185,7 +185,7 @@ class InstructionQueue void commit(const InstSeqNum &inst, unsigned tid = 0); /** Wakes all dependents of a completed instruction. */ - void wakeDependents(DynInstPtr &completed_inst); + int wakeDependents(DynInstPtr &completed_inst); /** Adds a ready memory instruction to the ready list. */ void addReadyMemInst(DynInstPtr &ready_inst); @@ -479,6 +479,7 @@ class InstructionQueue /** Stat for number of non-speculative instructions added. */ Stats::Scalar<> iqNonSpecInstsAdded; // Stats::Scalar<> iqIntInstsAdded; + Stats::Scalar<> iqInstsIssued; /** Stat for number of integer instructions issued. 
*/ Stats::Scalar<> iqIntInstsIssued; // Stats::Scalar<> iqFloatInstsAdded; @@ -505,6 +506,20 @@ class InstructionQueue */ Stats::Scalar<> iqSquashedNonSpecRemoved; + Stats::VectorDistribution<> queue_res_dist; + Stats::Vector<> n_issued_dist; + Stats::VectorDistribution<> issue_delay_dist; + + Stats::Vector<> stat_fu_busy; +// Stats::Vector<> dist_unissued; + Stats::Vector2d<> stat_issued_inst_type; + + Stats::Formula issue_rate; +// Stats::Formula issue_stores; +// Stats::Formula issue_op_rate; + Stats::Vector<> fu_busy; //cumulative fu busy + + Stats::Formula fu_busy_rate; }; #endif //__CPU_O3_INST_QUEUE_HH__ diff --git a/cpu/o3/inst_queue_impl.hh b/cpu/o3/inst_queue_impl.hh index cfdd25cd5..804bc2472 100644 --- a/cpu/o3/inst_queue_impl.hh +++ b/cpu/o3/inst_queue_impl.hh @@ -224,6 +224,7 @@ template void InstructionQueue::regStats() { + using namespace Stats; iqInstsAdded .name(name() + ".iqInstsAdded") .desc("Number of instructions added to the IQ (excludes non-spec)") @@ -236,6 +237,11 @@ InstructionQueue::regStats() // iqIntInstsAdded; + iqInstsIssued + .name(name() + ".iqInstsIssued") + .desc("Number of instructions issued") + .prereq(iqInstsIssued); + iqIntInstsIssued .name(name() + ".iqIntInstsIssued") .desc("Number of integer instructions issued") @@ -291,6 +297,103 @@ InstructionQueue::regStats() .desc("Number of squashed non-spec instructions that were removed") .prereq(iqSquashedNonSpecRemoved); + queue_res_dist + .init(Num_OpClasses, 0, 99, 2) + .name(name() + ".IQ:residence:") + .desc("cycles from dispatch to issue") + .flags(total | pdf | cdf ) + ; + for (int i = 0; i < Num_OpClasses; ++i) { + queue_res_dist.subname(i, opClassStrings[i]); + } + n_issued_dist + .init(totalWidth + 1) + .name(name() + ".ISSUE:issued_per_cycle") + .desc("Number of insts issued each cycle") + .flags(total | pdf | dist) + ; +/* + dist_unissued + .init(Num_OpClasses+2) + .name(name() + ".ISSUE:unissued_cause") + .desc("Reason ready instruction not issued") + .flags(pdf | 
dist) + ; + for (int i=0; i < (Num_OpClasses + 2); ++i) { + dist_unissued.subname(i, unissued_names[i]); + } +*/ + stat_issued_inst_type + .init(numThreads,Num_OpClasses) + .name(name() + ".ISSUE:FU_type") + .desc("Type of FU issued") + .flags(total | pdf | dist) + ; + stat_issued_inst_type.ysubnames(opClassStrings); + + // + // How long did instructions for a particular FU type wait prior to issue + // + + issue_delay_dist + .init(Num_OpClasses,0,99,2) + .name(name() + ".ISSUE:") + .desc("cycles from operands ready to issue") + .flags(pdf | cdf) + ; + + for (int i=0; inumCycles; +/* + issue_stores + .name(name() + ".ISSUE:stores") + .desc("Number of stores issued") + .flags(total) + ; + issue_stores = exe_refs - exe_loads; +*/ +/* + issue_op_rate + .name(name() + ".ISSUE:op_rate") + .desc("Operation issue rate") + .flags(total) + ; + issue_op_rate = issued_ops / numCycles; +*/ + stat_fu_busy + .init(Num_OpClasses) + .name(name() + ".ISSUE:fu_full") + .desc("attempts to use FU when none available") + .flags(pdf | dist) + ; + for (int i=0; i < Num_OpClasses; ++i) { + stat_fu_busy.subname(i, opClassStrings[i]); + } + + fu_busy + .init(numThreads) + .name(name() + ".ISSUE:fu_busy_cnt") + .desc("FU busy when requested") + .flags(total) + ; + + fu_busy_rate + .name(name() + ".ISSUE:fu_busy_rate") + .desc("FU busy rate (busy events/executed inst)") + .flags(total) + ; + fu_busy_rate = fu_busy / iqInstsIssued; + for ( int i=0; i < numThreads; i++) { // Tell mem dependence unit to reg stats as well. 
memDepUnit[i].regStats(); @@ -658,6 +761,8 @@ InstructionQueue::scheduleReadyInsts() int idx = fuPool->getUnit(op_class); + int tid = issuing_inst->threadNumber; + if (idx == -2) { assert(op_class == No_OpClass); @@ -666,7 +771,7 @@ InstructionQueue::scheduleReadyInsts() DPRINTF(IQ, "Thread %i: Issuing instruction PC that needs no FU" " %#x [sn:%lli]\n", - issuing_inst->threadNumber, issuing_inst->readPC(), + tid, issuing_inst->readPC(), issuing_inst->seqNum); readyInsts[op_class].pop(); @@ -685,14 +790,15 @@ InstructionQueue::scheduleReadyInsts() // Memory instructions can not be freed from the IQ until they // complete. ++freeEntries; - count[issuing_inst->threadNumber]--; + count[tid]--; issuing_inst->removeInIQ(); } else { - memDepUnit[issuing_inst->threadNumber].issue(issuing_inst); + memDepUnit[tid].issue(issuing_inst); } listOrder.erase(order_it++); + stat_issued_inst_type[tid][op_class]++; } else if (idx != -1) { int op_latency = fuPool->getOpLatency(op_class); @@ -722,7 +828,7 @@ InstructionQueue::scheduleReadyInsts() DPRINTF(IQ, "Thread %i: Issuing instruction PC %#x " "[sn:%lli]\n", - issuing_inst->threadNumber, issuing_inst->readPC(), + tid, issuing_inst->readPC(), issuing_inst->seqNum); readyInsts[op_class].pop(); @@ -741,14 +847,17 @@ InstructionQueue::scheduleReadyInsts() // Memory instructions can not be freed from the IQ until they // complete. 
++freeEntries; - count[issuing_inst->threadNumber]--; + count[tid]--; issuing_inst->removeInIQ(); } else { - memDepUnit[issuing_inst->threadNumber].issue(issuing_inst); + memDepUnit[tid].issue(issuing_inst); } listOrder.erase(order_it++); + stat_issued_inst_type[tid][op_class]++; } else { + stat_fu_busy[op_class]++; + fu_busy[tid]++; ++order_it; } } @@ -808,9 +917,11 @@ InstructionQueue::commit(const InstSeqNum &inst, unsigned tid) } template -void +int InstructionQueue::wakeDependents(DynInstPtr &completed_inst) { + int dependents = 0; + DPRINTF(IQ, "Waking dependents of completed instruction.\n"); assert(!completed_inst->isSquashed()); @@ -875,6 +986,8 @@ InstructionQueue::wakeDependents(DynInstPtr &completed_inst) curr = prev->next; prev->inst = NULL; + ++dependents; + delete prev; } @@ -886,6 +999,7 @@ InstructionQueue::wakeDependents(DynInstPtr &completed_inst) // Mark the scoreboard as having that register ready. regScoreboard[dest_reg] = true; } + return dependents; } template diff --git a/cpu/o3/rename.hh b/cpu/o3/rename.hh index d5beccde9..c6f8f97aa 100644 --- a/cpu/o3/rename.hh +++ b/cpu/o3/rename.hh @@ -90,7 +90,7 @@ class DefaultRename Squashing, Blocked, Unblocking, - BarrierStall + SerializeStall }; private: @@ -359,8 +359,8 @@ class DefaultRename /** Tracks which stages are telling decode to stall. */ Stalls stalls[Impl::MaxThreads]; - /** The barrier instruction that rename has stalled on. */ - DynInstPtr barrierInst[Impl::MaxThreads]; + /** The serialize instruction that rename has stalled on. */ + DynInstPtr serializeInst[Impl::MaxThreads]; /** Records if rename needs to serialize on the next instruction for any * thread. @@ -419,8 +419,8 @@ class DefaultRename Stats::Scalar<> renameIdleCycles; /** Stat for total number of cycles spent blocking. */ Stats::Scalar<> renameBlockCycles; - /** Stat for total number of cycles spent stalling for a barrier. 
*/ - Stats::Scalar<> renameBarrierCycles; + /** Stat for total number of cycles spent stalling for a serializing inst. */ + Stats::Scalar<> renameSerializeStallCycles; /** Stat for total number of cycles spent running normally. */ Stats::Scalar<> renameRunCycles; /** Stat for total number of cycles spent unblocking. */ @@ -446,6 +446,8 @@ class DefaultRename Stats::Scalar<> renameCommittedMaps; /** Stat for total number of mappings that were undone due to a squash. */ Stats::Scalar<> renameUndoneMaps; + Stats::Scalar<> renamedSerializing; + Stats::Scalar<> renamedTempSerializing; }; #endif // __CPU_O3_RENAME_HH__ diff --git a/cpu/o3/rename_impl.hh b/cpu/o3/rename_impl.hh index 441118ef1..e29211921 100644 --- a/cpu/o3/rename_impl.hh +++ b/cpu/o3/rename_impl.hh @@ -53,7 +53,7 @@ DefaultRename::DefaultRename(Params *params) stalls[i].iew = false; stalls[i].commit = false; - barrierInst[i] = NULL; + serializeInst[i] = NULL; instsInProgress[i] = 0; @@ -78,69 +78,79 @@ void DefaultRename::regStats() { renameSquashCycles - .name(name() + ".renameSquashCycles") + .name(name() + ".RENAME:SquashCycles") .desc("Number of cycles rename is squashing") .prereq(renameSquashCycles); renameIdleCycles - .name(name() + ".renameIdleCycles") + .name(name() + ".RENAME:IdleCycles") .desc("Number of cycles rename is idle") .prereq(renameIdleCycles); renameBlockCycles - .name(name() + ".renameBlockCycles") + .name(name() + ".RENAME:BlockCycles") .desc("Number of cycles rename is blocking") .prereq(renameBlockCycles); - renameBarrierCycles - .name(name() + ".renameBarrierCycles") - .desc("Number of cycles rename is blocking due to a barrier stall") - .prereq(renameBarrierCycles); + renameSerializeStallCycles + .name(name() + ".RENAME:serializeStallCycles") + .desc("count of cycles rename stalled for serializing inst") + .flags(Stats::total); renameRunCycles - .name(name() + ".renameRunCycles") + .name(name() + ".RENAME:RunCycles") .desc("Number of cycles rename is running") 
.prereq(renameIdleCycles); renameUnblockCycles - .name(name() + ".renameUnblockCycles") + .name(name() + ".RENAME:UnblockCycles") .desc("Number of cycles rename is unblocking") .prereq(renameUnblockCycles); renameRenamedInsts - .name(name() + ".renameRenamedInsts") + .name(name() + ".RENAME:RenamedInsts") .desc("Number of instructions processed by rename") .prereq(renameRenamedInsts); renameSquashedInsts - .name(name() + ".renameSquashedInsts") + .name(name() + ".RENAME:SquashedInsts") .desc("Number of squashed instructions processed by rename") .prereq(renameSquashedInsts); renameROBFullEvents - .name(name() + ".renameROBFullEvents") + .name(name() + ".RENAME:ROBFullEvents") .desc("Number of times rename has blocked due to ROB full") .prereq(renameROBFullEvents); renameIQFullEvents - .name(name() + ".renameIQFullEvents") + .name(name() + ".RENAME:IQFullEvents") .desc("Number of times rename has blocked due to IQ full") .prereq(renameIQFullEvents); renameLSQFullEvents - .name(name() + ".renameLSQFullEvents") + .name(name() + ".RENAME:LSQFullEvents") .desc("Number of times rename has blocked due to LSQ full") .prereq(renameLSQFullEvents); renameFullRegistersEvents - .name(name() + ".renameFullRegisterEvents") + .name(name() + ".RENAME:FullRegisterEvents") .desc("Number of times there has been no free registers") .prereq(renameFullRegistersEvents); renameRenamedOperands - .name(name() + ".renameRenamedOperands") + .name(name() + ".RENAME:RenamedOperands") .desc("Number of destination operands rename has renamed") .prereq(renameRenamedOperands); renameRenameLookups - .name(name() + ".renameRenameLookups") + .name(name() + ".RENAME:RenameLookups") .desc("Number of register rename lookups that rename has made") .prereq(renameRenameLookups); renameCommittedMaps - .name(name() + ".renameCommittedMaps") + .name(name() + ".RENAME:CommittedMaps") .desc("Number of HB maps that are committed") .prereq(renameCommittedMaps); renameUndoneMaps - .name(name() + ".renameUndoneMaps") 
+ .name(name() + ".RENAME:UndoneMaps") .desc("Number of HB maps that are undone due to squashing") .prereq(renameUndoneMaps); + renamedSerializing + .name(name() + ".RENAME:serializingInsts") + .desc("count of serializing insts renamed") + .flags(Stats::total) + ; + renamedTempSerializing + .name(name() + ".RENAME:tempSerializingInsts") + .desc("count of temporary serializing insts renamed") + .flags(Stats::total) + ; } template @@ -254,7 +264,7 @@ DefaultRename::squash(unsigned tid) // cycle and there should be space to hold everything due to the squash. if (renameStatus[tid] == Blocked || renameStatus[tid] == Unblocking || - renameStatus[tid] == BarrierStall) { + renameStatus[tid] == SerializeStall) { #if !FULL_SYSTEM // In syscall emulation, we can have both a block and a squash due // to a syscall in the same cycle. This would cause both signals to @@ -267,7 +277,7 @@ DefaultRename::squash(unsigned tid) #else toDecode->renameUnblock[tid] = 1; #endif - barrierInst[tid] = NULL; + serializeInst[tid] = NULL; } // Set the status to Squashing. @@ -370,8 +380,8 @@ DefaultRename::rename(bool &status_change, unsigned tid) ++renameBlockCycles; } else if (renameStatus[tid] == Squashing) { ++renameSquashCycles; - } else if (renameStatus[tid] == BarrierStall) { - ++renameBarrierCycles; + } else if (renameStatus[tid] == SerializeStall) { + ++renameSerializeStallCycles; } if (renameStatus[tid] == Running || @@ -535,14 +545,18 @@ DefaultRename::renameInsts(unsigned tid) if (inst->isSerializeBefore() && !inst->isSerializeHandled()) { DPRINTF(Rename, "Serialize before instruction encountered.\n"); - if (!inst->isTempSerializeBefore()) + if (!inst->isTempSerializeBefore()) { + renamedSerializing++; inst->setSerializeHandled(); + } else { + renamedTempSerializing++; + } - // Change status over to BarrierStall so that other stages know + // Change status over to SerializeStall so that other stages know // what this is blocked on. 
- renameStatus[tid] = BarrierStall; + renameStatus[tid] = SerializeStall; - barrierInst[tid] = inst; + serializeInst[tid] = inst; blockThisCycle = true; @@ -716,9 +730,9 @@ DefaultRename::block(unsigned tid) wroteToTimeBuffer = true; } - // Rename can not go from BarrierStall to Blocked, otherwise it would - // not know to complete the barrier stall. - if (renameStatus[tid] != BarrierStall) { + // Rename can not go from SerializeStall to Blocked, otherwise it would + // not know to complete the serialize stall. + if (renameStatus[tid] != SerializeStall) { // Set status to Blocked. renameStatus[tid] = Blocked; return true; @@ -735,7 +749,7 @@ DefaultRename::unblock(unsigned tid) DPRINTF(Rename, "[tid:%u]: Trying to unblock.\n", tid); // Rename is done unblocking if the skid buffer is empty. - if (skidBuffer[tid].empty() && renameStatus[tid] != BarrierStall) { + if (skidBuffer[tid].empty() && renameStatus[tid] != SerializeStall) { DPRINTF(Rename, "[tid:%u]: Done unblocking.\n", tid); @@ -1008,9 +1022,9 @@ DefaultRename::checkStall(unsigned tid) } else if (renameMap[tid]->numFreeEntries() <= 0) { DPRINTF(Rename,"[tid:%i]: Stall: RenameMap has 0 free entries.\n", tid); ret_val = true; - } else if (renameStatus[tid] == BarrierStall && + } else if (renameStatus[tid] == SerializeStall && (!emptyROB[tid] || instsInProgress[tid])) { - DPRINTF(Rename,"[tid:%i]: Stall: Barrier stall and ROB is not " + DPRINTF(Rename,"[tid:%i]: Stall: Serialize stall and ROB is not " "empty.\n", tid); ret_val = true; @@ -1064,7 +1078,7 @@ DefaultRename::checkSignalsAndUpdate(unsigned tid) // if so then go to unblocking // If status was Squashing // check if squashing is not high. Switch to running this cycle. 
- // If status was barrier stall + // If status was serialize stall // check if ROB is empty and no insts are in flight to the ROB readFreeEntries(tid); @@ -1113,12 +1127,12 @@ DefaultRename::checkSignalsAndUpdate(unsigned tid) return false; } - if (renameStatus[tid] == BarrierStall) { + if (renameStatus[tid] == SerializeStall) { // Stall ends once the ROB is free. - DPRINTF(Rename, "[tid:%u]: Done with barrier stall, switching to " + DPRINTF(Rename, "[tid:%u]: Done with serialize stall, switching to " "unblocking.\n", tid); - DynInstPtr barr_inst = barrierInst[tid]; + DynInstPtr serial_inst = serializeInst[tid]; renameStatus[tid] = Unblocking; @@ -1126,21 +1140,21 @@ DefaultRename::checkSignalsAndUpdate(unsigned tid) DPRINTF(Rename, "[tid:%u]: Processing instruction [%lli] with " "PC %#x.\n", - tid, barr_inst->seqNum, barr_inst->readPC()); + tid, serial_inst->seqNum, serial_inst->readPC()); // Put instruction into queue here. - barr_inst->clearSerializeBefore(); + serial_inst->clearSerializeBefore(); if (!skidBuffer[tid].empty()) { - skidBuffer[tid].push_front(barr_inst); + skidBuffer[tid].push_front(serial_inst); } else { - insts[tid].push_front(barr_inst); + insts[tid].push_front(serial_inst); } DPRINTF(Rename, "[tid:%u]: Instruction must be processed by rename." " Adding to front of list.", tid); - barrierInst[tid] = NULL; + serializeInst[tid] = NULL; return true; } From e704960c80033dd008907caa7c24742a1020d302 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Mon, 24 Apr 2006 17:10:06 -0400 Subject: [PATCH 14/50] Updates to Ozone model for quiesce, store conditionals. 
--HG-- extra : convert_revision : 72ddd75ad0b5783aca9484e7d178c2915ee8e355 --- cpu/ozone/cpu.hh | 122 +++++++++++++++++++++++++++++++--- cpu/ozone/cpu_impl.hh | 62 ++++------------- cpu/ozone/dyn_inst_impl.hh | 1 + cpu/ozone/front_end.hh | 10 +-- cpu/ozone/front_end_impl.hh | 69 +++++++++++++++---- cpu/ozone/lw_back_end.hh | 6 +- cpu/ozone/lw_back_end_impl.hh | 61 ++++++++++------- cpu/ozone/lw_lsq.hh | 25 ++++--- cpu/ozone/lw_lsq_impl.hh | 32 +++++++-- 9 files changed, 272 insertions(+), 116 deletions(-) diff --git a/cpu/ozone/cpu.hh b/cpu/ozone/cpu.hh index 17e0f5c42..d37d3360c 100644 --- a/cpu/ozone/cpu.hh +++ b/cpu/ozone/cpu.hh @@ -42,7 +42,6 @@ #include "cpu/pc_event.hh" #include "cpu/static_inst.hh" #include "mem/mem_interface.hh" -#include "mem/page_table.hh" #include "sim/eventq.hh" // forward declarations @@ -59,7 +58,6 @@ class GDBListener; #else -class PageTable; class Process; #endif // FULL_SYSTEM @@ -349,9 +347,8 @@ class OzoneCPU : public BaseCPU // L1 data cache MemInterface *dcacheInterface; -#if !FULL_SYSTEM - PageTable *pTable; -#endif + /** Pointer to memory. */ + FunctionalMemory *mem; FrontEnd *frontEnd; @@ -428,24 +425,62 @@ class OzoneCPU : public BaseCPU int getInstAsid() { return thread.asid; } int getDataAsid() { return thread.asid; } + Fault dummyTranslation(MemReqPtr &req) + { +#if 0 + assert((req->vaddr >> 48 & 0xffff) == 0); +#endif + + // put the asid in the upper 16 bits of the paddr + req->paddr = req->vaddr & ~((Addr)0xffff << sizeof(Addr) * 8 - 16); + req->paddr = req->paddr | (Addr)req->asid << sizeof(Addr) * 8 - 16; + return NoFault; + } + /** Translates instruction requestion in syscall emulation mode. */ Fault translateInstReq(MemReqPtr &req) { - return this->pTable->translate(req); + return dummyTranslation(req); } /** Translates data read request in syscall emulation mode. 
*/ Fault translateDataReadReq(MemReqPtr &req) { - return this->pTable->translate(req); + return dummyTranslation(req); } /** Translates data write request in syscall emulation mode. */ Fault translateDataWriteReq(MemReqPtr &req) { - return this->pTable->translate(req); + return dummyTranslation(req); } #endif + + /** Old CPU read from memory function. No longer used. */ + template + Fault read(MemReqPtr &req, T &data) + { +// panic("CPU READ NOT IMPLEMENTED W/NEW MEMORY\n"); +#if 0 +#if FULL_SYSTEM && defined(TARGET_ALPHA) + if (req->flags & LOCKED) { + req->xc->setMiscReg(TheISA::Lock_Addr_DepTag, req->paddr); + req->xc->setMiscReg(TheISA::Lock_Flag_DepTag, true); + } +#endif +#endif + Fault error; + if (req->flags & LOCKED) { +// lockAddr = req->paddr; + lockFlag = true; + } + + error = this->mem->read(req, data); + data = gtoh(data); + return error; + } + + /** CPU read function, forwards read to LSQ. */ template Fault read(MemReqPtr &req, T &data, int load_idx) @@ -453,6 +488,75 @@ class OzoneCPU : public BaseCPU return backEnd->read(req, data, load_idx); } + /** Old CPU write to memory function. No longer used. */ + template + Fault write(MemReqPtr &req, T &data) + { +#if 0 +#if FULL_SYSTEM && defined(TARGET_ALPHA) + ExecContext *xc; + + // If this is a store conditional, act appropriately + if (req->flags & LOCKED) { + xc = req->xc; + + if (req->flags & UNCACHEABLE) { + // Don't update result register (see stq_c in isa_desc) + req->result = 2; + xc->setStCondFailures(0);//Needed? 
[RGD] + } else { + bool lock_flag = xc->readMiscReg(TheISA::Lock_Flag_DepTag); + Addr lock_addr = xc->readMiscReg(TheISA::Lock_Addr_DepTag); + req->result = lock_flag; + if (!lock_flag || + ((lock_addr & ~0xf) != (req->paddr & ~0xf))) { + xc->setMiscReg(TheISA::Lock_Flag_DepTag, false); + xc->setStCondFailures(xc->readStCondFailures() + 1); + if (((xc->readStCondFailures()) % 100000) == 0) { + std::cerr << "Warning: " + << xc->readStCondFailures() + << " consecutive store conditional failures " + << "on cpu " << req->xc->readCpuId() + << std::endl; + } + return NoFault; + } + else xc->setStCondFailures(0); + } + } + + // Need to clear any locked flags on other proccessors for + // this address. Only do this for succsful Store Conditionals + // and all other stores (WH64?). Unsuccessful Store + // Conditionals would have returned above, and wouldn't fall + // through. + for (int i = 0; i < this->system->execContexts.size(); i++){ + xc = this->system->execContexts[i]; + if ((xc->readMiscReg(TheISA::Lock_Addr_DepTag) & ~0xf) == + (req->paddr & ~0xf)) { + xc->setMiscReg(TheISA::Lock_Flag_DepTag, false); + } + } + +#endif +#endif + + if (req->flags & LOCKED) { + if (req->flags & UNCACHEABLE) { + req->result = 2; + } else { + if (this->lockFlag/* && this->lockAddr == req->paddr*/) { + req->result = 1; + } else { + req->result = 0; + return NoFault; + } + } + } + + return this->mem->write(req, (T)htog(data)); + } + /** CPU write function, forwards write to LSQ. 
*/ template Fault write(MemReqPtr &req, T &data, int store_idx) @@ -507,6 +611,8 @@ class OzoneCPU : public BaseCPU bool stall; }; TimeBuffer comm; + + bool lockFlag; }; #endif // __CPU_OZONE_CPU_HH__ diff --git a/cpu/ozone/cpu_impl.hh b/cpu/ozone/cpu_impl.hh index c205ad319..a7bc61603 100644 --- a/cpu/ozone/cpu_impl.hh +++ b/cpu/ozone/cpu_impl.hh @@ -149,12 +149,14 @@ OzoneCPU::DCacheCompletionEvent::description() template OzoneCPU::OzoneCPU(Params *p) #if FULL_SYSTEM - : BaseCPU(p), thread(this, 0, p->mem), tickEvent(this, p->width), + : BaseCPU(p), thread(this, 0, p->mem), tickEvent(this, p->width), mem(p->mem), #else : BaseCPU(p), thread(this, 0, p->workload[0], 0), tickEvent(this, p->width), + mem(p->workload[0]->getMemory()), #endif comm(5, 5) { + frontEnd = new FrontEnd(p); backEnd = new BackEnd(p); @@ -245,51 +247,7 @@ OzoneCPU::OzoneCPU(Params *p) globalSeqNum = 1; checkInterrupts = false; -/* - fetchRedirBranch = true; - fetchRedirExcp = true; - // Need to initialize the rename maps, and the head and tail pointers. - robHeadPtr = new DynInst(this); - robTailPtr = new DynInst(this); - - robHeadPtr->setNextInst(robTailPtr); -// robHeadPtr->setPrevInst(NULL); -// robTailPtr->setNextInst(NULL); - robTailPtr->setPrevInst(robHeadPtr); - - robHeadPtr->setCompleted(); - robTailPtr->setCompleted(); - - for (int i = 0; i < ISA::TotalNumRegs; ++i) { - renameTable[i] = new DynInst(this); - commitTable[i] = new DynInst(this); - - renameTable[i]->setCompleted(); - commitTable[i]->setCompleted(); - } - -#if FULL_SYSTEM - for (int i = 0; i < ISA::NumIntRegs; ++i) { - palShadowTable[i] = new DynInst(this); - palShadowTable[i]->setCompleted(); - } -#endif - - // Size of cache block. - cacheBlkSize = icacheInterface ? icacheInterface->getBlockSize() : 64; - - // Create mask to get rid of offset bits. - cacheBlkMask = (cacheBlkSize - 1); - - // Get the size of an instruction. - instSize = sizeof(MachInst); - - // Create space to store a cache line. 
- cacheData = new uint8_t[cacheBlkSize]; - - cacheBlkValid = false; -*/ for (int i = 0; i < TheISA::TotalNumRegs; ++i) { thread.renameTable[i] = new DynInst(this); thread.renameTable[i]->setCompleted(); @@ -299,9 +257,11 @@ OzoneCPU::OzoneCPU(Params *p) backEnd->renameTable.copyFrom(thread.renameTable); #if !FULL_SYSTEM - pTable = p->pTable; +// pTable = p->pTable; #endif + lockFlag = 0; + DPRINTF(OzoneCPU, "OzoneCPU: Created Ozone cpu object.\n"); } @@ -392,6 +352,7 @@ OzoneCPU::activateContext(int thread_num, int delay) scheduleTickEvent(delay); _status = Running; thread._status = ExecContext::Active; + frontEnd->wakeFromQuiesce(); } template @@ -401,8 +362,8 @@ OzoneCPU::suspendContext(int thread_num) // Eventually change this in SMT. assert(thread_num == 0); // assert(xcProxy); - - assert(_status == Running); + // @todo: Figure out how to initially set the status properly so this is running. +// assert(_status == Running); notIdleFraction--; unscheduleTickEvent(); _status = Idle; @@ -665,6 +626,7 @@ OzoneCPU::tick() { DPRINTF(OzoneCPU, "\n\nOzoneCPU: Ticking cpu.\n"); + _status = Running; thread.renameTable[ZeroReg]->setIntResult(0); thread.renameTable[ZeroReg+TheISA::FP_Base_DepTag]-> setDoubleResult(0.0); @@ -756,7 +718,7 @@ OzoneCPU::tick() // check for instruction-count-based events comInstEventQueue[0]->serviceEvents(numInst); - if (!tickEvent.scheduled()) + if (!tickEvent.scheduled() && _status == Running) tickEvent.schedule(curTick + 1); } @@ -821,6 +783,8 @@ OzoneCPU::hwrei() thread.setNextPC(thread.readMiscReg(AlphaISA::IPR_EXC_ADDR)); + lockFlag = false; + // Not sure how to make a similar check in the Ozone model // if (!misspeculating()) { kernelStats->hwrei(); diff --git a/cpu/ozone/dyn_inst_impl.hh b/cpu/ozone/dyn_inst_impl.hh index 2d86ced62..c83481c9a 100644 --- a/cpu/ozone/dyn_inst_impl.hh +++ b/cpu/ozone/dyn_inst_impl.hh @@ -237,6 +237,7 @@ OzoneDynInst::hwrei() this->cpu->kernelStats->hwrei(); this->cpu->checkInterrupts = true; + 
this->cpu->lockFlag = false; // FIXME: XXX check for interrupts? XXX return NoFault; diff --git a/cpu/ozone/front_end.hh b/cpu/ozone/front_end.hh index 251f4200c..2bff2544d 100644 --- a/cpu/ozone/front_end.hh +++ b/cpu/ozone/front_end.hh @@ -60,7 +60,7 @@ class FrontEnd const bool is_branch = false, const bool branch_taken = false); DynInstPtr getInst(); - void processCacheCompletion(); + void processCacheCompletion(MemReqPtr &req); void addFreeRegs(int num_freed); @@ -109,6 +109,7 @@ class FrontEnd SerializeBlocked, SerializeComplete, RenameBlocked, + QuiescePending, BEBlocked }; @@ -130,17 +131,16 @@ class FrontEnd class ICacheCompletionEvent : public Event { private: + MemReqPtr req; FrontEnd *frontEnd; public: - ICacheCompletionEvent(FrontEnd *_fe); + ICacheCompletionEvent(MemReqPtr &_req, FrontEnd *_fe); virtual void process(); virtual const char *description(); }; - ICacheCompletionEvent cacheCompletionEvent; - MemInterface *icacheInterface; #if !FULL_SYSTEM @@ -174,6 +174,8 @@ class FrontEnd void setPC(Addr val) { PC = val; } void setNextPC(Addr val) { nextPC = val; } + void wakeFromQuiesce(); + void dumpInsts(); private: diff --git a/cpu/ozone/front_end_impl.hh b/cpu/ozone/front_end_impl.hh index af452fe95..7c18386cf 100644 --- a/cpu/ozone/front_end_impl.hh +++ b/cpu/ozone/front_end_impl.hh @@ -1,4 +1,5 @@ +#include "arch/faults.hh" #include "arch/isa_traits.hh" #include "base/statistics.hh" #include "cpu/exec_context.hh" @@ -12,7 +13,6 @@ using namespace TheISA; template FrontEnd::FrontEnd(Params *params) : branchPred(params), - cacheCompletionEvent(this), icacheInterface(params->icacheInterface), instBufferSize(0), maxInstBufferSize(params->maxInstBufferSize), @@ -26,10 +26,12 @@ FrontEnd::FrontEnd(Params *params) // Setup branch predictor. // Setup Memory Request +/* memReq = new MemReq(); memReq->asid = 0; memReq->data = new uint8_t[64]; - +*/ + memReq = NULL; // Size of cache block. cacheBlkSize = icacheInterface ? 
icacheInterface->getBlockSize() : 64; @@ -46,7 +48,7 @@ FrontEnd::FrontEnd(Params *params) cacheBlkValid = false; #if !FULL_SYSTEM - pTable = params->pTable; +// pTable = params->pTable; #endif fetchFault = NoFault; } @@ -72,7 +74,7 @@ void FrontEnd::setXC(ExecContext *xc_ptr) { xc = xc_ptr; - memReq->xc = xc; +// memReq->xc = xc; } template @@ -269,6 +271,9 @@ FrontEnd::tick() } updateStatus(); return; + } else if (status == QuiescePending) { + DPRINTF(FE, "Waiting for quiesce to execute or get squashed.\n"); + return; } else if (status != IcacheMissComplete) { if (fetchCacheLineNextCycle) { Fault fault = fetchCacheLine(); @@ -325,6 +330,14 @@ FrontEnd::tick() // rename(num_inst); // } +#if FULL_SYSTEM + if (inst->isQuiesce()) { + warn("%lli: Quiesce instruction encountered, halting fetch!", curTick); + status = QuiescePending; + break; + } +#endif + if (inst->predTaken()) { // Start over with tick? break; @@ -364,6 +377,12 @@ FrontEnd::fetchCacheLine() // Setup the memReq to do a read of the first isntruction's address. // Set the appropriate read size and flags as well. + memReq = new MemReq(); + + memReq->asid = 0; + memReq->thread_num = 0; + memReq->data = new uint8_t[64]; + memReq->xc = xc; memReq->cmd = Read; memReq->reset(fetch_PC, cacheBlkSize, flags); @@ -377,16 +396,26 @@ FrontEnd::fetchCacheLine() // Now do the timing access to see whether or not the instruction // exists within the cache. if (icacheInterface && fault == NoFault) { +#if FULL_SYSTEM + if (cpu->system->memctrl->badaddr(memReq->paddr)) { + DPRINTF(FE, "Fetch: Bad address %#x (hopefully on a " + "misspeculating path!", + memReq->paddr); + return TheISA::genMachineCheckFault(); + } +#endif + memReq->completionEvent = NULL; memReq->time = curTick; + fault = cpu->mem->read(memReq, cacheData); MemAccessResult res = icacheInterface->access(memReq); // If the cache missed then schedule an event to wake // up this stage once the cache miss completes. 
if (icacheInterface->doEvents() && res != MA_HIT) { - memReq->completionEvent = new ICacheCompletionEvent(this); + memReq->completionEvent = new ICacheCompletionEvent(memReq, this); status = IcacheMissStall; @@ -398,7 +427,7 @@ FrontEnd::fetchCacheLine() cacheBlkValid = true; - memcpy(cacheData, memReq->data, memReq->size); +// memcpy(cacheData, memReq->data, memReq->size); } } @@ -541,7 +570,8 @@ FrontEnd::squash(const InstSeqNum &squash_num, const Addr &next_PC, // Clear the icache miss if it's outstanding. if (status == IcacheMissStall && icacheInterface) { DPRINTF(FE, "Squashing outstanding Icache miss.\n"); - icacheInterface->squash(0); +// icacheInterface->squash(0); + memReq = NULL; } if (status == SerializeBlocked) { @@ -577,12 +607,13 @@ FrontEnd::getInst() template void -FrontEnd::processCacheCompletion() +FrontEnd::processCacheCompletion(MemReqPtr &req) { DPRINTF(FE, "Processing cache completion\n"); // Do something here. - if (status != IcacheMissStall) { + if (status != IcacheMissStall || + req != memReq) { DPRINTF(FE, "Previous fetch was squashed.\n"); return; } @@ -595,10 +626,11 @@ FrontEnd::processCacheCompletion() fetchStatus[tid] = IcacheMissComplete; } */ - memcpy(cacheData, memReq->data, memReq->size); +// memcpy(cacheData, memReq->data, memReq->size); // Reset the completion event to NULL. 
- memReq->completionEvent = NULL; +// memReq->completionEvent = NULL; + memReq = NULL; } template @@ -766,6 +798,15 @@ FrontEnd::renameInst(DynInstPtr &inst) } } +template +void +FrontEnd::wakeFromQuiesce() +{ + DPRINTF(FE, "Waking up from quiesce\n"); + // Hopefully this is safe + status = Running; +} + template void FrontEnd::dumpInsts() @@ -786,8 +827,8 @@ FrontEnd::dumpInsts() } template -FrontEnd::ICacheCompletionEvent::ICacheCompletionEvent(FrontEnd *fe) - : Event(&mainEventQueue, Delayed_Writeback_Pri), frontEnd(fe) +FrontEnd::ICacheCompletionEvent::ICacheCompletionEvent(MemReqPtr &_req, FrontEnd *fe) + : Event(&mainEventQueue, Delayed_Writeback_Pri), req(_req), frontEnd(fe) { this->setFlags(Event::AutoDelete); } @@ -796,7 +837,7 @@ template void FrontEnd::ICacheCompletionEvent::process() { - frontEnd->processCacheCompletion(); + frontEnd->processCacheCompletion(req); } template diff --git a/cpu/ozone/lw_back_end.hh b/cpu/ozone/lw_back_end.hh index b89957aad..f17c93ff4 100644 --- a/cpu/ozone/lw_back_end.hh +++ b/cpu/ozone/lw_back_end.hh @@ -94,8 +94,7 @@ class LWBackEnd void regStats(); - void setCPU(FullCPU *cpu_ptr) - { cpu = cpu_ptr; } + void setCPU(FullCPU *cpu_ptr); void setFrontEnd(FrontEnd *front_end_ptr) { frontEnd = front_end_ptr; } @@ -404,6 +403,9 @@ class LWBackEnd Stats::Scalar<> commit_eligible_samples; Stats::Vector<> commit_eligible; + Stats::Vector<> squashedInsts; + Stats::Vector<> ROBSquashedInsts; + Stats::Scalar<> ROB_fcount; Stats::Formula ROB_full_rate; diff --git a/cpu/ozone/lw_back_end_impl.hh b/cpu/ozone/lw_back_end_impl.hh index 115821787..d1290239c 100644 --- a/cpu/ozone/lw_back_end_impl.hh +++ b/cpu/ozone/lw_back_end_impl.hh @@ -480,6 +480,18 @@ LWBackEnd::regStats() .desc("number cycles where commit BW limit reached") ; + squashedInsts + .init(cpu->number_of_threads) + .name(name() + ".COM:squashed_insts") + .desc("Number of instructions removed from inst list") + ; + + ROBSquashedInsts + .init(cpu->number_of_threads) + 
.name(name() + ".COM:rob_squashed_insts") + .desc("Number of instructions removed from inst list when they reached the head of the ROB") + ; + ROB_fcount .name(name() + ".ROB:full_count") .desc("number of cycles where ROB was full") @@ -515,6 +527,14 @@ LWBackEnd::regStats() // IQ.regStats(); } +template +void +LWBackEnd::setCPU(FullCPU *cpu_ptr) +{ + cpu = cpu_ptr; + LSQ.setCPU(cpu_ptr); +} + template void LWBackEnd::setCommBuffer(TimeBuffer *_comm) @@ -1044,35 +1064,24 @@ LWBackEnd::commitInst(int inst_num) } } - // Now check if it's one of the special trap or barrier or - // serializing instructions. - if (inst->isThreadSync()) - { - // Not handled for now. - panic("Thread sync instructions are not handled yet.\n"); - } + // Not handled for now. + assert(!inst->isThreadSync()); // Check if the instruction caused a fault. If so, trap. Fault inst_fault = inst->getFault(); if (inst_fault != NoFault) { - if (!inst->isNop()) { - DPRINTF(BE, "Inst [sn:%lli] PC %#x has a fault\n", - inst->seqNum, inst->readPC()); - thread->setInst( - static_cast(inst->staticInst->machInst)); + DPRINTF(BE, "Inst [sn:%lli] PC %#x has a fault\n", + inst->seqNum, inst->readPC()); + thread->setInst( + static_cast(inst->staticInst->machInst)); #if FULL_SYSTEM - handleFault(inst_fault); - return false; + handleFault(inst_fault); + return false; #else // !FULL_SYSTEM - panic("fault (%d) detected @ PC %08p", inst_fault, - inst->PC); + panic("fault (%d) detected @ PC %08p", inst_fault, + inst->PC); #endif // FULL_SYSTEM - } - } - - if (inst->isControl()) { -// ++commitCommittedBranches; } int freed_regs = 0; @@ -1096,7 +1105,6 @@ LWBackEnd::commitInst(int inst_num) instList.pop_back(); --numInsts; - cpu->numInst++; thread->numInsts++; ++thread->funcExeInst; // Maybe move this to where teh fault is handled; if the fault is handled, @@ -1134,15 +1142,14 @@ template void LWBackEnd::commitInsts() { - int commit_width = commitWidth ? commitWidth : width; - // Not sure this should be a loop or not. 
int inst_num = 0; - while (!instList.empty() && inst_num < commit_width) { + while (!instList.empty() && inst_num < commitWidth) { if (instList.back()->isSquashed()) { instList.back()->clearDependents(); instList.pop_back(); --numInsts; + ROBSquashedInsts[instList.back()->threadNumber]++; continue; } @@ -1150,6 +1157,7 @@ LWBackEnd::commitInsts() DPRINTF(BE, "Can't commit, Instruction [sn:%lli] PC " "%#x is head of ROB and not ready\n", instList.back()->seqNum, instList.back()->readPC()); + --inst_num; break; } } @@ -1217,6 +1225,8 @@ LWBackEnd::squash(const InstSeqNum &sn) (*insts_it)->clearDependents(); + squashedInsts[(*insts_it)->threadNumber]++; + instList.erase(insts_it++); --numInsts; } @@ -1350,6 +1360,7 @@ LWBackEnd::updateComInstStats(DynInstPtr &inst) { unsigned thread = inst->threadNumber; + cpu->numInst++; // // Pick off the software prefetches // diff --git a/cpu/ozone/lw_lsq.hh b/cpu/ozone/lw_lsq.hh index 2b2c25b58..eb9886244 100644 --- a/cpu/ozone/lw_lsq.hh +++ b/cpu/ozone/lw_lsq.hh @@ -43,7 +43,7 @@ //#include "mem/page_table.hh" #include "sim/sim_object.hh" -class PageTable; +//class PageTable; /** * Class that implements the actual LQ and SQ for each specific thread. @@ -115,7 +115,7 @@ class OzoneLWLSQ { { be = be_ptr; } /** Sets the page table pointer. */ - void setPageTable(PageTable *pt_ptr); +// void setPageTable(PageTable *pt_ptr); /** Ticks the LSQ unit, which in this case only resets the number of * used cache ports. @@ -243,7 +243,7 @@ class OzoneLWLSQ { MemInterface *dcacheInterface; /** Pointer to the page table. 
*/ - PageTable *pTable; +// PageTable *pTable; public: struct SQEntry { @@ -562,6 +562,19 @@ OzoneLWLSQ::read(MemReqPtr &req, T &data, int load_idx) // If there's no forwarding case, then go access memory + DPRINTF(OzoneLSQ, "Doing functional access for inst PC %#x\n", + inst->readPC()); + + + // Setup MemReq pointer + req->cmd = Read; + req->completionEvent = NULL; + req->time = curTick; + assert(!req->data); + req->data = new uint8_t[64]; + Fault fault = cpu->read(req, data); + memcpy(req->data, &data, sizeof(T)); + ++usedPorts; // if we have a cache, do cache access too @@ -582,12 +595,6 @@ OzoneLWLSQ::read(MemReqPtr &req, T &data, int load_idx) "vaddr:%#x flags:%i\n", inst->readPC(), req->paddr, req->vaddr, req->flags); - // Setup MemReq pointer - req->cmd = Read; - req->completionEvent = NULL; - req->time = curTick; - assert(!req->data); - req->data = new uint8_t[64]; assert(!req->completionEvent); req->completionEvent = diff --git a/cpu/ozone/lw_lsq_impl.hh b/cpu/ozone/lw_lsq_impl.hh index 54d7ead6c..7b22d2564 100644 --- a/cpu/ozone/lw_lsq_impl.hh +++ b/cpu/ozone/lw_lsq_impl.hh @@ -131,7 +131,7 @@ OzoneLWLSQ::clearSQ() { storeQueue.clear(); } - +/* template void OzoneLWLSQ::setPageTable(PageTable *pt_ptr) @@ -139,7 +139,7 @@ OzoneLWLSQ::setPageTable(PageTable *pt_ptr) DPRINTF(OzoneLSQ, "Setting the page table pointer.\n"); pTable = pt_ptr; } - +*/ template void OzoneLWLSQ::resizeLQ(unsigned size) @@ -519,6 +519,23 @@ OzoneLWLSQ::writebackStores() req->paddr, *(req->data), inst->seqNum); + switch((*sq_it).size) { + case 1: + cpu->write(req, (uint8_t &)(*sq_it).data); + break; + case 2: + cpu->write(req, (uint16_t &)(*sq_it).data); + break; + case 4: + cpu->write(req, (uint32_t &)(*sq_it).data); + break; + case 8: + cpu->write(req, (uint64_t &)(*sq_it).data); + break; + default: + panic("Unexpected store size!\n"); + } + if (dcacheInterface) { MemAccessResult result = dcacheInterface->access(req); @@ -538,7 +555,7 @@ OzoneLWLSQ::writebackStores() typename 
BackEnd::LdWritebackEvent *wb = NULL; if (req->flags & LOCKED) { // Stx_C does not generate a system port transaction. - req->result=1; +// req->result=1; wb = new typename BackEnd::LdWritebackEvent(inst, be); } @@ -571,12 +588,12 @@ OzoneLWLSQ::writebackStores() if (req->flags & LOCKED) { // Stx_C does not generate a system port transaction. - if (req->flags & UNCACHEABLE) { +/* if (req->flags & UNCACHEABLE) { req->result = 2; } else { req->result = 1; } - +*/ typename BackEnd::LdWritebackEvent *wb = new typename BackEnd::LdWritebackEvent(inst, be); @@ -642,6 +659,11 @@ OzoneLWLSQ::squash(const InstSeqNum &squashed_num) while (stores != 0 && (*sq_it).inst->seqNum > squashed_num) { assert(!storeQueue.empty()); + + if ((*sq_it).canWB) { + break; + } + // Clear the smart pointer to make sure it is decremented. DPRINTF(OzoneLSQ,"Store Instruction PC %#x idx:%i squashed [sn:%lli]\n", (*sq_it).inst->readPC(), (*sq_it).inst->sqIdx, From 31e09892d750d0e6dc7de3d455e34808c159a420 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Mon, 24 Apr 2006 17:11:31 -0400 Subject: [PATCH 15/50] Include option for disabling PC symbols. cpu/inst_seq.hh: cpu/o3/cpu.cc: cpu/ozone/cpu_builder.cc: cpu/ozone/thread_state.hh: SE build fixes. 
--HG-- extra : convert_revision : a4df6128533105f849b5469f62d83dffe299b7df --- cpu/inst_seq.hh | 2 ++ cpu/o3/cpu.cc | 10 ++++------ cpu/ozone/cpu_builder.cc | 12 ++++++------ cpu/ozone/thread_state.hh | 5 +++-- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/cpu/inst_seq.hh b/cpu/inst_seq.hh index 8de047af7..356d19df0 100644 --- a/cpu/inst_seq.hh +++ b/cpu/inst_seq.hh @@ -29,6 +29,8 @@ #ifndef __STD_TYPES_HH__ #define __STD_TYPES_HH__ +#include + // inst sequence type, used to order instructions in the ready list, // if this rolls over the ready list order temporarily will get messed // up, but execution will continue and complete correctly diff --git a/cpu/o3/cpu.cc b/cpu/o3/cpu.cc index d322037bc..ac8c4236e 100644 --- a/cpu/o3/cpu.cc +++ b/cpu/o3/cpu.cc @@ -123,7 +123,7 @@ FullO3CPU::FullO3CPU(Params *params) physmem(system->physmem), mem(params->mem), #else - pTable(params->pTable), +// pTable(params->pTable), #endif // FULL_SYSTEM icacheInterface(params->icacheInterface), @@ -238,8 +238,8 @@ FullO3CPU::FullO3CPU(Params *params) // Setup the page table for whichever stages need it. #if !FULL_SYSTEM - fetch.setPageTable(pTable); - iew.setPageTable(pTable); +// fetch.setPageTable(pTable); +// iew.setPageTable(pTable); #endif // Setup the ROB for whichever stages need it. 
@@ -885,11 +885,9 @@ template void FullO3CPU::removeFrontInst(DynInstPtr &inst) { - unsigned tid = inst->threadNumber; - DPRINTF(FullCPU, "FullCPU: Removing committed instruction [tid:%i] PC %#x " "[sn:%lli]\n", - tid, inst->readPC(), inst->seqNum); + inst->threadNumber, inst->readPC(), inst->seqNum); removeInstsThisCycle = true; diff --git a/cpu/ozone/cpu_builder.cc b/cpu/ozone/cpu_builder.cc index 8ac6858b0..0146dd1bd 100644 --- a/cpu/ozone/cpu_builder.cc +++ b/cpu/ozone/cpu_builder.cc @@ -45,7 +45,7 @@ SimObjectParam itb; SimObjectParam dtb; #else SimObjectVectorParam workload; -SimObjectParam page_table; +//SimObjectParam page_table; #endif // FULL_SYSTEM SimObjectParam mem; @@ -159,7 +159,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivOzoneCPU) INIT_PARAM(dtb, "Data translation buffer"), #else INIT_PARAM(workload, "Processes to run"), - INIT_PARAM(page_table, "Page table"), +// INIT_PARAM(page_table, "Page table"), #endif // FULL_SYSTEM INIT_PARAM_DFLT(mem, "Memory", NULL), @@ -310,7 +310,7 @@ CREATE_SIM_OBJECT(DerivOzoneCPU) params->dtb = dtb; #else params->workload = workload; - params->pTable = page_table; +// params->pTable = page_table; #endif // FULL_SYSTEM params->mem = mem; @@ -440,7 +440,7 @@ SimObjectParam itb; SimObjectParam dtb; #else SimObjectVectorParam workload; -SimObjectParam page_table; +//SimObjectParam page_table; #endif // FULL_SYSTEM SimObjectParam mem; @@ -554,7 +554,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(SimpleOzoneCPU) INIT_PARAM(dtb, "Data translation buffer"), #else INIT_PARAM(workload, "Processes to run"), - INIT_PARAM(page_table, "Page table"), +// INIT_PARAM(page_table, "Page table"), #endif // FULL_SYSTEM INIT_PARAM_DFLT(mem, "Memory", NULL), @@ -705,7 +705,7 @@ CREATE_SIM_OBJECT(SimpleOzoneCPU) params->dtb = dtb; #else params->workload = workload; - params->pTable = page_table; +// params->pTable = page_table; #endif // FULL_SYSTEM params->mem = mem; diff --git a/cpu/ozone/thread_state.hh b/cpu/ozone/thread_state.hh index 
c6d23a63b..269fc6459 100644 --- a/cpu/ozone/thread_state.hh +++ b/cpu/ozone/thread_state.hh @@ -6,9 +6,10 @@ #include "arch/isa_traits.hh" #include "cpu/exec_context.hh" #include "cpu/thread_state.hh" +#include "sim/process.hh" class Event; -class Process; +//class Process; #if FULL_SYSTEM class EndQuiesceEvent; @@ -40,7 +41,7 @@ struct OzoneThreadState : public ThreadState { } #else OzoneThreadState(FullCPU *_cpu, int _thread_num, Process *_process, int _asid) - : ThreadState(-1, _thread_num, NULL, _process, _asid), + : ThreadState(-1, _thread_num, _process->getMemory(), _process, _asid), cpu(_cpu), inSyscall(0), trapPending(0) { memset(®s, 0, sizeof(TheISA::RegFile)); From d363d5aad72b34769c753752a779a13e11532fd8 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Mon, 24 Apr 2006 17:40:00 -0400 Subject: [PATCH 16/50] Quiesce stuff. cpu/ozone/cpu.hh: Add quiesce stat (not clear how it should be used yet). cpu/ozone/cpu_impl.hh: Fix for quiesce. --HG-- extra : convert_revision : a1998818e241374ae3f4c3cabbef885dda55c884 --- cpu/ozone/cpu.hh | 2 ++ cpu/ozone/cpu_impl.hh | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/cpu/ozone/cpu.hh b/cpu/ozone/cpu.hh index d37d3360c..56b6571a2 100644 --- a/cpu/ozone/cpu.hh +++ b/cpu/ozone/cpu.hh @@ -613,6 +613,8 @@ class OzoneCPU : public BaseCPU TimeBuffer comm; bool lockFlag; + + Stats::Scalar<> quiesceCycles; }; #endif // __CPU_OZONE_CPU_HH__ diff --git a/cpu/ozone/cpu_impl.hh b/cpu/ozone/cpu_impl.hh index a7bc61603..17d944e7c 100644 --- a/cpu/ozone/cpu_impl.hh +++ b/cpu/ozone/cpu_impl.hh @@ -413,6 +413,11 @@ OzoneCPU::regStats() .desc("Percentage of idle cycles") ; + quiesceCycles + .name(name() + ".quiesce_cycles") + .desc("Number of cycles spent in quiesce") + ; + idleFraction = constant(1.0) - notIdleFraction; frontEnd->regStats(); @@ -609,7 +614,8 @@ OzoneCPU::post_interrupt(int int_num, int index) { BaseCPU::post_interrupt(int_num, index); - if (thread._status == ExecContext::Suspended) { +// if 
(thread._status == ExecContext::Suspended) { + if (_status == Idle) { DPRINTF(IPI,"Suspended Processor awoke\n"); // thread.activate(); // Hack for now. Otherwise might have to go through the xcProxy, or From 32a52949834a5524c67a5a8d697b7e769138dc0f Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Wed, 3 May 2006 15:51:53 -0400 Subject: [PATCH 17/50] XC needs to get memory from the process. --HG-- extra : convert_revision : a2c014276824255a896a7e353f919fe81071091e --- cpu/cpu_exec_context.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpu/cpu_exec_context.cc b/cpu/cpu_exec_context.cc index 363244e60..4400cf842 100644 --- a/cpu/cpu_exec_context.cc +++ b/cpu/cpu_exec_context.cc @@ -85,7 +85,7 @@ CPUExecContext::CPUExecContext(BaseCPU *_cpu, int _thread_num, Process *_process, int _asid) : _status(ExecContext::Unallocated), cpu(_cpu), thread_num(_thread_num), cpu_id(-1), lastActivate(0), - lastSuspend(0), process(_process), mem(NULL), asid(_asid), + lastSuspend(0), process(_process), mem(process->getMemory()), asid(_asid), func_exe_inst(0), storeCondFailures(0) { memset(®s, 0, sizeof(RegFile)); From 4601230d35de7bbda5906d04a28e2387f0e5177b Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Wed, 3 May 2006 15:54:36 -0400 Subject: [PATCH 18/50] Fixes for the sampler. cpu/simple/cpu.cc: Sampler fixes. The status may be switched out when calling activate or suspend if there is a switchover during a quiesce. 
--HG-- extra : convert_revision : da026e75dfb86289484cf01c5b1ecd9b03a72bd3 --- cpu/simple/cpu.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpu/simple/cpu.cc b/cpu/simple/cpu.cc index 8db72b77e..07f9d0dad 100644 --- a/cpu/simple/cpu.cc +++ b/cpu/simple/cpu.cc @@ -144,6 +144,7 @@ SimpleCPU::SimpleCPU(Params *p) cpuXC = new CPUExecContext(this, /* thread_num */ 0, p->process, /* asid */ 0); #endif // !FULL_SYSTEM + cpuXC->setStatus(ExecContext::Suspended); xcProxy = cpuXC->getProxy(); icacheInterface = p->icache_interface; @@ -212,7 +213,7 @@ SimpleCPU::activateContext(int thread_num, int delay) assert(thread_num == 0); assert(cpuXC); - assert(_status == Idle); + assert(_status == Idle || _status == SwitchedOut); notIdleFraction++; scheduleTickEvent(delay); _status = Running; @@ -225,7 +226,7 @@ SimpleCPU::suspendContext(int thread_num) assert(thread_num == 0); assert(cpuXC); - assert(_status == Running); + assert(_status == Running || _status == SwitchedOut); notIdleFraction--; unscheduleTickEvent(); _status = Idle; @@ -418,7 +419,7 @@ SimpleCPU::read(Addr addr, T &data, unsigned flags) Fault fault = cpuXC->read(memReq,data); if (traceData) { - traceData->setAddr(addr); + traceData->setAddr(memReq->vaddr); } return fault; } From f3358e5f7b6452f14a6df5106129ef0cb2ed8b65 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Thu, 4 May 2006 11:36:20 -0400 Subject: [PATCH 19/50] O3 CPU now handles being used with the sampler. 
cpu/o3/2bit_local_pred.cc: cpu/o3/2bit_local_pred.hh: cpu/o3/bpred_unit.hh: cpu/o3/bpred_unit_impl.hh: cpu/o3/btb.cc: cpu/o3/btb.hh: cpu/o3/commit.hh: cpu/o3/commit_impl.hh: cpu/o3/cpu.cc: cpu/o3/cpu.hh: cpu/o3/decode.hh: cpu/o3/decode_impl.hh: cpu/o3/fetch.hh: cpu/o3/fetch_impl.hh: cpu/o3/fu_pool.cc: cpu/o3/fu_pool.hh: cpu/o3/iew.hh: cpu/o3/iew_impl.hh: cpu/o3/inst_queue.hh: cpu/o3/inst_queue_impl.hh: cpu/o3/lsq.hh: cpu/o3/lsq_impl.hh: cpu/o3/lsq_unit.hh: cpu/o3/lsq_unit_impl.hh: cpu/o3/mem_dep_unit.hh: cpu/o3/mem_dep_unit_impl.hh: cpu/o3/ras.cc: cpu/o3/ras.hh: cpu/o3/rename.hh: cpu/o3/rename_impl.hh: cpu/o3/rob.hh: cpu/o3/rob_impl.hh: cpu/o3/sat_counter.cc: cpu/o3/sat_counter.hh: cpu/o3/thread_state.hh: Handle switching out and taking over. Needs to be able to reset all state. cpu/o3/alpha_cpu_impl.hh: Handle taking over from another XC. --HG-- extra : convert_revision : b936e826f0f8a18319bfa940ff35097b4192b449 --- cpu/o3/2bit_local_pred.cc | 8 ++ cpu/o3/2bit_local_pred.hh | 2 + cpu/o3/alpha_cpu_impl.hh | 20 +++++ cpu/o3/bpred_unit.hh | 4 + cpu/o3/bpred_unit_impl.hh | 21 +++++ cpu/o3/btb.cc | 8 ++ cpu/o3/btb.hh | 2 + cpu/o3/commit.hh | 6 ++ cpu/o3/commit_impl.hh | 38 ++++++++- cpu/o3/cpu.cc | 76 +++++++++++++---- cpu/o3/cpu.hh | 9 +- cpu/o3/decode.hh | 5 ++ cpu/o3/decode_impl.hh | 50 +++++++++-- cpu/o3/fetch.hh | 13 +++ cpu/o3/fetch_impl.hh | 70 ++++++++++++---- cpu/o3/fu_pool.cc | 14 ++++ cpu/o3/fu_pool.hh | 3 + cpu/o3/iew.hh | 8 ++ cpu/o3/iew_impl.hh | 56 ++++++++++++- cpu/o3/inst_queue.hh | 14 +++- cpu/o3/inst_queue_impl.hh | 160 +++++++++++++++++++++++------------- cpu/o3/lsq.hh | 12 +-- cpu/o3/lsq_impl.hh | 19 ++++- cpu/o3/lsq_unit.hh | 49 +++++++++-- cpu/o3/lsq_unit_impl.hh | 90 +++++++++++++++++++- cpu/o3/mem_dep_unit.hh | 4 + cpu/o3/mem_dep_unit_impl.hh | 20 +++++ cpu/o3/ras.cc | 9 ++ cpu/o3/ras.hh | 2 + cpu/o3/rename.hh | 5 ++ cpu/o3/rename_impl.hh | 67 +++++++++++++-- cpu/o3/rob.hh | 4 + cpu/o3/rob_impl.hh | 25 ++++++ cpu/o3/sat_counter.cc | 24 +----- 
cpu/o3/sat_counter.hh | 19 ++++- cpu/o3/thread_state.hh | 2 +- 36 files changed, 786 insertions(+), 152 deletions(-) diff --git a/cpu/o3/2bit_local_pred.cc b/cpu/o3/2bit_local_pred.cc index 458fbd663..eab98531d 100644 --- a/cpu/o3/2bit_local_pred.cc +++ b/cpu/o3/2bit_local_pred.cc @@ -67,6 +67,14 @@ DefaultBP::DefaultBP(unsigned _localPredictorSize, instShiftAmt); } +void +DefaultBP::reset() +{ + for (int i = 0; i < localPredictorSets; ++i) { + localCtrs[i].reset(); + } +} + bool DefaultBP::lookup(Addr &branch_addr) { diff --git a/cpu/o3/2bit_local_pred.hh b/cpu/o3/2bit_local_pred.hh index 38d3f4842..0dfe53819 100644 --- a/cpu/o3/2bit_local_pred.hh +++ b/cpu/o3/2bit_local_pred.hh @@ -62,6 +62,8 @@ class DefaultBP */ void update(Addr &branch_addr, bool taken); + void reset(); + private: /** diff --git a/cpu/o3/alpha_cpu_impl.hh b/cpu/o3/alpha_cpu_impl.hh index 86f7d9f28..7a2d5d2b9 100644 --- a/cpu/o3/alpha_cpu_impl.hh +++ b/cpu/o3/alpha_cpu_impl.hh @@ -151,6 +151,26 @@ template void AlphaFullCPU::AlphaXC::takeOverFrom(ExecContext *old_context) { + // some things should already be set up + assert(getMemPtr() == old_context->getMemPtr()); +#if FULL_SYSTEM + assert(getSystemPtr() == old_context->getSystemPtr()); +#else + assert(getProcessPtr() == old_context->getProcessPtr()); +#endif + + // copy over functional state + setStatus(old_context->status()); + copyArchRegs(old_context); + setCpuId(old_context->readCpuId()); +#if !FULL_SYSTEM + thread->funcExeInst = old_context->readFuncExeInst(); +#endif + + old_context->setStatus(ExecContext::Unallocated); + + thread->inSyscall = false; + thread->trapPending = false; } template diff --git a/cpu/o3/bpred_unit.hh b/cpu/o3/bpred_unit.hh index 67c300989..ee7ffc183 100644 --- a/cpu/o3/bpred_unit.hh +++ b/cpu/o3/bpred_unit.hh @@ -67,6 +67,10 @@ class TwobitBPredUnit */ void regStats(); + void switchOut(); + + void takeOverFrom(); + /** * Predicts whether or not the instruction is a taken branch, and the * target of the branch if 
it is taken. diff --git a/cpu/o3/bpred_unit_impl.hh b/cpu/o3/bpred_unit_impl.hh index f79b67b6c..872c0c62e 100644 --- a/cpu/o3/bpred_unit_impl.hh +++ b/cpu/o3/bpred_unit_impl.hh @@ -94,6 +94,26 @@ TwobitBPredUnit::regStats() ; } +template +void +TwobitBPredUnit::switchOut() +{ + for (int i = 0; i < Impl::MaxThreads; ++i) { + predHist[i].clear(); + } +} + +template +void +TwobitBPredUnit::takeOverFrom() +{ + for (int i = 0; i < Impl::MaxThreads; ++i) + RAS[i].reset(); + + BP.reset(); + BTB.reset(); +} + template bool TwobitBPredUnit::predict(DynInstPtr &inst, Addr &PC, unsigned tid) @@ -297,5 +317,6 @@ TwobitBPredUnit::squash(const InstSeqNum &squashed_sn, BP.update(pred_hist.front().PC, actually_taken); BTB.update(pred_hist.front().PC, corr_target, tid); + pred_hist.pop_front(); } } diff --git a/cpu/o3/btb.cc b/cpu/o3/btb.cc index e084142d7..e5f69043a 100644 --- a/cpu/o3/btb.cc +++ b/cpu/o3/btb.cc @@ -58,6 +58,14 @@ DefaultBTB::DefaultBTB(unsigned _numEntries, tagShiftAmt = instShiftAmt + floorLog2(numEntries); } +void +DefaultBTB::reset() +{ + for (int i = 0; i < numEntries; ++i) { + btb[i].valid = false; + } +} + inline unsigned DefaultBTB::getIndex(const Addr &inst_PC) diff --git a/cpu/o3/btb.hh b/cpu/o3/btb.hh index aaa9945f7..b9ff42573 100644 --- a/cpu/o3/btb.hh +++ b/cpu/o3/btb.hh @@ -65,6 +65,8 @@ class DefaultBTB DefaultBTB(unsigned numEntries, unsigned tagBits, unsigned instShiftAmt); + void reset(); + /** Looks up an address in the BTB. Must call valid() first on the address. * @param inst_PC The address of the branch to look up. * @param tid The thread id. diff --git a/cpu/o3/commit.hh b/cpu/o3/commit.hh index f374b8fb7..028bd5295 100644 --- a/cpu/o3/commit.hh +++ b/cpu/o3/commit.hh @@ -175,6 +175,10 @@ class DefaultCommit /** Initializes stage by sending back the number of free entries. */ void initStage(); + void switchOut(); + + void takeOverFrom(); + /** Ticks the commit stage, which tries to commit instructions. 
*/ void tick(); @@ -351,6 +355,8 @@ class DefaultCommit /** Number of Active Threads */ unsigned numThreads; + bool switchedOut; + Tick trapLatency; Tick fetchTrapLatency; diff --git a/cpu/o3/commit_impl.hh b/cpu/o3/commit_impl.hh index 157e688c7..7834460e2 100644 --- a/cpu/o3/commit_impl.hh +++ b/cpu/o3/commit_impl.hh @@ -54,6 +54,7 @@ template void DefaultCommit::TrapEvent::process() { + // This will get reset if it was switched out. commit->trapSquash[tid] = true; } @@ -75,7 +76,8 @@ DefaultCommit::DefaultCommit(Params *params) renameWidth(params->renameWidth), iewWidth(params->executeWidth), commitWidth(params->commitWidth), - numThreads(params->numberOfThreads) + numThreads(params->numberOfThreads), + switchedOut(false) { _status = Active; _nextStatus = Inactive; @@ -254,6 +256,9 @@ DefaultCommit::setCPU(FullCPU *cpu_ptr) // Commit must broadcast the number of free entries it has at the start of // the simulation, so it starts as active. cpu->activateStage(FullCPU::CommitIdx); + + trapLatency = cpu->cycles(6); + fetchTrapLatency = cpu->cycles(12); } template @@ -360,6 +365,29 @@ DefaultCommit::initStage() cpu->activityThisCycle(); } +template +void +DefaultCommit::switchOut() +{ + rob->switchOut(); +} + +template +void +DefaultCommit::takeOverFrom() +{ + _status = Active; + _nextStatus = Inactive; + for (int i=0; i < numThreads; i++) { + commitStatus[i] = Idle; + changedROBNumEntries[i] = false; + trapSquash[i] = false; + xcSquash[i] = false; + } + squashCounter = 0; + rob->takeOverFrom(); +} + template void DefaultCommit::updateStatus() @@ -719,8 +747,9 @@ DefaultCommit::commit() while (threads != (*activeThreads).end()) { unsigned tid = *threads++; - if (fromFetch->fetchFault) { + if (fromFetch->fetchFault && commitStatus[0] != TrapPending) { // Record the fault. Wait until it's empty in the ROB. Then handle the trap. + // Ignore it if there's already a trap pending as fetch will be redirected. 
fetchFault = fromFetch->fetchFault; fetchFaultSN = fromFetch->fetchFaultSN; fetchFaultTick = curTick + fetchTrapLatency; @@ -975,6 +1004,7 @@ DefaultCommit::commitInsts() } PC[tid] = nextPC[tid]; + nextPC[tid] = nextPC[tid] + sizeof(TheISA::MachInst); #if FULL_SYSTEM int count = 0; Addr oldpc; @@ -1002,6 +1032,10 @@ DefaultCommit::commitInsts() DPRINTF(CommitRate, "%i\n", num_committed); numCommittedDist.sample(num_committed); + + if (num_committed == commitWidth) { + commit_eligible[0]++; + } } template diff --git a/cpu/o3/cpu.cc b/cpu/o3/cpu.cc index ac8c4236e..fc8372026 100644 --- a/cpu/o3/cpu.cc +++ b/cpu/o3/cpu.cc @@ -124,6 +124,7 @@ FullO3CPU::FullO3CPU(Params *params) mem(params->mem), #else // pTable(params->pTable), + mem(params->workload[0]->getMemory()), #endif // FULL_SYSTEM icacheInterface(params->icacheInterface), @@ -176,9 +177,9 @@ FullO3CPU::FullO3CPU(Params *params) numThreads = number_of_threads; #if !FULL_SYSTEM - int activeThreads = params->workload.size(); + int active_threads = params->workload.size(); #else - int activeThreads = 1; + int active_threads = 1; #endif assert(params->numPhysIntRegs >= numThreads * TheISA::NumIntRegs); @@ -192,7 +193,7 @@ FullO3CPU::FullO3CPU(Params *params) PhysRegIndex freg_idx = params->numPhysIntRegs; //Index to 1 after int regs for (int tid=0; tid < numThreads; tid++) { - bool bindRegs = (tid <= activeThreads - 1); + bool bindRegs = (tid <= active_threads - 1); commitRenameMap[tid].init(TheISA::NumIntRegs, params->numPhysIntRegs, @@ -357,7 +358,7 @@ FullO3CPU::tick() } if (activityCount && !tickEvent.scheduled()) { - tickEvent.schedule(curTick + 1); + tickEvent.schedule(curTick + cycles(1)); } #if !FULL_SYSTEM @@ -370,8 +371,8 @@ template void FullO3CPU::init() { - if (deferRegistration) { - return; + if (!deferRegistration) { + registerExecContexts(); } // Set inSyscall so that the CPU doesn't squash when initially @@ -379,7 +380,6 @@ FullO3CPU::init() for (int i = 0; i < number_of_threads; ++i) 
thread[i]->inSyscall = true; - registerExecContexts(); // Need to do a copy of the xc->regs into the CPU's regfile so // that it can start properly. @@ -388,7 +388,7 @@ FullO3CPU::init() // Need to do a copy of the xc->regs into the CPU's regfile so // that it can start properly. #if FULL_SYSTEM - ExecContext *src_xc = system->execContexts[tid]; + ExecContext *src_xc = execContexts[tid]; #else ExecContext *src_xc = thread[tid]->getXCProxy(); #endif @@ -584,7 +584,7 @@ FullO3CPU::activateContext(int tid, int delay) activeThreads.push_back(tid); } - assert(_status == Idle); + assert(_status == Idle || _status == SwitchedOut); scheduleTickEvent(delay); @@ -658,21 +658,64 @@ FullO3CPU::haltContext(int tid) template void -FullO3CPU::switchOut() +FullO3CPU::switchOut(Sampler *sampler) { - panic("FullO3CPU does not have a switch out function.\n"); +// panic("FullO3CPU does not have a switch out function.\n"); + fetch.switchOut(); + decode.switchOut(); + rename.switchOut(); + iew.switchOut(); + commit.switchOut(); + if (tickEvent.scheduled()) + tickEvent.squash(); + sampler->signalSwitched(); + _status = SwitchedOut; } template void FullO3CPU::takeOverFrom(BaseCPU *oldCPU) { + for (int i = 0; i < 6; ++i) { + timeBuffer.advance(); + fetchQueue.advance(); + decodeQueue.advance(); + renameQueue.advance(); + iewQueue.advance(); + activityBuffer.advance(); + } + + activityCount = 0; + bzero(&stageActive, sizeof(stageActive)); + BaseCPU::takeOverFrom(oldCPU); + fetch.takeOverFrom(); + decode.takeOverFrom(); + rename.takeOverFrom(); + iew.takeOverFrom(); + commit.takeOverFrom(); + assert(!tickEvent.scheduled()); + // @todo: Figure out how to properly select the tid to put onto the active threads list. 
+ int tid = 0; + + list::iterator isActive = find( + activeThreads.begin(), activeThreads.end(), tid); + + if (isActive == activeThreads.end()) { + //May Need to Re-code this if the delay variable is the + //delay needed for thread to activate + DPRINTF(FullCPU, "Adding Thread %i to active threads list\n", + tid); + + activeThreads.push_back(tid); + } + // Set all status's to active, schedule the // CPU's tick event. + // @todo: Fix up statuses so this is handled properly for (int i = 0; i < execContexts.size(); ++i) { ExecContext *xc = execContexts[i]; if (xc->status() == ExecContext::Active && _status != Running) { @@ -680,6 +723,8 @@ FullO3CPU::takeOverFrom(BaseCPU *oldCPU) tickEvent.schedule(curTick); } } + if (!tickEvent.scheduled()) + tickEvent.schedule(curTick); } template @@ -758,7 +803,8 @@ template float FullO3CPU::readArchFloatRegSingle(int reg_idx, unsigned tid) { - PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx); + int idx = reg_idx + TheISA::FP_Base_DepTag; + PhysRegIndex phys_reg = commitRenameMap[tid].lookup(idx); return regFile.readFloatRegSingle(phys_reg); } @@ -767,7 +813,8 @@ template double FullO3CPU::readArchFloatRegDouble(int reg_idx, unsigned tid) { - PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx); + int idx = reg_idx + TheISA::FP_Base_DepTag; + PhysRegIndex phys_reg = commitRenameMap[tid].lookup(idx); return regFile.readFloatRegDouble(phys_reg); } @@ -776,7 +823,8 @@ template uint64_t FullO3CPU::readArchFloatRegInt(int reg_idx, unsigned tid) { - PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx); + int idx = reg_idx + TheISA::FP_Base_DepTag; + PhysRegIndex phys_reg = commitRenameMap[tid].lookup(idx); return regFile.readFloatRegInt(phys_reg); } diff --git a/cpu/o3/cpu.hh b/cpu/o3/cpu.hh index 91eaf9d6f..621ddf541 100644 --- a/cpu/o3/cpu.hh +++ b/cpu/o3/cpu.hh @@ -82,7 +82,8 @@ class FullO3CPU : public BaseFullCPU Running, Idle, Halted, - Blocked + Blocked, + SwitchedOut }; /** Overall CPU status. 
*/ @@ -112,9 +113,9 @@ class FullO3CPU : public BaseFullCPU void scheduleTickEvent(int delay) { if (tickEvent.squashed()) - tickEvent.reschedule(curTick + delay); + tickEvent.reschedule(curTick + cycles(delay)); else if (!tickEvent.scheduled()) - tickEvent.schedule(curTick + delay); + tickEvent.schedule(curTick + cycles(delay)); } /** Unschedule tick event, regardless of its current state. */ @@ -196,7 +197,7 @@ class FullO3CPU : public BaseFullCPU /** Switches out this CPU. * @todo: Implement this. */ - void switchOut(); + void switchOut(Sampler *sampler); /** Takes over from another CPU. * @todo: Implement this. diff --git a/cpu/o3/decode.hh b/cpu/o3/decode.hh index 279ff556e..3f3f68247 100644 --- a/cpu/o3/decode.hh +++ b/cpu/o3/decode.hh @@ -107,6 +107,9 @@ class DefaultDecode /** Sets pointer to list of active threads. */ void setActiveThreads(std::list *at_ptr); + void switchOut(); + + void takeOverFrom(); /** Ticks decode, processing all input signals and decoding as many * instructions as possible. */ @@ -272,6 +275,8 @@ class DefaultDecode Stats::Scalar<> decodeUnblockCycles; /** Stat for total number of squashing cycles. */ Stats::Scalar<> decodeSquashCycles; + /** Stat for number of times a branch is resolved at decode. */ + Stats::Scalar<> decodeBranchResolved; /** Stat for number of times a branch mispredict is detected. 
*/ Stats::Scalar<> decodeBranchMispred; /** Stat for number of times decode detected a non-control instruction diff --git a/cpu/o3/decode_impl.hh b/cpu/o3/decode_impl.hh index f1aea27b4..caa97067b 100644 --- a/cpu/o3/decode_impl.hh +++ b/cpu/o3/decode_impl.hh @@ -66,40 +66,44 @@ void DefaultDecode::regStats() { decodeIdleCycles - .name(name() + ".decodeIdleCycles") + .name(name() + ".DECODE:IdleCycles") .desc("Number of cycles decode is idle") .prereq(decodeIdleCycles); decodeBlockedCycles - .name(name() + ".decodeBlockedCycles") + .name(name() + ".DECODE:BlockedCycles") .desc("Number of cycles decode is blocked") .prereq(decodeBlockedCycles); decodeRunCycles - .name(name() + ".decodeRunCycles") + .name(name() + ".DECODE:RunCycles") .desc("Number of cycles decode is running") .prereq(decodeRunCycles); decodeUnblockCycles - .name(name() + ".decodeUnblockCycles") + .name(name() + ".DECODE:UnblockCycles") .desc("Number of cycles decode is unblocking") .prereq(decodeUnblockCycles); decodeSquashCycles - .name(name() + ".decodeSquashCycles") + .name(name() + ".DECODE:SquashCycles") .desc("Number of cycles decode is squashing") .prereq(decodeSquashCycles); + decodeBranchResolved + .name(name() + ".DECODE:BranchResolved") + .desc("Number of times decode resolved a branch") + .prereq(decodeBranchResolved); decodeBranchMispred - .name(name() + ".decodeBranchMispred") + .name(name() + ".DECODE:BranchMispred") .desc("Number of times decode detected a branch misprediction") .prereq(decodeBranchMispred); decodeControlMispred - .name(name() + ".decodeControlMispred") + .name(name() + ".DECODE:ControlMispred") .desc("Number of times decode detected an instruction incorrectly" " predicted as a control") .prereq(decodeControlMispred); decodeDecodedInsts - .name(name() + ".decodeDecodedInsts") + .name(name() + ".DECODE:DecodedInsts") .desc("Number of instructions handled by decode") .prereq(decodeDecodedInsts); decodeSquashedInsts - .name(name() + ".decodeSquashedInsts") + 
.name(name() + ".DECODE:SquashedInsts") .desc("Number of squashed instructions handled by decode") .prereq(decodeSquashedInsts); } @@ -158,6 +162,33 @@ DefaultDecode::setActiveThreads(list *at_ptr) activeThreads = at_ptr; } +template +void +DefaultDecode::switchOut() +{ +} + +template +void +DefaultDecode::takeOverFrom() +{ + _status = Inactive; + + for (int i = 0; i < numThreads; ++i) { + decodeStatus[i] = Idle; + + stalls[i].rename = false; + stalls[i].iew = false; + stalls[i].commit = false; + while (!insts[i].empty()) + insts[i].pop(); + while (!skidBuffer[i].empty()) + skidBuffer[i].pop(); + branchCount[i] = 0; + } + wroteToTimeBuffer = false; +} + template bool DefaultDecode::checkStall(unsigned tid) const @@ -680,6 +711,7 @@ DefaultDecode::decodeInsts(unsigned tid) // Go ahead and compute any PC-relative branches. if (inst->isDirectCtrl() && inst->isUncondCtrl()) { + ++decodeBranchResolved; inst->setNextPC(inst->branchTarget()); if (inst->mispredicted()) { diff --git a/cpu/o3/fetch.hh b/cpu/o3/fetch.hh index f0b15cb86..6074831c6 100644 --- a/cpu/o3/fetch.hh +++ b/cpu/o3/fetch.hh @@ -35,6 +35,8 @@ #include "mem/mem_interface.hh" #include "sim/eventq.hh" +class Sampler; + /** * DefaultFetch class handles both single threaded and SMT fetch. Its width is * specified by the parameters; each cycle it tries to fetch that many @@ -81,6 +83,7 @@ class DefaultFetch Fetching, TrapPending, QuiescePending, + SwitchOut, IcacheMissStall, IcacheMissComplete }; @@ -160,6 +163,12 @@ class DefaultFetch /** Processes cache completion event. */ void processCacheCompletion(MemReqPtr &req); + void switchOut(); + + void takeOverFrom(); + + bool isSwitchedOut() { return switchedOut; } + void wakeFromQuiesce(); private: @@ -360,6 +369,8 @@ class DefaultFetch bool interruptPending; + bool switchedOut; + #if !FULL_SYSTEM /** Page table pointer. 
*/ // PageTable *pTable; @@ -382,6 +393,8 @@ class DefaultFetch */ Stats::Scalar<> fetchIdleCycles; Stats::Scalar<> fetchBlockedCycles; + + Stats::Scalar<> fetchMiscStallCycles; /** Stat for total number of fetched cache lines. */ Stats::Scalar<> fetchedCacheLines; diff --git a/cpu/o3/fetch_impl.hh b/cpu/o3/fetch_impl.hh index 563a767df..92f923c65 100644 --- a/cpu/o3/fetch_impl.hh +++ b/cpu/o3/fetch_impl.hh @@ -169,53 +169,59 @@ void DefaultFetch::regStats() { icacheStallCycles - .name(name() + ".icacheStallCycles") + .name(name() + ".FETCH:icacheStallCycles") .desc("Number of cycles fetch is stalled on an Icache miss") .prereq(icacheStallCycles); fetchedInsts - .name(name() + ".fetchedInsts") + .name(name() + ".FETCH:Insts") .desc("Number of instructions fetch has processed") .prereq(fetchedInsts); fetchedBranches - .name(name() + ".fetchedBranches") + .name(name() + ".FETCH:Branches") .desc("Number of branches that fetch encountered") .prereq(fetchedBranches); predictedBranches - .name(name() + ".predictedBranches") + .name(name() + ".FETCH:predictedBranches") .desc("Number of branches that fetch has predicted taken") .prereq(predictedBranches); fetchCycles - .name(name() + ".fetchCycles") + .name(name() + ".FETCH:Cycles") .desc("Number of cycles fetch has run and was not squashing or" " blocked") .prereq(fetchCycles); fetchSquashCycles - .name(name() + ".fetchSquashCycles") + .name(name() + ".FETCH:SquashCycles") .desc("Number of cycles fetch has spent squashing") .prereq(fetchSquashCycles); fetchIdleCycles - .name(name() + ".fetchIdleCycles") + .name(name() + ".FETCH:IdleCycles") .desc("Number of cycles fetch was idle") .prereq(fetchIdleCycles); fetchBlockedCycles - .name(name() + ".fetchBlockedCycles") + .name(name() + ".FETCH:BlockedCycles") .desc("Number of cycles fetch has spent blocked") .prereq(fetchBlockedCycles); fetchedCacheLines - .name(name() + ".fetchedCacheLines") + .name(name() + ".FETCH:CacheLines") .desc("Number of cache lines fetched") 
.prereq(fetchedCacheLines); + fetchMiscStallCycles + .name(name() + ".FETCH:MiscStallCycles") + .desc("Number of cycles fetch has spent waiting on interrupts, or " + "bad addresses, or out of MSHRs") + .prereq(fetchMiscStallCycles); + fetchIcacheSquashes - .name(name() + ".fetchIcacheSquashes") + .name(name() + ".FETCH:IcacheSquashes") .desc("Number of outstanding Icache misses that were squashed") .prereq(fetchIcacheSquashes); @@ -223,24 +229,24 @@ DefaultFetch::regStats() .init(/* base value */ 0, /* last value */ fetchWidth, /* bucket size */ 1) - .name(name() + ".rateDist") + .name(name() + ".FETCH:rateDist") .desc("Number of instructions fetched each cycle (Total)") .flags(Stats::pdf); idleRate - .name(name() + ".idleRate") + .name(name() + ".FETCH:idleRate") .desc("Percent of cycles fetch was idle") .prereq(idleRate); idleRate = fetchIdleCycles * 100 / cpu->numCycles; branchRate - .name(name() + ".branchRate") + .name(name() + ".FETCH:branchRate") .desc("Number of branch fetches per cycle") .flags(Stats::total); branchRate = predictedBranches / cpu->numCycles; fetchRate - .name(name() + ".rate") + .name(name() + ".FETCH:rate") .desc("Number of inst fetches per cycle") .flags(Stats::total); fetchRate = fetchedInsts / cpu->numCycles; @@ -332,7 +338,8 @@ DefaultFetch::processCacheCompletion(MemReqPtr &req) // Can keep track of how many cache accesses go unused due to // misspeculation here. 
if (fetchStatus[tid] != IcacheMissStall || - req != memReq[tid]) { + req != memReq[tid] || + isSwitchedOut()) { ++fetchIcacheSquashes; return; } @@ -360,6 +367,35 @@ DefaultFetch::processCacheCompletion(MemReqPtr &req) // memReq[tid]->completionEvent = NULL; } +template +void +DefaultFetch::switchOut() +{ + switchedOut = true; + branchPred.switchOut(); +} + +template +void +DefaultFetch::takeOverFrom() +{ + // Reset all state + for (int i = 0; i < Impl::MaxThreads; ++i) { + stalls[i].decode = 0; + stalls[i].rename = 0; + stalls[i].iew = 0; + stalls[i].commit = 0; + PC[i] = cpu->readPC(i); + nextPC[i] = cpu->readNextPC(i); + fetchStatus[i] = Running; + } + numInst = 0; + wroteToTimeBuffer = false; + _status = Inactive; + switchedOut = false; + branchPred.takeOverFrom(); +} + template void DefaultFetch::wakeFromQuiesce() @@ -902,8 +938,10 @@ DefaultFetch::fetch(bool &status_change) tid, fetch_PC); bool fetch_success = fetchCacheLine(fetch_PC, fault, tid); - if (!fetch_success) + if (!fetch_success) { + ++fetchMiscStallCycles; return; + } } else { if (fetchStatus[tid] == Idle) { ++fetchIdleCycles; diff --git a/cpu/o3/fu_pool.cc b/cpu/o3/fu_pool.cc index 9b6ac15d9..cb7a15061 100644 --- a/cpu/o3/fu_pool.cc +++ b/cpu/o3/fu_pool.cc @@ -242,6 +242,20 @@ FUPool::dump() } } +void +FUPool::switchOut() +{ +} + +void +FUPool::takeOverFrom() +{ + for (int i = 0; i < numFU; i++) { + unitBusy[i] = false; + } + unitsToBeFreed.clear(); +} + // //////////////////////////////////////////////////////////////////////////// diff --git a/cpu/o3/fu_pool.hh b/cpu/o3/fu_pool.hh index d7b7acadb..7df5ad5f3 100644 --- a/cpu/o3/fu_pool.hh +++ b/cpu/o3/fu_pool.hh @@ -154,6 +154,9 @@ class FUPool : public SimObject unsigned getIssueLatency(OpClass capability) { return maxIssueLatencies[capability]; } + + void switchOut(); + void takeOverFrom(); }; #endif // __CPU_O3_FU_POOL_HH__ diff --git a/cpu/o3/iew.hh b/cpu/o3/iew.hh index 58cd68b21..ae0ba6a21 100644 --- a/cpu/o3/iew.hh +++ b/cpu/o3/iew.hh @@ 
-157,6 +157,12 @@ class DefaultIEW /** Sets pointer to the scoreboard. */ void setScoreboard(Scoreboard *sb_ptr); + void switchOut(); + + void takeOverFrom(); + + bool isSwitchedOut() { return switchedOut; } + /** Sets page table pointer within LSQ. */ // void setPageTable(PageTable *pt_ptr); @@ -420,6 +426,8 @@ class DefaultIEW /** Maximum size of the skid buffer. */ unsigned skidBufferMax; + bool switchedOut; + /** Stat for total number of idle cycles. */ Stats::Scalar<> iewIdleCycles; /** Stat for total number of squashing cycles. */ diff --git a/cpu/o3/iew_impl.hh b/cpu/o3/iew_impl.hh index 2ae2e1361..42d83ee72 100644 --- a/cpu/o3/iew_impl.hh +++ b/cpu/o3/iew_impl.hh @@ -55,13 +55,13 @@ DefaultIEW::LdWritebackEvent::process() //iewStage->ldstQueue.removeMSHR(inst->threadNumber,inst->seqNum); - iewStage->wakeCPU(); - - if (inst->isSquashed()) { + if (inst->isSquashed() || iewStage->isSwitchedOut()) { inst = NULL; return; } + iewStage->wakeCPU(); + if (!inst->isExecuted()) { inst->setExecuted(); @@ -101,7 +101,8 @@ DefaultIEW::DefaultIEW(Params *params) issueReadWidth(params->issueWidth), issueWidth(params->issueWidth), executeWidth(params->executeWidth), - numThreads(params->numberOfThreads) + numThreads(params->numberOfThreads), + switchedOut(false) { DPRINTF(IEW, "executeIntWidth: %i.\n", params->executeIntWidth); _status = Active; @@ -436,6 +437,53 @@ DefaultIEW::setPageTable(PageTable *pt_ptr) } #endif +template +void +DefaultIEW::switchOut() +{ + switchedOut = true; + instQueue.switchOut(); + ldstQueue.switchOut(); + fuPool->switchOut(); + + for (int i = 0; i < numThreads; i++) { + while (!insts[i].empty()) + insts[i].pop(); + while (!skidBuffer[i].empty()) + skidBuffer[i].pop(); + } +} + +template +void +DefaultIEW::takeOverFrom() +{ + _status = Active; + exeStatus = Running; + wbStatus = Idle; + switchedOut = false; + + instQueue.takeOverFrom(); + ldstQueue.takeOverFrom(); + fuPool->takeOverFrom(); + + initStage(); + cpu->activityThisCycle(); + + for (int 
i=0; i < numThreads; i++) { + dispatchStatus[i] = Running; + stalls[i].commit = false; + fetchRedirect[i] = false; + } + + updateLSQNextCycle = false; + + // @todo: Fix hardcoded number + for (int i = 0; i < 6; ++i) { + issueToExecQueue.advance(); + } +} + template void DefaultIEW::squash(unsigned tid) diff --git a/cpu/o3/inst_queue.hh b/cpu/o3/inst_queue.hh index 06d9937f2..982294b4f 100644 --- a/cpu/o3/inst_queue.hh +++ b/cpu/o3/inst_queue.hh @@ -112,6 +112,10 @@ class InstructionQueue /** Registers statistics. */ void regStats(); + void resetState(); + + void resetDependencyGraph(); + /** Sets CPU pointer. */ void setCPU(FullCPU *_cpu) { cpu = _cpu; } @@ -127,6 +131,12 @@ class InstructionQueue /** Sets the global time buffer. */ void setTimeBuffer(TimeBuffer *tb_ptr); + void switchOut(); + + void takeOverFrom(); + + bool isSwitchedOut() { return switchedOut; } + /** Number of entries needed for given amount of threads. */ int entryAmount(int num_threads); @@ -385,6 +395,8 @@ class InstructionQueue */ unsigned commitToIEWDelay; + bool switchedOut; + ////////////////////////////////// // Variables needed for squashing ////////////////////////////////// @@ -507,7 +519,7 @@ class InstructionQueue Stats::Scalar<> iqSquashedNonSpecRemoved; Stats::VectorDistribution<> queue_res_dist; - Stats::Vector<> n_issued_dist; + Stats::Distribution<> n_issued_dist; Stats::VectorDistribution<> issue_delay_dist; Stats::Vector<> stat_fu_busy; diff --git a/cpu/o3/inst_queue_impl.hh b/cpu/o3/inst_queue_impl.hh index 804bc2472..0d9cc09f3 100644 --- a/cpu/o3/inst_queue_impl.hh +++ b/cpu/o3/inst_queue_impl.hh @@ -82,16 +82,10 @@ InstructionQueue::InstructionQueue(Params *params) { assert(fuPool); + switchedOut = false; + numThreads = params->numberOfThreads; - //Initialize thread IQ counts - for (int i = 0; i ::InstructionQueue(Params *params) //dependency graph. 
dependGraph = new DependencyEntry[numPhysRegs]; + // Initialize all the head pointers to point to NULL, and all the + // entries as unready. + for (int i = 0; i < numPhysRegs; ++i) { + dependGraph[i].next = NULL; + dependGraph[i].inst = NULL; + } + // Resize the register scoreboard. regScoreboard.resize(numPhysRegs); @@ -110,27 +111,7 @@ InstructionQueue::InstructionQueue(Params *params) memDepUnit[i].setIQ(this); } - // Initialize all the head pointers to point to NULL, and all the - // entries as unready. - // Note that in actuality, the registers corresponding to the logical - // registers start off as ready. However this doesn't matter for the - // IQ as the instruction should have been correctly told if those - // registers are ready in rename. Thus it can all be initialized as - // unready. - for (int i = 0; i < numPhysRegs; ++i) { - dependGraph[i].next = NULL; - dependGraph[i].inst = NULL; - regScoreboard[i] = false; - } - - for (int i = 0; i < numThreads; ++i) { - squashedSeqNum[i] = 0; - } - - for (int i = 0; i < Num_OpClasses; ++i) { - queueOnList[i] = false; - readyIt[i] = listOrder.end(); - } + resetState(); string policy = params->smtIQPolicy; @@ -184,30 +165,7 @@ InstructionQueue::InstructionQueue(Params *params) template InstructionQueue::~InstructionQueue() { - // Clear the dependency graph - DependencyEntry *curr; - DependencyEntry *prev; - - for (int i = 0; i < numPhysRegs; ++i) { - curr = dependGraph[i].next; - - while (curr) { - DependencyEntry::mem_alloc_counter--; - - prev = curr; - curr = prev->next; - prev->inst = NULL; - - delete prev; - } - - if (dependGraph[i].inst) { - dependGraph[i].inst = NULL; - } - - dependGraph[i].next = NULL; - } - + resetDependencyGraph(); assert(DependencyEntry::mem_alloc_counter == 0); delete [] dependGraph; @@ -307,10 +265,10 @@ InstructionQueue::regStats() queue_res_dist.subname(i, opClassStrings[i]); } n_issued_dist - .init(totalWidth + 1) + .init(0,totalWidth,1) .name(name() + ".ISSUE:issued_per_cycle") 
.desc("Number of insts issued each cycle") - .flags(total | pdf | dist) + .flags(pdf) ; /* dist_unissued @@ -400,6 +358,71 @@ InstructionQueue::regStats() } } +template +void +InstructionQueue::resetState() +{ + //Initialize thread IQ counts + for (int i = 0; i +void +InstructionQueue::resetDependencyGraph() +{ + // Clear the dependency graph + DependencyEntry *curr; + DependencyEntry *prev; + + for (int i = 0; i < numPhysRegs; ++i) { + curr = dependGraph[i].next; + + while (curr) { + DependencyEntry::mem_alloc_counter--; + + prev = curr; + curr = prev->next; + prev->inst = NULL; + + delete prev; + } + + if (dependGraph[i].inst) { + dependGraph[i].inst = NULL; + } + + dependGraph[i].next = NULL; + } +} + template void InstructionQueue::setActiveThreads(list *at_ptr) @@ -426,6 +449,25 @@ InstructionQueue::setTimeBuffer(TimeBuffer *tb_ptr) fromCommit = timeBuffer->getWire(-commitToIEWDelay); } +template +void +InstructionQueue::switchOut() +{ + resetState(); + resetDependencyGraph(); + switchedOut = true; + for (int i = 0; i < numThreads; ++i) { + memDepUnit[i].switchOut(); + } +} + +template +void +InstructionQueue::takeOverFrom() +{ + switchedOut = false; +} + template int InstructionQueue::entryAmount(int num_threads) @@ -685,6 +727,10 @@ InstructionQueue::processFUCompletion(DynInstPtr &inst, int fu_idx) { // The CPU could have been sleeping until this op completed (*extremely* // long latency op). Wake it if it was. This may be overkill. 
+ if (isSwitchedOut()) { + return; + } + iewStage->wakeCPU(); fuPool->freeUnit(fu_idx); @@ -816,7 +862,7 @@ InstructionQueue::scheduleReadyInsts() FUCompletion *execution = new FUCompletion(issuing_inst, idx, this); - execution->schedule(curTick + issue_latency - 1); + execution->schedule(curTick + cpu->cycles(issue_latency - 1)); } else { i2e_info->insts[exec_queue_slot++] = issuing_inst; i2e_info->size++; @@ -862,6 +908,8 @@ InstructionQueue::scheduleReadyInsts() } } + n_issued_dist.sample(total_issued); + if (total_issued) { cpu->activityThisCycle(); } else { diff --git a/cpu/o3/lsq.hh b/cpu/o3/lsq.hh index c59b5f13b..d5f893e57 100644 --- a/cpu/o3/lsq.hh +++ b/cpu/o3/lsq.hh @@ -71,6 +71,9 @@ class LSQ { /** Sets the page table pointer. */ // void setPageTable(PageTable *pt_ptr); + void switchOut(); + void takeOverFrom(); + /** Number of entries needed for the given amount of threads.*/ int entryAmount(int num_threads); void removeEntries(unsigned tid); @@ -271,15 +274,6 @@ class LSQ { /** Max SQ Size - Used to Enforce Sharing Policies. */ unsigned maxSQEntries; - /** Global Load Count. */ - int loads; - - /** Global Store Count */ - int stores; - - /** Global Store To WB Count */ - int storesToWB; - /** Number of Threads. 
*/ unsigned numThreads; }; diff --git a/cpu/o3/lsq_impl.hh b/cpu/o3/lsq_impl.hh index 523517869..c43c19619 100644 --- a/cpu/o3/lsq_impl.hh +++ b/cpu/o3/lsq_impl.hh @@ -33,7 +33,6 @@ using namespace std; template LSQ::LSQ(Params *params) : LQEntries(params->LQEntries), SQEntries(params->SQEntries), - loads(0), stores(0), storesToWB(0), numThreads(params->numberOfThreads) { DPRINTF(LSQ, "Creating LSQ object.\n"); @@ -143,6 +142,24 @@ LSQ::setPageTable(PageTable *pt_ptr) } #endif +template +void +LSQ::switchOut() +{ + for (int tid = 0; tid < numThreads; tid++) { + thread[tid].switchOut(); + } +} + +template +void +LSQ::takeOverFrom() +{ + for (int tid = 0; tid < numThreads; tid++) { + thread[tid].takeOverFrom(); + } +} + template int LSQ::entryAmount(int num_threads) diff --git a/cpu/o3/lsq_unit.hh b/cpu/o3/lsq_unit.hh index ba8b1d2e2..d17efe96a 100644 --- a/cpu/o3/lsq_unit.hh +++ b/cpu/o3/lsq_unit.hh @@ -38,6 +38,7 @@ #include "cpu/inst_seq.hh" #include "mem/mem_interface.hh" //#include "mem/page_table.hh" +#include "sim/debug.hh" #include "sim/sim_object.hh" #include "arch/faults.hh" @@ -110,6 +111,12 @@ class LSQUnit { /** Sets the page table pointer. */ // void setPageTable(PageTable *pt_ptr); + void switchOut(); + + void takeOverFrom(); + + bool isSwitchedOut() { return switchedOut; } + /** Ticks the LSQ unit, which in this case only resets the number of * used cache ports. * @todo: Move the number of used ports up to the LSQ level so it can @@ -278,20 +285,20 @@ class LSQUnit { /** Whether or not the store is completed. */ bool completed; }; - +/* enum Status { Running, Idle, DcacheMissStall, DcacheMissSwitch }; - +*/ private: /** The LSQUnit thread id. */ unsigned lsqID; /** The status of the LSQ unit. */ - Status _status; +// Status _status; /** The store queue. */ std::vector storeQueue; @@ -335,6 +342,8 @@ class LSQUnit { /** The number of used cache ports in this cycle. 
*/ int usedPorts; + bool switchedOut; + //list mshrSeqNums; //Stats::Scalar<> dcacheStallCycles; @@ -373,7 +382,25 @@ class LSQUnit { // Will also need how many read/write ports the Dcache has. Or keep track // of that in stage that is one level up, and only call executeLoad/Store // the appropriate number of times. +/* + // total number of loads forwaded from LSQ stores + Stats::Vector<> lsq_forw_loads; + // total number of loads ignored due to invalid addresses + Stats::Vector<> inv_addr_loads; + + // total number of software prefetches ignored due to invalid addresses + Stats::Vector<> inv_addr_swpfs; + + // total non-speculative bogus addresses seen (debug var) + Counter sim_invalid_addrs; + Stats::Vector<> fu_busy; //cumulative fu busy + + // ready loads blocked due to memory disambiguation + Stats::Vector<> lsq_blocked_loads; + + Stats::Scalar<> lsqInversion; +*/ public: /** Executes the load at the given index. */ template @@ -590,7 +617,12 @@ LSQUnit::read(MemReqPtr &req, T &data, int load_idx) } DPRINTF(LSQUnit, "Doing timing access for inst PC %#x\n", loadQueue[load_idx]->readPC()); - +/* + Addr debug_addr = ULL(0xfffffc0000be81a8); + if (req->vaddr == debug_addr) { + debug_break(); + } +*/ assert(!req->completionEvent); req->completionEvent = new typename IEW::LdWritebackEvent(loadQueue[load_idx], iewStage); @@ -608,7 +640,7 @@ LSQUnit::read(MemReqPtr &req, T &data, int load_idx) lastDcacheStall = curTick; - _status = DcacheMissStall; +// _status = DcacheMissStall; } else { DPRINTF(Activity, "Activity: ld accessing mem hit [sn:%lli]\n", @@ -694,7 +726,12 @@ LSQUnit::write(MemReqPtr &req, T &data, int store_idx) storeQueue[store_idx].req = req; storeQueue[store_idx].size = sizeof(T); storeQueue[store_idx].data = data; - +/* + Addr debug_addr = ULL(0xfffffc0000be81a8); + if (req->vaddr == debug_addr) { + debug_break(); + } +*/ // This function only writes the data to the store queue, so no fault // can happen here. 
return NoFault; diff --git a/cpu/o3/lsq_unit_impl.hh b/cpu/o3/lsq_unit_impl.hh index d9a118b0e..c5ce34c70 100644 --- a/cpu/o3/lsq_unit_impl.hh +++ b/cpu/o3/lsq_unit_impl.hh @@ -50,6 +50,9 @@ LSQUnit::StoreCompletionEvent::process() //lsqPtr->removeMSHR(lsqPtr->storeQueue[storeIdx].inst->seqNum); + if (lsqPtr->isSwitchedOut()) + return; + lsqPtr->cpu->wakeCPU(); if (wbEvent) wbEvent->process(); @@ -78,6 +81,8 @@ LSQUnit::init(Params *params, unsigned maxLQEntries, { DPRINTF(LSQUnit, "Creating LSQUnit%i object.\n",id); + switchedOut = false; + lsqID = id; LQEntries = maxLQEntries; @@ -138,6 +143,89 @@ LSQUnit::setPageTable(PageTable *pt_ptr) } #endif +template +void +LSQUnit::switchOut() +{ + switchedOut = true; + for (int i = 0; i < loadQueue.size(); ++i) + loadQueue[i] = NULL; + + while (storesToWB > 0 && + storeWBIdx != storeTail && + storeQueue[storeWBIdx].inst && + storeQueue[storeWBIdx].canWB) { + + if (storeQueue[storeWBIdx].size == 0 || + storeQueue[storeWBIdx].inst->isDataPrefetch() || + storeQueue[storeWBIdx].committed || + storeQueue[storeWBIdx].req->flags & LOCKED) { + incrStIdx(storeWBIdx); + + continue; + } + + assert(storeQueue[storeWBIdx].req); + assert(!storeQueue[storeWBIdx].committed); + + MemReqPtr req = storeQueue[storeWBIdx].req; + storeQueue[storeWBIdx].committed = true; + + req->cmd = Write; + req->completionEvent = NULL; + req->time = curTick; + assert(!req->data); + req->data = new uint8_t[64]; + memcpy(req->data, (uint8_t *)&storeQueue[storeWBIdx].data, req->size); + + DPRINTF(LSQUnit, "D-Cache: Writing back store idx:%i PC:%#x " + "to Addr:%#x, data:%#x [sn:%lli]\n", + storeWBIdx,storeQueue[storeWBIdx].inst->readPC(), + req->paddr, *(req->data), + storeQueue[storeWBIdx].inst->seqNum); + + switch(storeQueue[storeWBIdx].size) { + case 1: + cpu->write(req, (uint8_t &)storeQueue[storeWBIdx].data); + break; + case 2: + cpu->write(req, (uint16_t &)storeQueue[storeWBIdx].data); + break; + case 4: + cpu->write(req, (uint32_t 
&)storeQueue[storeWBIdx].data); + break; + case 8: + cpu->write(req, (uint64_t &)storeQueue[storeWBIdx].data); + break; + default: + panic("Unexpected store size!\n"); + } + incrStIdx(storeWBIdx); + } +} + +template +void +LSQUnit::takeOverFrom() +{ + switchedOut = false; + loads = stores = storesToWB = 0; + + loadHead = loadTail = 0; + + storeHead = storeWBIdx = storeTail = 0; + + usedPorts = 0; + + loadFaultInst = storeFaultInst = memDepViolator = NULL; + + blockedLoadSeqNum = 0; + + stalled = false; + isLoadBlocked = false; + loadBlockedHandled = false; +} + template void LSQUnit::resizeLQ(unsigned size) @@ -647,7 +735,7 @@ LSQUnit::writebackStores() lastDcacheStall = curTick; - _status = DcacheMissStall; +// _status = DcacheMissStall; //mshrSeqNums.push_back(storeQueue[storeWBIdx].inst->seqNum); diff --git a/cpu/o3/mem_dep_unit.hh b/cpu/o3/mem_dep_unit.hh index 32ce9f768..141e0fdc4 100644 --- a/cpu/o3/mem_dep_unit.hh +++ b/cpu/o3/mem_dep_unit.hh @@ -84,6 +84,10 @@ class MemDepUnit { /** Registers statistics. */ void regStats(); + void switchOut(); + + void takeOverFrom(); + /** Sets the pointer to the IQ. 
*/ void setIQ(InstructionQueue *iq_ptr); diff --git a/cpu/o3/mem_dep_unit_impl.hh b/cpu/o3/mem_dep_unit_impl.hh index 771a0505e..05a33685d 100644 --- a/cpu/o3/mem_dep_unit_impl.hh +++ b/cpu/o3/mem_dep_unit_impl.hh @@ -101,6 +101,26 @@ MemDepUnit::regStats() .desc("Number of conflicting stores."); } +template +void +MemDepUnit::switchOut() +{ + for (int i = 0; i < Impl::MaxThreads; ++i) { + instList[i].clear(); + } + instsToReplay.clear(); + memDepHash.clear(); +} + +template +void +MemDepUnit::takeOverFrom() +{ + loadBarrier = storeBarrier = false; + loadBarrierSN = storeBarrierSN = 0; + depPred.clear(); +} + template void MemDepUnit::setIQ(InstructionQueue *iq_ptr) diff --git a/cpu/o3/ras.cc b/cpu/o3/ras.cc index 5e7ef38ae..0b3ea4918 100644 --- a/cpu/o3/ras.cc +++ b/cpu/o3/ras.cc @@ -41,6 +41,15 @@ ReturnAddrStack::init(unsigned _numEntries) addrStack[i] = 0; } +void +ReturnAddrStack::reset() +{ + usedEntries = 0; + tos = 0; + for (int i = 0; i < numEntries; ++i) + addrStack[i] = 0; +} + void ReturnAddrStack::push(const Addr &return_addr) { diff --git a/cpu/o3/ras.hh b/cpu/o3/ras.hh index 5aa4fc05f..27e7c2df4 100644 --- a/cpu/o3/ras.hh +++ b/cpu/o3/ras.hh @@ -47,6 +47,8 @@ class ReturnAddrStack */ void init(unsigned numEntries); + void reset(); + /** Returns the top address on the RAS. */ Addr top() { return addrStack[tos]; } diff --git a/cpu/o3/rename.hh b/cpu/o3/rename.hh index c6f8f97aa..4c5c46356 100644 --- a/cpu/o3/rename.hh +++ b/cpu/o3/rename.hh @@ -153,6 +153,10 @@ class DefaultRename /** Sets pointer to the scoreboard. */ void setScoreboard(Scoreboard *_scoreboard); + void switchOut(); + + void takeOverFrom(); + /** Squashes all instructions in a thread. 
*/ void squash(unsigned tid); @@ -448,6 +452,7 @@ class DefaultRename Stats::Scalar<> renameUndoneMaps; Stats::Scalar<> renamedSerializing; Stats::Scalar<> renamedTempSerializing; + Stats::Scalar<> renameSkidInsts; }; #endif // __CPU_O3_RENAME_HH__ diff --git a/cpu/o3/rename_impl.hh b/cpu/o3/rename_impl.hh index e29211921..d41058deb 100644 --- a/cpu/o3/rename_impl.hh +++ b/cpu/o3/rename_impl.hh @@ -151,6 +151,11 @@ DefaultRename::regStats() .desc("count of temporary serializing insts renamed") .flags(Stats::total) ; + renameSkidInsts + .name(name() + ".RENAME:skidInsts") + .desc("count of insts added to the skid buffer") + .flags(Stats::total) + ; } template @@ -213,8 +218,8 @@ DefaultRename::initStage() // Clear these pointers so they are not accidentally used in // non-initialization code. - iew_ptr = NULL; - commit_ptr = NULL; +// iew_ptr = NULL; +// commit_ptr = NULL; } template @@ -253,6 +258,55 @@ DefaultRename::setScoreboard(Scoreboard *_scoreboard) scoreboard = _scoreboard; } +template +void +DefaultRename::switchOut() +{ + for (int i = 0; i < numThreads; i++) { + typename list::iterator hb_it = historyBuffer[i].begin(); + + while (!historyBuffer[i].empty()) { + assert(hb_it != historyBuffer[i].end()); + + DPRINTF(Rename, "[tid:%u]: Removing history entry with sequence " + "number %i.\n", i, (*hb_it).instSeqNum); + + // Tell the rename map to set the architected register to the + // previous physical register that it was renamed to. + renameMap[i]->setEntry(hb_it->archReg, hb_it->prevPhysReg); + + // Put the renamed physical register back on the free list. 
+ freeList->addReg(hb_it->newPhysReg); + + historyBuffer[i].erase(hb_it++); + } + insts[i].clear(); + skidBuffer[i].clear(); + } +} + +template +void +DefaultRename::takeOverFrom() +{ + _status = Inactive; + initStage(); + + for (int i=0; i< numThreads; i++) { + renameStatus[i] = Idle; + + stalls[i].iew = false; + stalls[i].commit = false; + serializeInst[i] = NULL; + + instsInProgress[i] = 0; + + emptyROB[i] = true; + + serializeOnNextInst[i] = false; + } +} + template void DefaultRename::squash(unsigned tid) @@ -393,7 +447,7 @@ DefaultRename::rename(bool &status_change, unsigned tid) } else if (renameStatus[tid] == Unblocking) { renameInsts(tid); - ++renameUnblockCycles; +// ++renameUnblockCycles; if (validInsts()) { // Add the current inputs to the skid buffer so they can be @@ -564,6 +618,8 @@ DefaultRename::renameInsts(unsigned tid) } else if (inst->isSerializeAfter() && !inst->isSerializeHandled()) { DPRINTF(Rename, "Serialize after instruction encountered.\n"); + renamedSerializing++; + inst->setSerializeHandled(); serializeAfter(insts_to_rename, tid); @@ -594,13 +650,12 @@ DefaultRename::renameInsts(unsigned tid) // Increment which instruction we're on. ++toIEWIndex; - ++renameRenamedInsts; - // Decrement how many instructions are available. --insts_available; } instsInProgress[tid] += renamed_insts; + renameRenamedInsts += renamed_insts; // If we wrote to the time buffer, record this. if (toIEWIndex) { @@ -635,6 +690,8 @@ DefaultRename::skidInsert(unsigned tid) DPRINTF(Rename, "[tid:%u]: Inserting [sn:%lli] PC:%#x into Rename " "skidBuffer\n", tid, inst->seqNum, inst->readPC()); + ++renameSkidInsts; + skidBuffer[tid].push_back(inst); } diff --git a/cpu/o3/rob.hh b/cpu/o3/rob.hh index 48199915f..0748850ea 100644 --- a/cpu/o3/rob.hh +++ b/cpu/o3/rob.hh @@ -97,6 +97,10 @@ class ROB */ void setActiveThreads(std::list* at_ptr); + void switchOut(); + + void takeOverFrom(); + /** Function to insert an instruction into the ROB. 
Note that whatever * calls this function must ensure that there is enough space within the * ROB for the new instruction. diff --git a/cpu/o3/rob_impl.hh b/cpu/o3/rob_impl.hh index 96d907cda..02a4bfbee 100644 --- a/cpu/o3/rob_impl.hh +++ b/cpu/o3/rob_impl.hh @@ -121,6 +121,31 @@ ROB::setActiveThreads(list *at_ptr) activeThreads = at_ptr; } +template +void +ROB::switchOut() +{ + for (int tid = 0; tid < numThreads; tid++) { + instList[tid].clear(); + } +} + +template +void +ROB::takeOverFrom() +{ + for (int tid=0; tid < numThreads; tid++) { + doneSquashing[tid] = true; + threadEntries[tid] = 0; + squashIt[tid] = instList[tid].end(); + } + numInstsInROB = 0; + + // Initialize the "universal" ROB head & tail point to invalid + // pointers + head = instList[0].end(); + tail = instList[0].end(); +} template void diff --git a/cpu/o3/sat_counter.cc b/cpu/o3/sat_counter.cc index a6e131483..b481b4ad2 100644 --- a/cpu/o3/sat_counter.cc +++ b/cpu/o3/sat_counter.cc @@ -30,17 +30,17 @@ #include "cpu/o3/sat_counter.hh" SatCounter::SatCounter() - : maxVal(0), counter(0) + : initialVal(0), counter(0) { } SatCounter::SatCounter(unsigned bits) - : maxVal((1 << bits) - 1), counter(0) + : initialVal(0), maxVal((1 << bits) - 1), counter(0) { } -SatCounter::SatCounter(unsigned bits, unsigned initial_val) - : maxVal((1 << bits) - 1), counter(initial_val) +SatCounter::SatCounter(unsigned bits, uint8_t initial_val) + : initialVal(initialVal), maxVal((1 << bits) - 1), counter(initial_val) { // Check to make sure initial value doesn't exceed the max counter value. 
if (initial_val > maxVal) { @@ -53,19 +53,3 @@ SatCounter::setBits(unsigned bits) { maxVal = (1 << bits) - 1; } - -void -SatCounter::increment() -{ - if (counter < maxVal) { - ++counter; - } -} - -void -SatCounter::decrement() -{ - if (counter > 0) { - --counter; - } -} diff --git a/cpu/o3/sat_counter.hh b/cpu/o3/sat_counter.hh index 952f1f86d..1d20a8a8f 100644 --- a/cpu/o3/sat_counter.hh +++ b/cpu/o3/sat_counter.hh @@ -57,22 +57,34 @@ class SatCounter * @param bits How many bits the counter will have. * @param initial_val Starting value for each counter. */ - SatCounter(unsigned bits, unsigned initial_val); + SatCounter(unsigned bits, uint8_t initial_val); /** * Sets the number of bits. */ void setBits(unsigned bits); + void reset() { counter = initialVal; } + /** * Increments the counter's current value. */ - void increment(); + void increment() + { + if (counter < maxVal) { + ++counter; + } + } /** * Decrements the counter's current value. */ - void decrement(); + void decrement() + { + if (counter > 0) { + --counter; + } + } /** * Read the counter's value. @@ -81,6 +93,7 @@ class SatCounter { return counter; } private: + uint8_t initialVal; uint8_t maxVal; uint8_t counter; }; diff --git a/cpu/o3/thread_state.hh b/cpu/o3/thread_state.hh index 846f44176..17719bdeb 100644 --- a/cpu/o3/thread_state.hh +++ b/cpu/o3/thread_state.hh @@ -60,7 +60,7 @@ struct O3ThreadState : public ThreadState { { } #else O3ThreadState(FullCPU *_cpu, int _thread_num, Process *_process, int _asid) - : ThreadState(-1, _thread_num, NULL, _process, _asid), + : ThreadState(-1, _thread_num, _process->getMemory(), _process, _asid), cpu(_cpu), inSyscall(0), trapPending(0) { } From 9a96ebf368cace048654186ae1ff8b4fb6672bb7 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Thu, 11 May 2006 14:12:34 -0400 Subject: [PATCH 20/50] Separate out result being ready and the instruction being complete. 
--HG-- extra : convert_revision : 9f17af114bf639f8fb61896e49fa714932c081d7 --- cpu/base_dyn_inst.cc | 1 + cpu/base_dyn_inst.hh | 41 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/cpu/base_dyn_inst.cc b/cpu/base_dyn_inst.cc index 6ce9b4455..7ab760ae3 100644 --- a/cpu/base_dyn_inst.cc +++ b/cpu/base_dyn_inst.cc @@ -101,6 +101,7 @@ BaseDynInst::initVars() readyRegs = 0; completed = false; + resultReady = false; canIssue = false; issued = false; executed = false; diff --git a/cpu/base_dyn_inst.hh b/cpu/base_dyn_inst.hh index ecad6ad64..18978142d 100644 --- a/cpu/base_dyn_inst.hh +++ b/cpu/base_dyn_inst.hh @@ -117,6 +117,11 @@ class BaseDynInst : public FastAlloc, public RefCounted Fault write(T data, Addr addr, unsigned flags, uint64_t *res); + // @todo: Probably should not have this function in the DynInst. + template + bool snoop(MemReqPtr &req, T &data) + { return cpu->snoop(req, data); } + void prefetch(Addr addr, unsigned flags); void writeHint(Addr addr, int size, unsigned flags); Fault copySrcTranslate(Addr src); @@ -139,6 +144,9 @@ class BaseDynInst : public FastAlloc, public RefCounted /** Is the instruction completed. */ bool completed; + /** Is the instruction's result ready. */ + bool resultReady; + /** Can this instruction issue. */ bool canIssue; @@ -187,7 +195,7 @@ class BaseDynInst : public FastAlloc, public RefCounted /** Pointer to the FullCPU object. */ FullCPU *cpu; - /** Pointer to the exec context. Will not exist in the final version. */ + /** Pointer to the exec context. */ ImplState *thread; /** The kind of fault this instruction has generated. 
*/ @@ -353,6 +361,7 @@ class BaseDynInst : public FastAlloc, public RefCounted bool isWriteBarrier() const { return staticInst->isWriteBarrier(); } bool isNonSpeculative() const { return staticInst->isNonSpeculative(); } bool isQuiesce() const { return staticInst->isQuiesce(); } + bool isUnverifiable() const { return staticInst->isUnverifiable(); } /** Temporarily sets this instruction as a serialize before instruction. */ void setSerializeBefore() { serializeBefore = true; } @@ -423,6 +432,26 @@ class BaseDynInst : public FastAlloc, public RefCounted /** Returns the result of a floating point (double) instruction. */ double readDoubleResult() { return instResult.dbl; } + void setIntReg(const StaticInst *si, int idx, uint64_t val) + { + instResult.integer = val; + } + + void setFloatRegSingle(const StaticInst *si, int idx, float val) + { + instResult.fp = val; + } + + void setFloatRegDouble(const StaticInst *si, int idx, double val) + { + instResult.dbl = val; + } + + void setFloatRegInt(const StaticInst *si, int idx, uint64_t val) + { + instResult.integer = val; + } + //Push to .cc file. /** Records that one of the source registers is ready. */ void markSrcRegReady(); @@ -444,6 +473,10 @@ class BaseDynInst : public FastAlloc, public RefCounted /** Returns whether or not this instruction is completed. */ bool isCompleted() const { return completed; } + void setResultReady() { resultReady = true; } + + bool isResultReady() const { return resultReady; } + /** Sets this instruction as ready to issue. */ void setCanIssue() { canIssue = true; } @@ -540,7 +573,11 @@ class BaseDynInst : public FastAlloc, public RefCounted const Addr readPC() const { return PC; } /** Set the next PC of this instruction (its actual target). 
*/ - void setNextPC(uint64_t val) { nextPC = val; } + void setNextPC(uint64_t val) + { + nextPC = val; +// instResult.integer = val; + } void setASID(short addr_space_id) { asid = addr_space_id; } From 92838fd35e4ffc00cc52aacfd2e5317ae7ab8b1b Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Thu, 11 May 2006 15:19:48 -0400 Subject: [PATCH 21/50] Set memory properly. --HG-- extra : convert_revision : 4e6c61d31bf052bb4aabf4bb7a4f0e870b44b771 --- cpu/cpu_exec_context.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpu/cpu_exec_context.cc b/cpu/cpu_exec_context.cc index 4400cf842..e15ba7e66 100644 --- a/cpu/cpu_exec_context.cc +++ b/cpu/cpu_exec_context.cc @@ -94,7 +94,7 @@ CPUExecContext::CPUExecContext(BaseCPU *_cpu, int _thread_num, CPUExecContext::CPUExecContext(BaseCPU *_cpu, int _thread_num, FunctionalMemory *_mem, int _asid) - : cpu(_cpu), thread_num(_thread_num), process(0), mem(NULL), asid(_asid), + : cpu(_cpu), thread_num(_thread_num), process(0), mem(_mem), asid(_asid), func_exe_inst(0), storeCondFailures(0) { memset(®s, 0, sizeof(RegFile)); From 8a9416ef8df05c24231a063680f61d2313cf5c32 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Thu, 11 May 2006 15:39:02 -0400 Subject: [PATCH 22/50] Small fixes to O3 model. cpu/o3/alpha_dyn_inst.hh: Set the instResult using a function on the base dyn inst. cpu/o3/bpred_unit_impl.hh: Don't need to reset the state. cpu/o3/commit_impl.hh: Mark instructions as completed. Wait until all stores are written back to handle a fault. cpu/o3/cpu.cc: Clear instruction lists when switching out. cpu/o3/lsq_unit.hh: Allow wbEvent to be set externally. cpu/o3/lsq_unit_impl.hh: Mark instructions as completed properly. Also use events for writing back stores even if there is a hit in the dcache. 
--HG-- extra : convert_revision : 172ad088b75ac31e848a5040633152b5c051444c --- cpu/o3/alpha_dyn_inst.hh | 8 ++++---- cpu/o3/bpred_unit_impl.hh | 2 ++ cpu/o3/commit_impl.hh | 9 +++++++++ cpu/o3/cpu.cc | 6 ++++++ cpu/o3/lsq_unit.hh | 2 ++ cpu/o3/lsq_unit_impl.hh | 39 +++++++++++++++------------------------ 6 files changed, 38 insertions(+), 28 deletions(-) diff --git a/cpu/o3/alpha_dyn_inst.hh b/cpu/o3/alpha_dyn_inst.hh index e0b73f17e..24774bd0a 100644 --- a/cpu/o3/alpha_dyn_inst.hh +++ b/cpu/o3/alpha_dyn_inst.hh @@ -183,25 +183,25 @@ class AlphaDynInst : public BaseDynInst void setIntReg(const StaticInst *si, int idx, uint64_t val) { this->cpu->setIntReg(_destRegIdx[idx], val); - this->instResult.integer = val; + BaseDynInst::setIntReg(si, idx, val); } void setFloatRegSingle(const StaticInst *si, int idx, float val) { this->cpu->setFloatRegSingle(_destRegIdx[idx], val); - this->instResult.fp = val; + BaseDynInst::setFloatRegSingle(si, idx, val); } void setFloatRegDouble(const StaticInst *si, int idx, double val) { this->cpu->setFloatRegDouble(_destRegIdx[idx], val); - this->instResult.dbl = val; + BaseDynInst::setFloatRegDouble(si, idx, val); } void setFloatRegInt(const StaticInst *si, int idx, uint64_t val) { this->cpu->setFloatRegInt(_destRegIdx[idx], val); - this->instResult.integer = val; + BaseDynInst::setFloatRegInt(si, idx, val); } /** Returns the physical register index of the i'th destination diff --git a/cpu/o3/bpred_unit_impl.hh b/cpu/o3/bpred_unit_impl.hh index 872c0c62e..d20b31e55 100644 --- a/cpu/o3/bpred_unit_impl.hh +++ b/cpu/o3/bpred_unit_impl.hh @@ -107,11 +107,13 @@ template void TwobitBPredUnit::takeOverFrom() { +/* for (int i = 0; i < Impl::MaxThreads; ++i) RAS[i].reset(); BP.reset(); BTB.reset(); +*/ } template diff --git a/cpu/o3/commit_impl.hh b/cpu/o3/commit_impl.hh index 7834460e2..034565f90 100644 --- a/cpu/o3/commit_impl.hh +++ b/cpu/o3/commit_impl.hh @@ -1117,6 +1117,10 @@ head_inst->isWriteBarrier())*/ panic("Barrier instructions are 
not handled yet.\n"); } + if (!head_inst->isStore()) { + head_inst->setCompleted(); + } + // Check if the instruction caused a fault. If so, trap. Fault inst_fault = head_inst->getFault(); @@ -1126,6 +1130,11 @@ head_inst->isWriteBarrier())*/ DPRINTF(Commit, "Inst [sn:%lli] PC %#x has a fault\n", head_inst->seqNum, head_inst->readPC()); + if (iewStage->hasStoresToWB()) { + DPRINTF(Commit, "Stores outstanding, fault must wait.\n"); + return false; + } + assert(!thread[tid]->inSyscall); thread[tid]->inSyscall = true; diff --git a/cpu/o3/cpu.cc b/cpu/o3/cpu.cc index fc8372026..59308d6a9 100644 --- a/cpu/o3/cpu.cc +++ b/cpu/o3/cpu.cc @@ -666,6 +666,12 @@ FullO3CPU::switchOut(Sampler *sampler) rename.switchOut(); iew.switchOut(); commit.switchOut(); + + instList.clear(); + while (!removeList.empty()) { + removeList.pop(); + } + if (tickEvent.scheduled()) tickEvent.squash(); sampler->signalSwitched(); diff --git a/cpu/o3/lsq_unit.hh b/cpu/o3/lsq_unit.hh index d17efe96a..623dbdb4b 100644 --- a/cpu/o3/lsq_unit.hh +++ b/cpu/o3/lsq_unit.hh @@ -82,7 +82,9 @@ class LSQUnit { /** The writeback event for the store. Needed for store * conditionals. */ + public: Event *wbEvent; + private: /** The pointer to the LSQ unit that issued the store. */ LSQUnit *lsqPtr; }; diff --git a/cpu/o3/lsq_unit_impl.hh b/cpu/o3/lsq_unit_impl.hh index c5ce34c70..3bb9a81f8 100644 --- a/cpu/o3/lsq_unit_impl.hh +++ b/cpu/o3/lsq_unit_impl.hh @@ -672,11 +672,6 @@ LSQUnit::writebackStores() req->paddr, *(req->data), storeQueue[storeWBIdx].inst->seqNum); -// if (fault != NoFault) { - //What should we do if there is a fault??? 
- //for now panic -// panic("Page Table Fault!!!!!\n"); -// } switch(storeQueue[storeWBIdx].size) { case 1: cpu->write(req, (uint8_t &)storeQueue[storeWBIdx].data); @@ -693,8 +688,16 @@ LSQUnit::writebackStores() default: panic("Unexpected store size!\n"); } + if (!(req->flags & LOCKED)) { + storeQueue[storeWBIdx].inst->setCompleted(); + } if (dcacheInterface) { + assert(!req->completionEvent); + StoreCompletionEvent *store_event = new + StoreCompletionEvent(storeWBIdx, NULL, this); + req->completionEvent = store_event; + MemAccessResult result = dcacheInterface->access(req); if (isStalled() && @@ -710,16 +713,12 @@ LSQUnit::writebackStores() if (result != MA_HIT && dcacheInterface->doEvents()) { typename IEW::LdWritebackEvent *wb = NULL; if (req->flags & LOCKED) { - // Stx_C does not generate a system port transaction. -/* - if (cpu->lockFlag && cpu->lockAddr == req->paddr) { - req->result=1; - } else { - req->result = 0; - } -*/ - wb = new typename IEW::LdWritebackEvent(storeQueue[storeWBIdx].inst, - iewStage); + // Stx_C should not generate a system port transaction, + // but that might be hard to accomplish. + wb = new typename + IEW::LdWritebackEvent(storeQueue[storeWBIdx].inst, + iewStage); + store_event->wbEvent = wb; } DPRINTF(LSQUnit,"D-Cache Write Miss!\n"); @@ -727,12 +726,6 @@ LSQUnit::writebackStores() DPRINTF(Activity, "Active st accessing mem miss [sn:%lli]\n", storeQueue[storeWBIdx].inst->seqNum); - // Will stores need their own kind of writeback events? - // Do stores even need writeback events? 
- assert(!req->completionEvent); - req->completionEvent = new - StoreCompletionEvent(storeWBIdx, wb, this); - lastDcacheStall = curTick; // _status = DcacheMissStall; @@ -766,10 +759,8 @@ LSQUnit::writebackStores() typename IEW::LdWritebackEvent *wb = new typename IEW::LdWritebackEvent(storeQueue[storeWBIdx].inst, iewStage); - wb->schedule(curTick); + store_event->wbEvent = wb; } - - completeStore(storeWBIdx); } incrStIdx(storeWBIdx); From 21df09cf7aa6bdec5de11904751d355e773a3168 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Thu, 11 May 2006 19:18:36 -0400 Subject: [PATCH 23/50] Fixes for ozone CPU to successfully boot and run linux. cpu/base_dyn_inst.hh: Remove snoop function (did not mean to commit it). cpu/ozone/back_end_impl.hh: Set instruction as having its result ready, not completed. cpu/ozone/cpu.hh: Fixes for store conditionals. Use an additional lock addr list to make sure that the access is valid. I don't know if this is fully necessary, but it gives me peace of mind (at some performance cost). Make sure to schedule for cycles(1) and not just 1 cycle in the future as tick = 1ps. Also support the new Checker. cpu/ozone/cpu_builder.cc: Add parameter for maxOutstandingMemOps so it can be set through the config. Also add in the checker. Right now it's a BaseCPU simobject, but that may change in the future. cpu/ozone/cpu_impl.hh: Add support for the checker. For now there's a dynamic cast to convert the simobject passed back from the builder to the proper Checker type. It's ugly, but only happens at startup, and is probably a justified use of dynamic cast. Support switching out/taking over from other CPUs. Correct indexing problem for float registers. cpu/ozone/dyn_inst.hh: Add ability for instructions to wait on memory instructions in addition to source register instructions. This is needed for memory dependence predictors and memory barriers. cpu/ozone/dyn_inst_impl.hh: Support waiting on memory operations. 
Use "resultReady" to differentiate an instruction having its registers produced vs being totally completed. cpu/ozone/front_end.hh: Support switching out. Also record if an interrupt is pending. cpu/ozone/front_end_impl.hh: Support switching out. Also support stalling the front end if an interrupt is pending. cpu/ozone/lw_back_end.hh: Add checker in. Support switching out. Support memory barriers. cpu/ozone/lw_back_end_impl.hh: Lots of changes to get things to work right. Faults, traps, interrupts all wait until all stores have written back (important). Memory barriers are supported, as is the general ability for instructions to be dependent on other memory instructions. cpu/ozone/lw_lsq.hh: Support switching out. Also use store writeback events in all cases, not just dcache misses. cpu/ozone/lw_lsq_impl.hh: Support switching out. Also use store writeback events in all cases, not just dcache misses. Support the checker CPU. Marks instructions as completed once the functional access is done (which has to be done for the checker to be able to verify results). cpu/ozone/simple_params.hh: Add max outstanding mem ops parameter. python/m5/objects/OzoneCPU.py: Add max outstanding mem ops, checker. 
--HG-- extra : convert_revision : f4d408e1bb1f25836a097b6abe3856111e950c59 --- cpu/base_dyn_inst.hh | 5 - cpu/ozone/back_end_impl.hh | 2 +- cpu/ozone/cpu.hh | 28 +++- cpu/ozone/cpu_builder.cc | 16 ++- cpu/ozone/cpu_impl.hh | 118 ++++++++++++---- cpu/ozone/dyn_inst.hh | 40 ++++-- cpu/ozone/dyn_inst_impl.hh | 43 +++++- cpu/ozone/front_end.hh | 13 ++ cpu/ozone/front_end_impl.hh | 58 +++++++- cpu/ozone/lw_back_end.hh | 20 ++- cpu/ozone/lw_back_end_impl.hh | 256 +++++++++++++++++++++++++++------- cpu/ozone/lw_lsq.hh | 32 ++++- cpu/ozone/lw_lsq_impl.hh | 189 +++++++++++++++++++------ cpu/ozone/simple_params.hh | 1 + python/m5/objects/OzoneCPU.py | 3 + 15 files changed, 660 insertions(+), 164 deletions(-) diff --git a/cpu/base_dyn_inst.hh b/cpu/base_dyn_inst.hh index 18978142d..cd754dc3c 100644 --- a/cpu/base_dyn_inst.hh +++ b/cpu/base_dyn_inst.hh @@ -117,11 +117,6 @@ class BaseDynInst : public FastAlloc, public RefCounted Fault write(T data, Addr addr, unsigned flags, uint64_t *res); - // @todo: Probably should not have this function in the DynInst. 
- template - bool snoop(MemReqPtr &req, T &data) - { return cpu->snoop(req, data); } - void prefetch(Addr addr, unsigned flags); void writeHint(Addr addr, int size, unsigned flags); Fault copySrcTranslate(Addr src); diff --git a/cpu/ozone/back_end_impl.hh b/cpu/ozone/back_end_impl.hh index 0b0f04f59..36770d65c 100644 --- a/cpu/ozone/back_end_impl.hh +++ b/cpu/ozone/back_end_impl.hh @@ -1385,7 +1385,7 @@ BackEnd::writebackInsts() inst->seqNum, inst->readPC()); inst->setCanCommit(); - inst->setCompleted(); + inst->setResultReady(); if (inst->isExecuted()) { int dependents = IQ.wakeDependents(inst); diff --git a/cpu/ozone/cpu.hh b/cpu/ozone/cpu.hh index 56b6571a2..eec8902d8 100644 --- a/cpu/ozone/cpu.hh +++ b/cpu/ozone/cpu.hh @@ -53,6 +53,7 @@ class AlphaDTB; class PhysicalMemory; class MemoryController; +class Sampler; class RemoteGDB; class GDBListener; @@ -69,6 +70,9 @@ namespace Trace { class InstRecord; } +template +class Checker; + /** * Declaration of Out-of-Order CPU class. Basically it is a SimpleCPU with * simple out-of-order capabilities added to it. It is still a 1 CPI machine @@ -226,7 +230,9 @@ class OzoneCPU : public BaseCPU }; // execution context proxy - OzoneXC xcProxy; + OzoneXC ozoneXC; + ExecContext *xcProxy; + ExecContext *checkerXC; typedef OzoneThreadState ImplState; @@ -245,6 +251,7 @@ class OzoneCPU : public BaseCPU void tick(); std::set snList; + std::set lockAddrList; private: struct TickEvent : public Event { @@ -262,9 +269,9 @@ class OzoneCPU : public BaseCPU void scheduleTickEvent(int delay) { if (tickEvent.squashed()) - tickEvent.reschedule(curTick + delay); + tickEvent.reschedule(curTick + cycles(delay)); else if (!tickEvent.scheduled()) - tickEvent.schedule(curTick + delay); + tickEvent.schedule(curTick + cycles(delay)); } /// Unschedule tick event, regardless of its current state. 
@@ -322,7 +329,7 @@ class OzoneCPU : public BaseCPU int cpuId; - void switchOut(); + void switchOut(Sampler *sampler); void takeOverFrom(BaseCPU *oldCPU); #if FULL_SYSTEM @@ -472,6 +479,7 @@ class OzoneCPU : public BaseCPU Fault error; if (req->flags & LOCKED) { // lockAddr = req->paddr; + lockAddrList.insert(req->paddr); lockFlag = true; } @@ -546,7 +554,13 @@ class OzoneCPU : public BaseCPU req->result = 2; } else { if (this->lockFlag/* && this->lockAddr == req->paddr*/) { - req->result = 1; + if (lockAddrList.find(req->paddr) != + lockAddrList.end()) { + req->result = 1; + } else { + req->result = 0; + return NoFault; + } } else { req->result = 0; return NoFault; @@ -599,7 +613,7 @@ class OzoneCPU : public BaseCPU void setSyscallReturn(SyscallReturn return_value, int tid); #endif - ExecContext *xcBase() { return &xcProxy; } + ExecContext *xcBase() { return xcProxy; } bool decoupledFrontEnd; struct CommStruct { @@ -615,6 +629,8 @@ class OzoneCPU : public BaseCPU bool lockFlag; Stats::Scalar<> quiesceCycles; + + Checker *checker; }; #endif // __CPU_OZONE_CPU_HH__ diff --git a/cpu/ozone/cpu_builder.cc b/cpu/ozone/cpu_builder.cc index 0146dd1bd..64aa49c71 100644 --- a/cpu/ozone/cpu_builder.cc +++ b/cpu/ozone/cpu_builder.cc @@ -1,6 +1,7 @@ #include +#include "cpu/checker/cpu.hh" #include "cpu/inst_seq.hh" #include "cpu/ozone/cpu.hh" #include "cpu/ozone/ozone_impl.hh" @@ -50,6 +51,8 @@ SimObjectVectorParam workload; SimObjectParam mem; +SimObjectParam checker; + Param max_insts_any_thread; Param max_insts_all_threads; Param max_loads_any_thread; @@ -66,6 +69,7 @@ Param backEndSquashLatency; Param backEndLatency; Param maxInstBufferSize; Param numPhysicalRegs; +Param maxOutstandingMemOps; Param decodeToFetchDelay; Param renameToFetchDelay; @@ -164,6 +168,8 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivOzoneCPU) INIT_PARAM_DFLT(mem, "Memory", NULL), + INIT_PARAM_DFLT(checker, "Checker CPU", NULL), + INIT_PARAM_DFLT(max_insts_any_thread, "Terminate when any thread reaches this 
inst count", 0), @@ -190,6 +196,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivOzoneCPU) INIT_PARAM_DFLT(backEndLatency, "Back end latency", 1), INIT_PARAM_DFLT(maxInstBufferSize, "Maximum instruction buffer size", 16), INIT_PARAM(numPhysicalRegs, "Number of physical registers"), + INIT_PARAM_DFLT(maxOutstandingMemOps, "Maximum outstanding memory operations", 4), INIT_PARAM(decodeToFetchDelay, "Decode to fetch delay"), INIT_PARAM(renameToFetchDelay, "Rename to fetch delay"), @@ -314,7 +321,7 @@ CREATE_SIM_OBJECT(DerivOzoneCPU) #endif // FULL_SYSTEM params->mem = mem; - + params->checker = checker; params->max_insts_any_thread = max_insts_any_thread; params->max_insts_all_threads = max_insts_all_threads; params->max_loads_any_thread = max_loads_any_thread; @@ -334,6 +341,7 @@ CREATE_SIM_OBJECT(DerivOzoneCPU) params->backEndLatency = backEndLatency; params->maxInstBufferSize = maxInstBufferSize; params->numPhysicalRegs = numPhysIntRegs + numPhysFloatRegs; + params->maxOutstandingMemOps = maxOutstandingMemOps; params->decodeToFetchDelay = decodeToFetchDelay; params->renameToFetchDelay = renameToFetchDelay; @@ -445,6 +453,8 @@ SimObjectVectorParam workload; SimObjectParam mem; +SimObjectParam checker; + Param max_insts_any_thread; Param max_insts_all_threads; Param max_loads_any_thread; @@ -559,6 +569,8 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(SimpleOzoneCPU) INIT_PARAM_DFLT(mem, "Memory", NULL), + INIT_PARAM_DFLT(checker, "Checker CPU", NULL), + INIT_PARAM_DFLT(max_insts_any_thread, "Terminate when any thread reaches this inst count", 0), @@ -709,7 +721,7 @@ CREATE_SIM_OBJECT(SimpleOzoneCPU) #endif // FULL_SYSTEM params->mem = mem; - + params->checker = checker; params->max_insts_any_thread = max_insts_any_thread; params->max_insts_all_threads = max_insts_all_threads; params->max_loads_any_thread = max_loads_any_thread; diff --git a/cpu/ozone/cpu_impl.hh b/cpu/ozone/cpu_impl.hh index 17d944e7c..4f3fdf521 100644 --- a/cpu/ozone/cpu_impl.hh +++ b/cpu/ozone/cpu_impl.hh @@ -33,6 +33,7 @@ 
#include "base/trace.hh" #include "config/full_system.hh" #include "cpu/base.hh" +#include "cpu/checker/exec_context.hh" #include "cpu/exec_context.hh" #include "cpu/exetrace.hh" #include "cpu/ozone/cpu.hh" @@ -156,17 +157,33 @@ OzoneCPU::OzoneCPU(Params *p) #endif comm(5, 5) { - + if (p->checker) { + BaseCPU *temp_checker = p->checker; + checker = dynamic_cast *>(temp_checker); + } else { + checker = NULL; + } frontEnd = new FrontEnd(p); backEnd = new BackEnd(p); _status = Idle; - thread.xcProxy = &xcProxy; + if (checker) { + checker->setMemory(mem); +#if FULL_SYSTEM + checker->setSystem(p->system); +#endif + checkerXC = new CheckerExecContext(&ozoneXC, checker); + thread.xcProxy = checkerXC; + xcProxy = checkerXC; + } else { + thread.xcProxy = &ozoneXC; + xcProxy = &ozoneXC; + } thread.inSyscall = false; - xcProxy.cpu = this; - xcProxy.thread = &thread; + ozoneXC.cpu = this; + ozoneXC.thread = &thread; thread.setStatus(ExecContext::Suspended); #if FULL_SYSTEM @@ -177,7 +194,7 @@ OzoneCPU::OzoneCPU(Params *p) thread.tid = 0; thread.mem = p->mem; - thread.quiesceEvent = new EndQuiesceEvent(&xcProxy); + thread.quiesceEvent = new EndQuiesceEvent(xcProxy); system = p->system; itb = p->itb; @@ -187,9 +204,10 @@ OzoneCPU::OzoneCPU(Params *p) if (p->profile) { thread.profile = new FunctionProfile(p->system->kernelSymtab); + // @todo: This might be better as an ExecContext instead of OzoneXC Callback *cb = new MakeCallback(&xcProxy); + &OzoneXC::dumpFuncProfile>(&ozoneXC); registerExitCallback(cb); } @@ -198,7 +216,6 @@ OzoneCPU::OzoneCPU(Params *p) static ProfileNode dummyNode; thread.profileNode = &dummyNode; thread.profilePC = 3; - #else // xc = new ExecContext(this, /* thread_num */ 0, p->workload[0], /* asid */ 0); thread.cpu = this; @@ -225,13 +242,13 @@ OzoneCPU::OzoneCPU(Params *p) issueWidth = p->issueWidth; */ - execContexts.push_back(&xcProxy); + execContexts.push_back(xcProxy); frontEnd->setCPU(this); backEnd->setCPU(this); - frontEnd->setXC(&xcProxy); - 
backEnd->setXC(&xcProxy); + frontEnd->setXC(xcProxy); + backEnd->setXC(xcProxy); frontEnd->setThreadState(&thread); backEnd->setThreadState(&thread); @@ -250,7 +267,7 @@ OzoneCPU::OzoneCPU(Params *p) for (int i = 0; i < TheISA::TotalNumRegs; ++i) { thread.renameTable[i] = new DynInst(this); - thread.renameTable[i]->setCompleted(); + thread.renameTable[i]->setResultReady(); } frontEnd->renameTable.copyFrom(thread.renameTable); @@ -312,11 +329,15 @@ OzoneCPU::copyToXC() */ template void -OzoneCPU::switchOut() +OzoneCPU::switchOut(Sampler *sampler) { + // Front end needs state from back end, so switch out the back end first. + backEnd->switchOut(); + frontEnd->switchOut(); _status = SwitchedOut; if (tickEvent.scheduled()) tickEvent.squash(); + sampler->signalSwitched(); } template @@ -325,8 +346,16 @@ OzoneCPU::takeOverFrom(BaseCPU *oldCPU) { BaseCPU::takeOverFrom(oldCPU); + backEnd->takeOverFrom(); + frontEnd->takeOverFrom(); assert(!tickEvent.scheduled()); + // @todo: Fix hardcoded number + // Clear out any old information in time buffer. + for (int i = 0; i < 6; ++i) { + comm.advance(); + } + // if any of this CPU's ExecContexts are active, mark the CPU as // running and schedule its tick event. 
for (int i = 0; i < execContexts.size(); ++i) { @@ -470,7 +499,7 @@ OzoneCPU::serialize(std::ostream &os) BaseCPU::serialize(os); SERIALIZE_ENUM(_status); nameOut(os, csprintf("%s.xc", name())); - xcProxy.serialize(os); + ozoneXC.serialize(os); nameOut(os, csprintf("%s.tickEvent", name())); tickEvent.serialize(os); } @@ -481,7 +510,7 @@ OzoneCPU::unserialize(Checkpoint *cp, const std::string §ion) { BaseCPU::unserialize(cp, section); UNSERIALIZE_ENUM(_status); - xcProxy.unserialize(cp, csprintf("%s.xc", section)); + ozoneXC.unserialize(cp, csprintf("%s.xc", section)); tickEvent.unserialize(cp, csprintf("%s.tickEvent", section)); } @@ -579,7 +608,7 @@ template Addr OzoneCPU::dbg_vtophys(Addr addr) { - return vtophys(&xcProxy, addr); + return vtophys(xcProxy, addr); } #endif // FULL_SYSTEM /* @@ -725,7 +754,7 @@ OzoneCPU::tick() comInstEventQueue[0]->serviceEvents(numInst); if (!tickEvent.scheduled() && _status == Running) - tickEvent.schedule(curTick + 1); + tickEvent.schedule(curTick + cycles(1)); } template @@ -750,7 +779,7 @@ OzoneCPU::syscall() DPRINTF(OzoneCPU, "FuncExeInst: %i\n", thread.funcExeInst); - thread.process->syscall(&xcProxy); + thread.process->syscall(xcProxy); thread.funcExeInst--; @@ -784,19 +813,17 @@ OzoneCPU::hwrei() { // Need to move this to ISA code // May also need to make this per thread +/* if (!inPalMode()) return new UnimplementedOpcodeFault; thread.setNextPC(thread.readMiscReg(AlphaISA::IPR_EXC_ADDR)); - +*/ lockFlag = false; + lockAddrList.clear(); + kernelStats->hwrei(); - // Not sure how to make a similar check in the Ozone model -// if (!misspeculating()) { - kernelStats->hwrei(); - - checkInterrupts = true; -// } + checkInterrupts = true; // FIXME: XXX check for interrupts? 
XXX return NoFault; @@ -847,6 +874,11 @@ OzoneCPU::processInterrupts() if (ipl && ipl > thread.readMiscReg(IPR_IPLR)) { thread.setMiscReg(IPR_ISR, summary); thread.setMiscReg(IPR_INTID, ipl); + // @todo: Make this more transparent + if (checker) { + checkerXC->setMiscReg(IPR_ISR, summary); + checkerXC->setMiscReg(IPR_INTID, ipl); + } Fault fault = new InterruptFault; fault->invoke(thread.getXCProxy()); DPRINTF(Flow, "Interrupt! IPLR=%d ipl=%d summary=%x\n", @@ -860,7 +892,7 @@ OzoneCPU::simPalCheck(int palFunc) { // Need to move this to ISA code // May also need to make this per thread - this->kernelStats->callpal(palFunc, &xcProxy); + this->kernelStats->callpal(palFunc, xcProxy); switch (palFunc) { case PAL::halt: @@ -944,7 +976,28 @@ OzoneCPU::OzoneXC::dumpFuncProfile() template void OzoneCPU::OzoneXC::takeOverFrom(ExecContext *old_context) -{ } +{ + // some things should already be set up + assert(getMemPtr() == old_context->getMemPtr()); +#if FULL_SYSTEM + assert(getSystemPtr() == old_context->getSystemPtr()); +#else + assert(getProcessPtr() == old_context->getProcessPtr()); +#endif + + // copy over functional state + setStatus(old_context->status()); + copyArchRegs(old_context); + setCpuId(old_context->readCpuId()); +#if !FULL_SYSTEM + setFuncExeInst(old_context->readFuncExeInst()); +#endif + +// storeCondFailures = 0; + cpu->lockFlag = false; + + old_context->setStatus(ExecContext::Unallocated); +} template void @@ -1062,21 +1115,24 @@ template float OzoneCPU::OzoneXC::readFloatRegSingle(int reg_idx) { - return thread->renameTable[reg_idx]->readFloatResult(); + int idx = reg_idx + TheISA::FP_Base_DepTag; + return thread->renameTable[idx]->readFloatResult(); } template double OzoneCPU::OzoneXC::readFloatRegDouble(int reg_idx) { - return thread->renameTable[reg_idx]->readDoubleResult(); + int idx = reg_idx + TheISA::FP_Base_DepTag; + return thread->renameTable[idx]->readDoubleResult(); } template uint64_t OzoneCPU::OzoneXC::readFloatRegInt(int reg_idx) { - 
return thread->renameTable[reg_idx]->readIntResult(); + int idx = reg_idx + TheISA::FP_Base_DepTag; + return thread->renameTable[idx]->readIntResult(); } template @@ -1101,7 +1157,9 @@ template void OzoneCPU::OzoneXC::setFloatRegDouble(int reg_idx, double val) { - thread->renameTable[reg_idx]->setDoubleResult(val); + int idx = reg_idx + TheISA::FP_Base_DepTag; + + thread->renameTable[idx]->setDoubleResult(val); if (!thread->inSyscall) { cpu->squashFromXC(); diff --git a/cpu/ozone/dyn_inst.hh b/cpu/ozone/dyn_inst.hh index 4382af0fd..f251c28ea 100644 --- a/cpu/ozone/dyn_inst.hh +++ b/cpu/ozone/dyn_inst.hh @@ -59,9 +59,9 @@ class OzoneDynInst : public BaseDynInst typedef TheISA::MiscReg MiscReg; typedef typename std::list::iterator ListIt; - // Note that this is duplicated from the BaseDynInst class; I'm simply not - // sure the enum would carry through so I could use it in array - // declarations in this class. + // Note that this is duplicated from the BaseDynInst class; I'm + // simply not sure the enum would carry through so I could use it + // in array declarations in this class. enum { MaxInstSrcRegs = TheISA::MaxInstSrcRegs, MaxInstDestRegs = TheISA::MaxInstDestRegs @@ -90,9 +90,23 @@ class OzoneDynInst : public BaseDynInst void addDependent(DynInstPtr &dependent_inst); std::vector &getDependents() { return dependents; } + std::vector &getMemDeps() { return memDependents; } + std::list &getMemSrcs() { return srcMemInsts; } void wakeDependents(); + void wakeMemDependents(); + + void addMemDependent(DynInstPtr &inst) { memDependents.push_back(inst); } + + void addSrcMemInst(DynInstPtr &inst) { srcMemInsts.push_back(inst); } + + void markMemInstReady(OzoneDynInst *inst); + + // For now I will remove instructions from the list when they wake + // up. In the future, you only really need a counter. 
+ bool memDepReady() { return srcMemInsts.empty(); } + // void setBPredInfo(const BPredInfo &bp_info) { bpInfo = bp_info; } // BPredInfo &getBPredInfo() { return bpInfo; } @@ -104,9 +118,13 @@ class OzoneDynInst : public BaseDynInst std::vector dependents; - /** The instruction that produces the value of the source registers. These - * may be NULL if the value has already been read from the source - * instruction. + std::vector memDependents; + + std::list srcMemInsts; + + /** The instruction that produces the value of the source + * registers. These may be NULL if the value has already been + * read from the source instruction. */ DynInstPtr srcInsts[MaxInstSrcRegs]; @@ -165,22 +183,22 @@ class OzoneDynInst : public BaseDynInst */ void setIntReg(const StaticInst *si, int idx, uint64_t val) { - this->instResult.integer = val; + BaseDynInst::setIntReg(si, idx, val); } void setFloatRegSingle(const StaticInst *si, int idx, float val) { - this->instResult.fp = val; + BaseDynInst::setFloatRegSingle(si, idx, val); } void setFloatRegDouble(const StaticInst *si, int idx, double val) { - this->instResult.dbl = val; + BaseDynInst::setFloatRegDouble(si, idx, val); } void setFloatRegInt(const StaticInst *si, int idx, uint64_t val) { - this->instResult.integer = val; + BaseDynInst::setFloatRegInt(si, idx, val); } void setIntResult(uint64_t result) { this->instResult.integer = result; } @@ -199,6 +217,8 @@ class OzoneDynInst : public BaseDynInst void clearDependents(); + void clearMemDependents(); + public: // ISA stuff MiscReg readMiscReg(int misc_reg); diff --git a/cpu/ozone/dyn_inst_impl.hh b/cpu/ozone/dyn_inst_impl.hh index c83481c9a..a7e4460a1 100644 --- a/cpu/ozone/dyn_inst_impl.hh +++ b/cpu/ozone/dyn_inst_impl.hh @@ -38,7 +38,7 @@ template OzoneDynInst::OzoneDynInst(FullCPU *cpu) : BaseDynInst(0, 0, 0, 0, cpu) { - this->setCompleted(); + this->setResultReady(); initInstPtrs(); } @@ -130,7 +130,7 @@ template bool OzoneDynInst::srcInstReady(int regIdx) { - return 
srcInsts[regIdx]->isCompleted(); + return srcInsts[regIdx]->isResultReady(); } template @@ -149,6 +149,28 @@ OzoneDynInst::wakeDependents() } } +template +void +OzoneDynInst::wakeMemDependents() +{ + for (int i = 0; i < memDependents.size(); ++i) { + memDependents[i]->markMemInstReady(this); + } +} + +template +void +OzoneDynInst::markMemInstReady(OzoneDynInst *inst) +{ + ListIt mem_it = srcMemInsts.begin(); + while ((*mem_it) != inst && mem_it != srcMemInsts.end()) { + mem_it++; + } + assert(mem_it != srcMemInsts.end()); + + srcMemInsts.erase(mem_it); +} + template void OzoneDynInst::initInstPtrs() @@ -164,7 +186,7 @@ bool OzoneDynInst::srcsReady() { for (int i = 0; i < this->numSrcRegs(); ++i) { - if (!srcInsts[i]->isCompleted()) + if (!srcInsts[i]->isResultReady()) return false; } @@ -176,7 +198,7 @@ bool OzoneDynInst::eaSrcsReady() { for (int i = 1; i < this->numSrcRegs(); ++i) { - if (!srcInsts[i]->isCompleted()) + if (!srcInsts[i]->isResultReady()) return false; } @@ -195,6 +217,14 @@ OzoneDynInst::clearDependents() prevDestInst[i] = NULL; } } + +template +void +OzoneDynInst::clearMemDependents() +{ + memDependents.clear(); +} + template MiscReg OzoneDynInst::readMiscReg(int misc_reg) @@ -213,6 +243,7 @@ template Fault OzoneDynInst::setMiscReg(int misc_reg, const MiscReg &val) { + this->setIntResult(val); return this->thread->setMiscReg(misc_reg, val); } @@ -234,11 +265,13 @@ OzoneDynInst::hwrei() this->setNextPC(this->thread->readMiscReg(AlphaISA::IPR_EXC_ADDR)); + this->cpu->hwrei(); +/* this->cpu->kernelStats->hwrei(); this->cpu->checkInterrupts = true; this->cpu->lockFlag = false; - +*/ // FIXME: XXX check for interrupts? 
XXX return NoFault; } diff --git a/cpu/ozone/front_end.hh b/cpu/ozone/front_end.hh index 2bff2544d..188925ae5 100644 --- a/cpu/ozone/front_end.hh +++ b/cpu/ozone/front_end.hh @@ -66,6 +66,14 @@ class FrontEnd bool isEmpty() { return instBuffer.empty(); } + void switchOut(); + + void takeOverFrom(ExecContext *old_xc = NULL); + + bool isSwitchedOut() { return switchedOut; } + + bool switchedOut; + private: bool updateStatus(); @@ -198,6 +206,9 @@ class FrontEnd DynInstPtr barrierInst; + public: + bool interruptPending; + private: // number of idle cycles /* Stats::Average<> notIdleFraction; @@ -223,6 +234,8 @@ class FrontEnd Stats::Scalar<> fetchBlockedCycles; /** Stat for total number of fetched cache lines. */ Stats::Scalar<> fetchedCacheLines; + + Stats::Scalar<> fetchIcacheSquashes; /** Distribution of number of instructions fetched each cycle. */ Stats::Distribution<> fetchNisnDist; // Stats::Vector<> qfull_iq_occupancy; diff --git a/cpu/ozone/front_end_impl.hh b/cpu/ozone/front_end_impl.hh index 7c18386cf..a3eb809d0 100644 --- a/cpu/ozone/front_end_impl.hh +++ b/cpu/ozone/front_end_impl.hh @@ -19,8 +19,11 @@ FrontEnd::FrontEnd(Params *params) width(params->frontEndWidth), freeRegs(params->numPhysicalRegs), numPhysRegs(params->numPhysicalRegs), - serializeNext(false) + serializeNext(false), + interruptPending(false) { + switchedOut = false; + status = Idle; // Setup branch predictor. @@ -127,6 +130,11 @@ FrontEnd::regStats() .desc("Number of cache lines fetched") .prereq(fetchedCacheLines); + fetchIcacheSquashes + .name(name() + ".fetchIcacheSquashes") + .desc("Number of outstanding Icache misses that were squashed") + .prereq(fetchIcacheSquashes); + fetchNisnDist .init(/* base value */ 0, /* last value */ width, @@ -370,6 +378,10 @@ FrontEnd::fetchCacheLine() #endif // FULL_SYSTEM Fault fault = NoFault; + if (interruptPending && flags == 0) { + return fault; + } + // Align the fetch PC so it's at the start of a cache block. 
Addr fetch_PC = icacheBlockAlignPC(PC); @@ -397,7 +409,8 @@ FrontEnd::fetchCacheLine() // exists within the cache. if (icacheInterface && fault == NoFault) { #if FULL_SYSTEM - if (cpu->system->memctrl->badaddr(memReq->paddr)) { + if (cpu->system->memctrl->badaddr(memReq->paddr) || + memReq->flags & UNCACHEABLE) { DPRINTF(FE, "Fetch: Bad address %#x (hopefully on a " "misspeculating path!", memReq->paddr); @@ -497,7 +510,7 @@ FrontEnd::processBarriers(DynInstPtr &inst) dispatchedTempSerializing++; } - // Change status over to BarrierStall so that other stages know + // Change status over to SerializeBlocked so that other stages know // what this is blocked on. status = SerializeBlocked; @@ -613,8 +626,10 @@ FrontEnd::processCacheCompletion(MemReqPtr &req) // Do something here. if (status != IcacheMissStall || - req != memReq) { + req != memReq || + switchedOut) { DPRINTF(FE, "Previous fetch was squashed.\n"); + fetchIcacheSquashes++; return; } @@ -702,6 +717,7 @@ FrontEnd::getInstFromCacheline() DynInstPtr inst = barrierInst; status = Running; barrierInst = NULL; + inst->clearSerializeBefore(); return inst; } @@ -773,7 +789,7 @@ FrontEnd::renameInst(DynInstPtr &inst) DPRINTF(FE, "[sn:%lli]: Src reg %i is inst [sn:%lli]\n", inst->seqNum, (int)inst->srcRegIdx(i), src_inst->seqNum); - if (src_inst->isCompleted()) { + if (src_inst->isResultReady()) { DPRINTF(FE, "Reg ready.\n"); inst->markSrcRegReady(i); } else { @@ -807,6 +823,38 @@ FrontEnd::wakeFromQuiesce() status = Running; } +template +void +FrontEnd::switchOut() +{ + switchedOut = true; + memReq = NULL; + squash(0, 0); + instBuffer.clear(); + instBufferSize = 0; + status = Idle; +} + +template +void +FrontEnd::takeOverFrom(ExecContext *old_xc) +{ + assert(freeRegs == numPhysRegs); + fetchCacheLineNextCycle = true; + + cacheBlkValid = false; + +#if !FULL_SYSTEM +// pTable = params->pTable; +#endif + fetchFault = NoFault; + serializeNext = false; + barrierInst = NULL; + status = Running; + switchedOut = false; + 
interruptPending = false; +} + template void FrontEnd::dumpInsts() diff --git a/cpu/ozone/lw_back_end.hh b/cpu/ozone/lw_back_end.hh index f17c93ff4..028fdaf8c 100644 --- a/cpu/ozone/lw_back_end.hh +++ b/cpu/ozone/lw_back_end.hh @@ -17,6 +17,8 @@ #include "mem/mem_req.hh" #include "sim/eventq.hh" +template +class Checker; class ExecContext; template @@ -126,6 +128,8 @@ class LWBackEnd Addr commitPC; + Tick lastCommitCycle; + bool robEmpty() { return instList.empty(); } bool isFull() { return numInsts >= numROBEntries; } @@ -133,7 +137,7 @@ class LWBackEnd void fetchFault(Fault &fault); - int wakeDependents(DynInstPtr &inst); + int wakeDependents(DynInstPtr &inst, bool memory_deps = false); /** Tells memory dependence unit that a memory instruction needs to be * rescheduled. It will re-execute once replayMemInst() is called. @@ -182,6 +186,12 @@ class LWBackEnd void instToCommit(DynInstPtr &inst); + void switchOut(); + + void takeOverFrom(ExecContext *old_xc = NULL); + + bool isSwitchedOut() { return switchedOut; } + private: void generateTrapEvent(Tick latency = 0); void handleFault(Fault &fault, Tick latency = 0); @@ -303,6 +313,10 @@ class LWBackEnd Fault faultFromFetch; bool fetchHasFault; + bool switchedOut; + + DynInstPtr memBarrier; + private: struct pqCompare { bool operator() (const DynInstPtr &lhs, const DynInstPtr &rhs) const @@ -327,7 +341,7 @@ class LWBackEnd bool exactFullStall; - bool fetchRedirect[Impl::MaxThreads]; +// bool fetchRedirect[Impl::MaxThreads]; // number of cycles stalled for D-cache misses /* Stats::Scalar<> dcacheStallCycles; @@ -414,6 +428,8 @@ class LWBackEnd Stats::VectorDistribution<> ROB_occ_dist; public: void dumpInsts(); + + Checker *checker; }; template diff --git a/cpu/ozone/lw_back_end_impl.hh b/cpu/ozone/lw_back_end_impl.hh index d1290239c..d4829629d 100644 --- a/cpu/ozone/lw_back_end_impl.hh +++ b/cpu/ozone/lw_back_end_impl.hh @@ -1,5 +1,6 @@ #include "encumbered/cpu/full/op_class.hh" +#include "cpu/checker/cpu.hh" #include 
"cpu/ozone/lw_back_end.hh" template @@ -10,28 +11,36 @@ LWBackEnd::generateTrapEvent(Tick latency) TrapEvent *trap = new TrapEvent(this); - trap->schedule(curTick + latency); + trap->schedule(curTick + cpu->cycles(latency)); thread->trapPending = true; } template int -LWBackEnd::wakeDependents(DynInstPtr &inst) +LWBackEnd::wakeDependents(DynInstPtr &inst, bool memory_deps) { assert(!inst->isSquashed()); - std::vector &dependents = inst->getDependents(); + std::vector &dependents = memory_deps ? inst->getMemDeps() : + inst->getDependents(); int num_outputs = dependents.size(); DPRINTF(BE, "Waking instruction [sn:%lli] dependents in IQ\n", inst->seqNum); for (int i = 0; i < num_outputs; i++) { DynInstPtr dep_inst = dependents[i]; - dep_inst->markSrcRegReady(); + if (!memory_deps) { + dep_inst->markSrcRegReady(); + } else { + if (!dep_inst->isSquashed()) + dep_inst->markMemInstReady(inst.get()); + } + DPRINTF(BE, "Marking source reg ready [sn:%lli] in IQ\n", dep_inst->seqNum); if (dep_inst->readyToIssue() && dep_inst->isInROB() && - !dep_inst->isNonSpeculative()) { + !dep_inst->isNonSpeculative() && + dep_inst->memDepReady() && !dep_inst->isMemBarrier() && !dep_inst->isWriteBarrier()) { DPRINTF(BE, "Adding instruction to exeList [sn:%lli]\n", dep_inst->seqNum); exeList.push(dep_inst); @@ -114,6 +123,9 @@ LWBackEnd::LdWritebackEvent::process() // iewStage->wakeCPU(); + if (be->isSwitchedOut()) + return; + if (dcacheMiss) { be->removeDcacheMiss(inst); } @@ -169,16 +181,18 @@ LWBackEnd::DCacheCompletionEvent::description() template LWBackEnd::LWBackEnd(Params *params) : d2i(5, 5), i2e(5, 5), e2c(5, 5), numInstsToWB(5, 5), - xcSquash(false), cacheCompletionEvent(this), + trapSquash(false), xcSquash(false), cacheCompletionEvent(this), dcacheInterface(params->dcacheInterface), width(params->backEndWidth), exactFullStall(true) { numROBEntries = params->numROBEntries; numInsts = 0; numDispatchEntries = 32; - maxOutstandingMemOps = 4; + maxOutstandingMemOps = 
params->maxOutstandingMemOps; numWaitingMemOps = 0; waitingInsts = 0; + switchedOut = false; + // IQ.setBE(this); LSQ.setBE(this); @@ -533,6 +547,7 @@ LWBackEnd::setCPU(FullCPU *cpu_ptr) { cpu = cpu_ptr; LSQ.setCPU(cpu_ptr); + checker = cpu->checker; } template @@ -554,30 +569,35 @@ LWBackEnd::checkInterrupts() !cpu->inPalMode(thread->readPC()) && !trapSquash && !xcSquash) { - // Will need to squash all instructions currently in flight and have - // the interrupt handler restart at the last non-committed inst. - // Most of that can be handled through the trap() function. The - // processInterrupts() function really just checks for interrupts - // and then calls trap() if there is an interrupt present. + frontEnd->interruptPending = true; + if (robEmpty() && !LSQ.hasStoresToWB()) { + // Will need to squash all instructions currently in flight and have + // the interrupt handler restart at the last non-committed inst. + // Most of that can be handled through the trap() function. The + // processInterrupts() function really just checks for interrupts + // and then calls trap() if there is an interrupt present. - // Not sure which thread should be the one to interrupt. For now - // always do thread 0. - assert(!thread->inSyscall); - thread->inSyscall = true; + // Not sure which thread should be the one to interrupt. For now + // always do thread 0. + assert(!thread->inSyscall); + thread->inSyscall = true; - // CPU will handle implementation of the interrupt. - cpu->processInterrupts(); + // CPU will handle implementation of the interrupt. + cpu->processInterrupts(); - // Now squash or record that I need to squash this cycle. - commitStatus = TrapPending; + // Now squash or record that I need to squash this cycle. + commitStatus = TrapPending; - // Exit state update mode to avoid accidental updating. - thread->inSyscall = false; + // Exit state update mode to avoid accidental updating. + thread->inSyscall = false; - // Generate trap squash event. 
- generateTrapEvent(); + // Generate trap squash event. + generateTrapEvent(); - DPRINTF(BE, "Interrupt detected.\n"); + DPRINTF(BE, "Interrupt detected.\n"); + } else { + DPRINTF(BE, "Interrupt must wait for ROB to drain.\n"); + } } } @@ -585,7 +605,7 @@ template void LWBackEnd::handleFault(Fault &fault, Tick latency) { - DPRINTF(BE, "Handling fault!"); + DPRINTF(BE, "Handling fault!\n"); assert(!thread->inSyscall); @@ -615,6 +635,9 @@ LWBackEnd::tick() wbCycle = 0; + // Read in any done instruction information and update the IQ or LSQ. + updateStructures(); + #if FULL_SYSTEM checkInterrupts(); @@ -623,7 +646,7 @@ LWBackEnd::tick() squashFromTrap(); } else if (xcSquash) { squashFromXC(); - } else if (fetchHasFault && robEmpty() && frontEnd->isEmpty()) { + } else if (fetchHasFault && robEmpty() && frontEnd->isEmpty() && !LSQ.hasStoresToWB()) { DPRINTF(BE, "ROB and front end empty, handling fetch fault\n"); Fault fetch_fault = frontEnd->getFault(); if (fetch_fault == NoFault) { @@ -636,9 +659,6 @@ LWBackEnd::tick() } #endif - // Read in any done instruction information and update the IQ or LSQ. 
- updateStructures(); - if (dispatchStatus != Blocked) { dispatchInsts(); } else { @@ -719,12 +739,41 @@ LWBackEnd::dispatchInsts() for (int i = 0; i < inst->numDestRegs(); ++i) renameTable[inst->destRegIdx(i)] = inst; - if (inst->readyToIssue() && !inst->isNonSpeculative()) { - DPRINTF(BE, "Instruction [sn:%lli] ready, addding to exeList.\n", - inst->seqNum); - exeList.push(inst); + if (inst->isMemBarrier() || inst->isWriteBarrier()) { + if (memBarrier) { + DPRINTF(BE, "Instruction [sn:%lli] is waiting on " + "barrier [sn:%lli].\n", + inst->seqNum, memBarrier->seqNum); + memBarrier->addMemDependent(inst); + inst->addSrcMemInst(memBarrier); + } + memBarrier = inst; + inst->setCanCommit(); + } else if (inst->readyToIssue() && !inst->isNonSpeculative()) { if (inst->isMemRef()) { + LSQ.insert(inst); + if (memBarrier) { + DPRINTF(BE, "Instruction [sn:%lli] is waiting on " + "barrier [sn:%lli].\n", + inst->seqNum, memBarrier->seqNum); + memBarrier->addMemDependent(inst); + inst->addSrcMemInst(memBarrier); + addWaitingMemOp(inst); + + waitingList.push_front(inst); + inst->iqIt = waitingList.begin(); + inst->iqItValid = true; + waitingInsts++; + } else { + DPRINTF(BE, "Instruction [sn:%lli] ready, addding to exeList.\n", + inst->seqNum); + exeList.push(inst); + } + } else { + DPRINTF(BE, "Instruction [sn:%lli] ready, addding to exeList.\n", + inst->seqNum); + exeList.push(inst); } } else { if (inst->isNonSpeculative()) { @@ -735,6 +784,14 @@ LWBackEnd::dispatchInsts() if (inst->isMemRef()) { addWaitingMemOp(inst); LSQ.insert(inst); + if (memBarrier) { + memBarrier->addMemDependent(inst); + inst->addSrcMemInst(memBarrier); + + DPRINTF(BE, "Instruction [sn:%lli] is waiting on " + "barrier [sn:%lli].\n", + inst->seqNum, memBarrier->seqNum); + } } DPRINTF(BE, "Instruction [sn:%lli] not ready, addding to " @@ -872,9 +929,6 @@ LWBackEnd::executeInsts() ++funcExeInst; ++num_executed; - // keep an instruction count - thread->numInst++; - thread->numInsts++; exeList.pop(); @@ 
-915,7 +969,7 @@ LWBackEnd::instToCommit(DynInstPtr &inst) inst->setCanCommit(); if (inst->isExecuted()) { - inst->setCompleted(); + inst->setResultReady(); int dependents = wakeDependents(inst); if (dependents) { producer_inst[0]++; @@ -956,7 +1010,7 @@ LWBackEnd::writebackInsts() inst->seqNum, inst->readPC()); inst->setCanCommit(); - inst->setCompleted(); + inst->setResultReady(); if (inst->isExecuted()) { int dependents = wakeDependents(inst); @@ -997,7 +1051,9 @@ LWBackEnd::commitInst(int inst_num) // If the instruction is not executed yet, then it is a non-speculative // or store inst. Signal backwards that it should be executed. if (!inst->isExecuted()) { - if (inst->isNonSpeculative()) { + if (inst->isNonSpeculative() || + inst->isMemBarrier() || + inst->isWriteBarrier()) { #if !FULL_SYSTEM // Hack to make sure syscalls aren't executed until all stores // write back their data. This direct communication shouldn't @@ -1017,6 +1073,16 @@ LWBackEnd::commitInst(int inst_num) "instruction at the head of the ROB, PC %#x.\n", inst->readPC()); + if (inst->isMemBarrier() || inst->isWriteBarrier()) { + DPRINTF(BE, "Waking dependents on barrier [sn:%lli]\n", + inst->seqNum); + assert(memBarrier); + wakeDependents(inst, true); + if (memBarrier == inst) + memBarrier = NULL; + inst->clearMemDependents(); + } + // Send back the non-speculative instruction's sequence number. if (inst->iqItValid) { DPRINTF(BE, "Removing instruction from waiting list\n"); @@ -1066,13 +1132,45 @@ LWBackEnd::commitInst(int inst_num) // Not handled for now. assert(!inst->isThreadSync()); - + assert(inst->memDepReady()); + // Stores will mark themselves as totally completed as they need + // to wait to writeback to memory. @todo: Hack...attempt to fix + // having the checker be forced to wait until a store completes in + // order to check all of the instructions. 
If the store at the + // head of the check list misses, but a later store hits, then + // loads in the checker may see the younger store values instead + // of the store they should see. Either the checker needs its own + // memory (annoying to update), its own store buffer (how to tell + // which value is correct?), or something else... + if (!inst->isStore()) { + inst->setCompleted(); + } // Check if the instruction caused a fault. If so, trap. Fault inst_fault = inst->getFault(); + // Use checker prior to updating anything due to traps or PC + // based events. + if (checker) { + checker->tick(inst); + } + if (inst_fault != NoFault) { DPRINTF(BE, "Inst [sn:%lli] PC %#x has a fault\n", inst->seqNum, inst->readPC()); + + // Instruction is completed as it has a fault. + inst->setCompleted(); + + if (LSQ.hasStoresToWB()) { + DPRINTF(BE, "Stores still in flight, will wait until drained.\n"); + return false; + } else if (inst_num != 0) { + DPRINTF(BE, "Will wait until instruction is head of commit group.\n"); + return false; + } else if (checker && inst->isStore()) { + checker->tick(inst); + } + thread->setInst( static_cast(inst->staticInst->machInst)); #if FULL_SYSTEM @@ -1094,6 +1192,8 @@ LWBackEnd::commitInst(int inst_num) } if (inst->traceData) { + inst->traceData->setFetchSeq(inst->seqNum); + inst->traceData->setCPSeq(thread->numInst); inst->traceData->finalize(); inst->traceData = NULL; } @@ -1105,18 +1205,18 @@ LWBackEnd::commitInst(int inst_num) instList.pop_back(); --numInsts; - thread->numInsts++; ++thread->funcExeInst; - // Maybe move this to where teh fault is handled; if the fault is handled, + // Maybe move this to where the fault is handled; if the fault is handled, // don't try to set this myself as the fault will set it. If not, then // I set thread->PC = thread->nextPC and thread->nextPC = thread->nextPC + 4. 
thread->setPC(thread->readNextPC()); + thread->setNextPC(thread->readNextPC() + sizeof(TheISA::MachInst)); updateComInstStats(inst); // Write the done sequence number here. // LSQ.commitLoads(inst->seqNum); -// LSQ.commitStores(inst->seqNum); toIEW->doneSeqNum = inst->seqNum; + lastCommitCycle = curTick; #if FULL_SYSTEM int count = 0; @@ -1243,6 +1343,22 @@ LWBackEnd::squash(const InstSeqNum &sn) waitingInsts--; } + while (memBarrier && memBarrier->seqNum > sn) { + DPRINTF(BE, "[sn:%lli] Memory barrier squashed (or previously squashed)\n", memBarrier->seqNum); + memBarrier->clearMemDependents(); + if (memBarrier->memDepReady()) { + DPRINTF(BE, "No previous barrier\n"); + memBarrier = NULL; + } else { + std::list &srcs = memBarrier->getMemSrcs(); + memBarrier = srcs.front(); + srcs.pop_front(); + assert(srcs.empty()); + DPRINTF(BE, "Previous barrier: [sn:%lli]\n", + memBarrier->seqNum); + } + } + frontEnd->addFreeRegs(freed_regs); } @@ -1254,6 +1370,7 @@ LWBackEnd::squashFromXC() squash(squashed_inst); frontEnd->squash(squashed_inst, thread->readPC(), false, false); + frontEnd->interruptPending = false; thread->trapPending = false; thread->inSyscall = false; @@ -1269,6 +1386,7 @@ LWBackEnd::squashFromTrap() squash(squashed_inst); frontEnd->squash(squashed_inst, thread->readPC(), false, false); + frontEnd->interruptPending = false; thread->trapPending = false; thread->inSyscall = false; @@ -1319,6 +1437,36 @@ LWBackEnd::fetchFault(Fault &fault) fetchHasFault = true; } +template +void +LWBackEnd::switchOut() +{ + switchedOut = true; + // Need to get rid of all committed, non-speculative state and write it + // to memory/XC. In this case this is stores that have committed and not + // yet written back. 
+ LSQ.switchOut(); + squash(0); +} + +template +void +LWBackEnd::takeOverFrom(ExecContext *old_xc) +{ + switchedOut = false; + xcSquash = false; + trapSquash = false; + + numInsts = 0; + numWaitingMemOps = 0; + waitingMemOps.clear(); + waitingInsts = 0; + switchedOut = false; + dispatchStatus = Running; + commitStatus = Running; + LSQ.takeOverFrom(old_xc); +} + template void LWBackEnd::updateExeInstStats(DynInstPtr &inst) @@ -1358,7 +1506,11 @@ template void LWBackEnd::updateComInstStats(DynInstPtr &inst) { - unsigned thread = inst->threadNumber; + unsigned tid = inst->threadNumber; + + // keep an instruction count + thread->numInst++; + thread->numInsts++; cpu->numInst++; // @@ -1366,33 +1518,33 @@ LWBackEnd::updateComInstStats(DynInstPtr &inst) // #ifdef TARGET_ALPHA if (inst->isDataPrefetch()) { - stat_com_swp[thread]++; + stat_com_swp[tid]++; } else { - stat_com_inst[thread]++; + stat_com_inst[tid]++; } #else - stat_com_inst[thread]++; + stat_com_inst[tid]++; #endif // // Control Instructions // if (inst->isControl()) - stat_com_branches[thread]++; + stat_com_branches[tid]++; // // Memory references // if (inst->isMemRef()) { - stat_com_refs[thread]++; + stat_com_refs[tid]++; if (inst->isLoad()) { - stat_com_loads[thread]++; + stat_com_loads[tid]++; } } if (inst->isMemBarrier()) { - stat_com_membars[thread]++; + stat_com_membars[tid]++; } } diff --git a/cpu/ozone/lw_lsq.hh b/cpu/ozone/lw_lsq.hh index eb9886244..042610324 100644 --- a/cpu/ozone/lw_lsq.hh +++ b/cpu/ozone/lw_lsq.hh @@ -41,6 +41,7 @@ #include "cpu/inst_seq.hh" #include "mem/mem_interface.hh" //#include "mem/page_table.hh" +#include "sim/debug.hh" #include "sim/sim_object.hh" //class PageTable; @@ -90,7 +91,10 @@ class OzoneLWLSQ { /** The writeback event for the store. Needed for store * conditionals. */ + public: Event *wbEvent; + bool miss; + private: /** The pointer to the LSQ unit that issued the store. 
*/ OzoneLWLSQ *lsqPtr; }; @@ -228,6 +232,14 @@ class OzoneLWLSQ { !storeQueue.back().completed && !dcacheInterface->isBlocked(); } + void switchOut(); + + void takeOverFrom(ExecContext *old_xc = NULL); + + bool isSwitchedOut() { return switchedOut; } + + bool switchedOut; + private: /** Completes the store at the specified index. */ void completeStore(int store_idx); @@ -560,12 +572,10 @@ OzoneLWLSQ::read(MemReqPtr &req, T &data, int load_idx) sq_it++; } - // If there's no forwarding case, then go access memory DPRINTF(OzoneLSQ, "Doing functional access for inst PC %#x\n", inst->readPC()); - // Setup MemReq pointer req->cmd = Read; req->completionEvent = NULL; @@ -594,8 +604,12 @@ OzoneLWLSQ::read(MemReqPtr &req, T &data, int load_idx) DPRINTF(OzoneLSQ, "D-cache: PC:%#x reading from paddr:%#x " "vaddr:%#x flags:%i\n", inst->readPC(), req->paddr, req->vaddr, req->flags); - - +/* + Addr debug_addr = ULL(0xfffffc0000be81a8); + if (req->vaddr == debug_addr) { + debug_break(); + } +*/ assert(!req->completionEvent); req->completionEvent = new typename BackEnd::LdWritebackEvent(inst, be); @@ -647,7 +661,15 @@ OzoneLWLSQ::write(MemReqPtr &req, T &data, int store_idx) (*sq_it).req = req; (*sq_it).size = sizeof(T); (*sq_it).data = data; - + assert(!req->data); + req->data = new uint8_t[64]; + memcpy(req->data, (uint8_t *)&(*sq_it).data, req->size); +/* + Addr debug_addr = ULL(0xfffffc0000be81a8); + if (req->vaddr == debug_addr) { + debug_break(); + } +*/ // This function only writes the data to the store queue, so no fault // can happen here. 
return NoFault; diff --git a/cpu/ozone/lw_lsq_impl.hh b/cpu/ozone/lw_lsq_impl.hh index 7b22d2564..9b7e48f96 100644 --- a/cpu/ozone/lw_lsq_impl.hh +++ b/cpu/ozone/lw_lsq_impl.hh @@ -29,6 +29,7 @@ #include "arch/isa_traits.hh" #include "base/str.hh" #include "cpu/ozone/lw_lsq.hh" +#include "cpu/checker/cpu.hh" template OzoneLWLSQ::StoreCompletionEvent::StoreCompletionEvent(DynInstPtr &_inst, @@ -39,6 +40,7 @@ OzoneLWLSQ::StoreCompletionEvent::StoreCompletionEvent(DynInstPtr &_inst, inst(_inst), be(_be), wbEvent(wb_event), + miss(false), lsqPtr(lsq_ptr) { this->setFlags(Event::AutoDelete); @@ -54,13 +56,21 @@ OzoneLWLSQ::StoreCompletionEvent::process() //lsqPtr->removeMSHR(lsqPtr->storeQueue[storeIdx].inst->seqNum); // lsqPtr->cpu->wakeCPU(); + if (lsqPtr->isSwitchedOut()) { + if (wbEvent) + delete wbEvent; + + return; + } + if (wbEvent) { wbEvent->process(); delete wbEvent; } lsqPtr->completeStore(inst->sqIdx); - be->removeDcacheMiss(inst); + if (miss) + be->removeDcacheMiss(inst); } template @@ -80,8 +90,7 @@ OzoneLWLSQ::OzoneLWLSQ() template void OzoneLWLSQ::init(Params *params, unsigned maxLQEntries, - unsigned maxSQEntries, unsigned id) - + unsigned maxSQEntries, unsigned id) { DPRINTF(OzoneLSQ, "Creating OzoneLWLSQ%i object.\n",id); @@ -90,7 +99,7 @@ OzoneLWLSQ::init(Params *params, unsigned maxLQEntries, LQEntries = maxLQEntries; SQEntries = maxSQEntries; - for (int i = 0; i < LQEntries * 10; i++) { + for (int i = 0; i < LQEntries * 2; i++) { LQIndices.push(i); SQIndices.push(i); } @@ -196,6 +205,7 @@ template void OzoneLWLSQ::insertLoad(DynInstPtr &load_inst) { + assert(loads < LQEntries * 2); assert(!LQIndices.empty()); int load_index = LQIndices.front(); LQIndices.pop(); @@ -503,21 +513,13 @@ OzoneLWLSQ::writebackStores() assert((*sq_it).req); assert(!(*sq_it).committed); - MemReqPtr req = (*sq_it).req; (*sq_it).committed = true; + MemReqPtr req = (*sq_it).req; + req->cmd = Write; req->completionEvent = NULL; req->time = curTick; - assert(!req->data); - 
req->data = new uint8_t[64]; - memcpy(req->data, (uint8_t *)&(*sq_it).data, req->size); - - DPRINTF(OzoneLSQ, "D-Cache: Writing back store idx:%i PC:%#x " - "to Addr:%#x, data:%#x [sn:%lli]\n", - inst->sqIdx,inst->readPC(), - req->paddr, *(req->data), - inst->seqNum); switch((*sq_it).size) { case 1: @@ -535,8 +537,25 @@ OzoneLWLSQ::writebackStores() default: panic("Unexpected store size!\n"); } + if (!(req->flags & LOCKED)) { + (*sq_it).inst->setCompleted(); + if (cpu->checker) { + cpu->checker->tick((*sq_it).inst); + } + } + + DPRINTF(OzoneLSQ, "D-Cache: Writing back store idx:%i PC:%#x " + "to Addr:%#x, data:%#x [sn:%lli]\n", + inst->sqIdx,inst->readPC(), + req->paddr, *(req->data), + inst->seqNum); if (dcacheInterface) { + assert(!req->completionEvent); + StoreCompletionEvent *store_event = new + StoreCompletionEvent(inst, be, NULL, this); + req->completionEvent = store_event; + MemAccessResult result = dcacheInterface->access(req); if (isStalled() && @@ -551,13 +570,14 @@ OzoneLWLSQ::writebackStores() if (result != MA_HIT && dcacheInterface->doEvents()) { // Event *wb = NULL; - + store_event->miss = true; typename BackEnd::LdWritebackEvent *wb = NULL; if (req->flags & LOCKED) { // Stx_C does not generate a system port transaction. // req->result=1; wb = new typename BackEnd::LdWritebackEvent(inst, be); + store_event->wbEvent = wb; } DPRINTF(OzoneLSQ,"D-Cache Write Miss!\n"); @@ -567,9 +587,6 @@ OzoneLWLSQ::writebackStores() // Will stores need their own kind of writeback events? // Do stores even need writeback events? 
- assert(!req->completionEvent); - req->completionEvent = new - StoreCompletionEvent(inst, be, wb, this); be->addDcacheMiss(inst); lastDcacheStall = curTick; @@ -597,10 +614,10 @@ OzoneLWLSQ::writebackStores() typename BackEnd::LdWritebackEvent *wb = new typename BackEnd::LdWritebackEvent(inst, be); - wb->schedule(curTick); + store_event->wbEvent = wb; } sq_it--; - completeStore(inst->sqIdx); +// completeStore(inst->sqIdx); } } else { panic("Must HAVE DCACHE!!!!!\n"); @@ -758,31 +775,121 @@ OzoneLWLSQ::completeStore(int store_idx) DPRINTF(OzoneLSQ, "Completing store idx:%i [sn:%lli], storesToWB:%i\n", inst->sqIdx, inst->seqNum, storesToWB); - // A bit conservative because a store completion may not free up entries, - // but hopefully avoids two store completions in one cycle from making - // the CPU tick twice. -// cpu->activityThisCycle(); assert(!storeQueue.empty()); SQItHash.erase(sq_hash_it); SQIndices.push(inst->sqIdx); storeQueue.erase(sq_it); --stores; -/* - SQIt oldest_store_it = --(storeQueue.end()); - if (sq_it == oldest_store_it) { - do { - inst = (*oldest_store_it).inst; - sq_hash_it = SQItHash.find(inst->sqIdx); - assert(sq_hash_it != SQItHash.end()); - SQItHash.erase(sq_hash_it); - SQIndices.push(inst->sqIdx); - storeQueue.erase(oldest_store_it--); - - --stores; - } while ((*oldest_store_it).completed && - oldest_store_it != storeQueue.end()); - -// be->updateLSQNextCycle = true; +// assert(!inst->isCompleted()); + inst->setCompleted(); + if (cpu->checker) { + cpu->checker->tick(inst); } -*/ +} + +template +void +OzoneLWLSQ::switchOut() +{ + switchedOut = true; + SQIt sq_it = --(storeQueue.end()); + while (storesToWB > 0 && + sq_it != storeQueue.end() && + (*sq_it).inst && + (*sq_it).canWB) { + + DynInstPtr inst = (*sq_it).inst; + + if ((*sq_it).size == 0 && !(*sq_it).completed) { + sq_it--; +// completeStore(inst->sqIdx); + + continue; + } + + // Store conditionals don't complete until *after* they have written + // back. 
If it's here and not yet sent to memory, then don't bother + // as it's not part of committed state. + if (inst->isDataPrefetch() || (*sq_it).committed || + (*sq_it).req->flags & LOCKED) { + sq_it--; + continue; + } + + assert((*sq_it).req); + assert(!(*sq_it).committed); + + MemReqPtr req = (*sq_it).req; + (*sq_it).committed = true; + + req->cmd = Write; + req->completionEvent = NULL; + req->time = curTick; + assert(!req->data); + req->data = new uint8_t[64]; + memcpy(req->data, (uint8_t *)&(*sq_it).data, req->size); + + DPRINTF(OzoneLSQ, "Switching out : Writing back store idx:%i PC:%#x " + "to Addr:%#x, data:%#x directly to memory [sn:%lli]\n", + inst->sqIdx,inst->readPC(), + req->paddr, *(req->data), + inst->seqNum); + + switch((*sq_it).size) { + case 1: + cpu->write(req, (uint8_t &)(*sq_it).data); + break; + case 2: + cpu->write(req, (uint16_t &)(*sq_it).data); + break; + case 4: + cpu->write(req, (uint32_t &)(*sq_it).data); + break; + case 8: + cpu->write(req, (uint64_t &)(*sq_it).data); + break; + default: + panic("Unexpected store size!\n"); + } + } + + // Clear the queue to free up resources + storeQueue.clear(); + loadQueue.clear(); + loads = stores = storesToWB = 0; +} + +template +void +OzoneLWLSQ::takeOverFrom(ExecContext *old_xc) +{ + // Clear out any old state. May be redundant if this is the first time + // the CPU is being used. 
+ stalled = false; + isLoadBlocked = false; + loadBlockedHandled = false; + switchedOut = false; + + // Could do simple checks here to see if indices are on twice + while (!LQIndices.empty()) + LQIndices.pop(); + while (!SQIndices.empty()) + SQIndices.pop(); + + for (int i = 0; i < LQEntries * 2; i++) { + LQIndices.push(i); + SQIndices.push(i); + } + + // May want to initialize these entries to NULL + +// loadHead = loadTail = 0; + +// storeHead = storeWBIdx = storeTail = 0; + + usedPorts = 0; + + loadFaultInst = storeFaultInst = memDepViolator = NULL; + + blockedLoadSeqNum = 0; } diff --git a/cpu/ozone/simple_params.hh b/cpu/ozone/simple_params.hh index e503654aa..647da1781 100644 --- a/cpu/ozone/simple_params.hh +++ b/cpu/ozone/simple_params.hh @@ -51,6 +51,7 @@ class SimpleParams : public BaseCPU::Params unsigned backEndLatency; unsigned maxInstBufferSize; unsigned numPhysicalRegs; + unsigned maxOutstandingMemOps; // // Fetch // diff --git a/python/m5/objects/OzoneCPU.py b/python/m5/objects/OzoneCPU.py index 8186a44bb..3fca61e28 100644 --- a/python/m5/objects/OzoneCPU.py +++ b/python/m5/objects/OzoneCPU.py @@ -9,12 +9,15 @@ class DerivOzoneCPU(BaseCPU): if not build_env['FULL_SYSTEM']: mem = Param.FunctionalMemory(NULL, "memory") + checker = Param.BaseCPU("Checker CPU") + width = Param.Unsigned("Width") frontEndWidth = Param.Unsigned("Front end width") backEndWidth = Param.Unsigned("Back end width") backEndSquashLatency = Param.Unsigned("Back end squash latency") backEndLatency = Param.Unsigned("Back end latency") maxInstBufferSize = Param.Unsigned("Maximum instruction buffer size") + maxOutstandingMemOps = Param.Unsigned("Maximum number of outstanding memory operations") decodeToFetchDelay = Param.Unsigned("Decode to fetch delay") renameToFetchDelay = Param.Unsigned("Rename to fetch delay") iewToFetchDelay = Param.Unsigned("Issue/Execute/Writeback to fetch " From bfa9cc2c3a7aa9003c145e6bda750edf18a01ea8 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Tue, 16 May 
2006 13:48:05 -0400 Subject: [PATCH 24/50] Add some flags for the upcoming checker. arch/alpha/isa/decoder.isa: Mark store conditionals as serializing. This is slightly higher over head than they truly have in the 264, but it's close. Normally they block any other instructions from entering the IQ until the IQ is empty. This is higher overhead because it waits until the ROB is empty. Also mark RPCC as unverifiable. The checker will just grab the value from the instruction and assume it's correct. cpu/static_inst.hh: Add unverifiable flag, specifically for the CheckerCPU. --HG-- extra : convert_revision : cbc34d1f2f5b07105d31d4bd8f19edae2cf8158e --- arch/alpha/isa/decoder.isa | 10 +++++++--- cpu/static_inst.hh | 4 ++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/alpha/isa/decoder.isa b/arch/alpha/isa/decoder.isa index 905ace4e1..ac9f9fc4c 100644 --- a/arch/alpha/isa/decoder.isa +++ b/arch/alpha/isa/decoder.isa @@ -73,7 +73,9 @@ decode OPCODE default Unknown::unknown() { uint64_t tmp = write_result; // see stq_c Ra = (tmp == 0 || tmp == 1) ? tmp : Ra; - }}, mem_flags = LOCKED, inst_flags = IsNonSpeculative); + }}, mem_flags = LOCKED, inst_flags = [IsNonSpeculative, + IsSerializing, + IsSerializeAfter]); 0x2f: stq_c({{ Mem.uq = Ra; }}, {{ uint64_t tmp = write_result; @@ -85,7 +87,9 @@ decode OPCODE default Unknown::unknown() { // mailbox access, and we don't update the // result register at all. Ra = (tmp == 0 || tmp == 1) ? tmp : Ra; - }}, mem_flags = LOCKED, inst_flags = IsNonSpeculative); + }}, mem_flags = LOCKED, inst_flags = [IsNonSpeculative, + IsSerializing, + IsSerializeAfter]); } format IntegerOperate { @@ -623,7 +627,7 @@ decode OPCODE default Unknown::unknown() { #else Ra = curTick; #endif - }}, IsNonSpeculative); + }}, IsUnverifiable); // All of the barrier instructions below do nothing in // their execute() methods (hence the empty code blocks). 
diff --git a/cpu/static_inst.hh b/cpu/static_inst.hh index 550609ed7..0b8fe2f18 100644 --- a/cpu/static_inst.hh +++ b/cpu/static_inst.hh @@ -51,6 +51,7 @@ class AlphaDynInst; template class OzoneDynInst; +class CheckerCPU; class FastCPU; class SimpleCPU; class InorderCPU; @@ -128,6 +129,8 @@ class StaticInstBase : public RefCounted IsNonSpeculative, ///< Should not be executed speculatively IsQuiesce, + IsUnverifiable, + NumFlags }; @@ -215,6 +218,7 @@ class StaticInstBase : public RefCounted bool isWriteBarrier() const { return flags[IsWriteBarrier]; } bool isNonSpeculative() const { return flags[IsNonSpeculative]; } bool isQuiesce() const { return flags[IsQuiesce]; } + bool isUnverifiable() const { return flags[IsUnverifiable]; } //@} /// Operation class. Used to select appropriate function unit in issue. From 989cc1735eb18f9894f91854acd28e9477fa3b60 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Tue, 16 May 2006 13:51:18 -0400 Subject: [PATCH 25/50] Sampling fixes related to the quiesce event. cpu/cpu_exec_context.cc: cpu/cpu_exec_context.hh: Sampling fixes. The CPU models may switch during a quiesce period, so it needs to be sure to wake up the right XC. cpu/exec_context.hh: Return the EndQuiesceEvent specifically. sim/pseudo_inst.cc: Return the EndQuiesceEvent specifically for sampling. 
--HG-- extra : convert_revision : f9aa1fc8d4db8058f05319cb6a3d4605ce93b4c8 --- cpu/cpu_exec_context.cc | 10 ++++++++++ cpu/cpu_exec_context.hh | 4 ++-- cpu/exec_context.hh | 5 +++-- sim/pseudo_inst.cc | 5 +++-- 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/cpu/cpu_exec_context.cc b/cpu/cpu_exec_context.cc index e15ba7e66..3d047856a 100644 --- a/cpu/cpu_exec_context.cc +++ b/cpu/cpu_exec_context.cc @@ -159,6 +159,16 @@ CPUExecContext::takeOverFrom(ExecContext *oldContext) func_exe_inst = oldContext->readFuncExeInst(); #endif + EndQuiesceEvent *quiesce = oldContext->getQuiesceEvent(); + if (quiesce) { + // Point the quiesce event's XC at this XC so that it wakes up + // the proper CPU. + quiesce->xc = proxy; + } + if (quiesceEvent) { + quiesceEvent->xc = proxy; + } + storeCondFailures = 0; oldContext->setStatus(ExecContext::Unallocated); diff --git a/cpu/cpu_exec_context.hh b/cpu/cpu_exec_context.hh index 40153ff08..cac006925 100644 --- a/cpu/cpu_exec_context.hh +++ b/cpu/cpu_exec_context.hh @@ -135,9 +135,9 @@ class CPUExecContext Addr profilePC; void dumpFuncProfile(); - Event *quiesceEvent; + EndQuiesceEvent *quiesceEvent; - Event *getQuiesceEvent() { return quiesceEvent; } + EndQuiesceEvent *getQuiesceEvent() { return quiesceEvent; } Tick readLastActivate() { return lastActivate; } diff --git a/cpu/exec_context.hh b/cpu/exec_context.hh index 039b04527..7bd7d5682 100644 --- a/cpu/exec_context.hh +++ b/cpu/exec_context.hh @@ -42,6 +42,7 @@ class AlphaDTB; class AlphaITB; class BaseCPU; +class EndQuiesceEvent; class Event; class FunctionalMemory; class PhysicalMemory; @@ -130,7 +131,7 @@ class ExecContext virtual void unserialize(Checkpoint *cp, const std::string §ion) = 0; #if FULL_SYSTEM - virtual Event *getQuiesceEvent() = 0; + virtual EndQuiesceEvent *getQuiesceEvent() = 0; // Not necessarily the best location for these... 
// Having an extra function just to read these is obnoxious @@ -277,7 +278,7 @@ class ProxyExecContext : public ExecContext { actualXC->unserialize(cp, section); } #if FULL_SYSTEM - Event *getQuiesceEvent() { return actualXC->getQuiesceEvent(); } + EndQuiesceEvent *getQuiesceEvent() { return actualXC->getQuiesceEvent(); } Tick readLastActivate() { return actualXC->readLastActivate(); } Tick readLastSuspend() { return actualXC->readLastSuspend(); } diff --git a/sim/pseudo_inst.cc b/sim/pseudo_inst.cc index e475006e7..4d9541b58 100644 --- a/sim/pseudo_inst.cc +++ b/sim/pseudo_inst.cc @@ -38,6 +38,7 @@ #include "cpu/base.hh" #include "cpu/sampler/sampler.hh" #include "cpu/exec_context.hh" +#include "cpu/quiesce_event.hh" #include "kern/kernel_stats.hh" #include "sim/param.hh" #include "sim/serialize.hh" @@ -83,7 +84,7 @@ namespace AlphaPseudo if (!doQuiesce || ns == 0) return; - Event *quiesceEvent = xc->getQuiesceEvent(); + EndQuiesceEvent *quiesceEvent = xc->getQuiesceEvent(); if (quiesceEvent->scheduled()) quiesceEvent->reschedule(curTick + Clock::Int::ns * ns); @@ -100,7 +101,7 @@ namespace AlphaPseudo if (!doQuiesce || cycles == 0) return; - Event *quiesceEvent = xc->getQuiesceEvent(); + EndQuiesceEvent *quiesceEvent = xc->getQuiesceEvent(); if (quiesceEvent->scheduled()) quiesceEvent->reschedule(curTick + From c23b23f4e7f9f0faec555cb282c899b77223a110 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Tue, 16 May 2006 13:59:29 -0400 Subject: [PATCH 27/50] Add in checker. Supports dynamically verifying the execution of instructions, as well as limited amount of control path verification. It will verify anything within the program, but anything external (traps, interrupts, XC) it assumes is redirected properly by the CPU. Similarly it assumes the results of store conditionals, uncached loads, and instructions marked as "unverifiable" are correct from the CPU. base/traceflags.py: build/SConstruct: cpu/SConscript: cpu/cpu_models.py: Add in Checker. 
cpu/base.cc: Add in checker support. Also XC status starts off as suspended. cpu/base.hh: Add in checker. --HG-- extra : convert_revision : 091b5cc83e837858adb681ef0137a0beb30bd1b2 --- base/traceflags.py | 3 +- build/SConstruct | 2 +- cpu/SConscript | 7 + cpu/base.cc | 10 +- cpu/base.hh | 2 + cpu/checker/cpu.cc | 857 ++++++++++++++++++++++++++++++++++ cpu/checker/cpu.hh | 336 +++++++++++++ cpu/checker/cpu_builder.cc | 126 +++++ cpu/checker/exec_context.hh | 225 +++++++++ cpu/checker/o3_cpu_builder.cc | 126 +++++ cpu/cpu_models.py | 3 + 11 files changed, 1692 insertions(+), 5 deletions(-) create mode 100644 cpu/checker/cpu.cc create mode 100644 cpu/checker/cpu.hh create mode 100644 cpu/checker/cpu_builder.cc create mode 100644 cpu/checker/exec_context.hh create mode 100644 cpu/checker/o3_cpu_builder.cc diff --git a/base/traceflags.py b/base/traceflags.py index bd0f258a0..47ed59c3a 100644 --- a/base/traceflags.py +++ b/base/traceflags.py @@ -150,7 +150,8 @@ baseFlags = [ 'DependGraph', 'Activity', 'Scoreboard', - 'Writeback' + 'Writeback', + 'Checker' ] # diff --git a/build/SConstruct b/build/SConstruct index c40f59bc2..110a0f250 100644 --- a/build/SConstruct +++ b/build/SConstruct @@ -223,7 +223,7 @@ env['ALL_ISA_LIST'] = ['alpha', 'sparc', 'mips'] # Define the universe of supported CPU models env['ALL_CPU_LIST'] = ['SimpleCPU', 'FastCPU', 'FullCPU', 'AlphaFullCPU', - 'OzoneSimpleCPU', 'OzoneCPU'] + 'OzoneSimpleCPU', 'OzoneCPU', 'CheckerCPU'] # Sticky options get saved in the options file so they persist from diff --git a/cpu/SConscript b/cpu/SConscript index c34971acf..5d727bd25 100644 --- a/cpu/SConscript +++ b/cpu/SConscript @@ -150,6 +150,13 @@ if 'OzoneCPU' in env['CPU_MODELS']: ozone/lw_lsq.cc ''') +if 'CheckerCPU' in env['CPU_MODELS']: + sources += Split(''' + checker/cpu.cc + checker/cpu_builder.cc + checker/o3_cpu_builder.cc + ''') + # FullCPU sources are included from m5/SConscript since they're not # below this point in the file hierarchy. 
diff --git a/cpu/base.cc b/cpu/base.cc index 2eb5f7fd3..74b679d5d 100644 --- a/cpu/base.cc +++ b/cpu/base.cc @@ -164,6 +164,7 @@ BaseCPU::Params::Params() #if FULL_SYSTEM profile = false; #endif + checker = NULL; } void @@ -229,15 +230,18 @@ BaseCPU::registerExecContexts() { for (int i = 0; i < execContexts.size(); ++i) { ExecContext *xc = execContexts[i]; + + if (xc->status() == ExecContext::Suspended) { #if FULL_SYSTEM - int id = params->cpu_id; - if (id != -1) - id += i; + int id = params->cpu_id; + if (id != -1) + id += i; xc->setCpuId(system->registerExecContext(xc, id)); #else xc->setCpuId(xc->getProcessPtr()->registerExecContext(xc)); #endif + } } } diff --git a/cpu/base.hh b/cpu/base.hh index d9d5d2b88..20166d7ee 100644 --- a/cpu/base.hh +++ b/cpu/base.hh @@ -44,6 +44,7 @@ namespace Kernel { class Statistics; } #endif class BranchPred; +class CheckerCPU; class ExecContext; class BaseCPU : public SimObject @@ -128,6 +129,7 @@ class BaseCPU : public SimObject int cpu_id; Tick profile; #endif + BaseCPU *checker; Params(); }; diff --git a/cpu/checker/cpu.cc b/cpu/checker/cpu.cc new file mode 100644 index 000000000..f1b43f601 --- /dev/null +++ b/cpu/checker/cpu.cc @@ -0,0 +1,857 @@ +/* + * Copyright (c) 2002-2005 The Regents of The University of Michigan + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +//#include +#include +//#include +#include +#include +#include +//#include +#include + +//#include "base/cprintf.hh" +//#include "base/inifile.hh" +//#include "base/loader/symtab.hh" +#include "base/misc.hh" +//#include "base/pollevent.hh" +//#include "base/range.hh" +#include "base/refcnt.hh" +//#include "base/stats/events.hh" +#include "cpu/base.hh" +#include "cpu/base_dyn_inst.hh" +#include "cpu/checker/cpu.hh" +#include "cpu/cpu_exec_context.hh" +#include "cpu/exec_context.hh" +//#include "cpu/exetrace.hh" +//#include "cpu/profile.hh" +#include "cpu/sampler/sampler.hh" +//#include "cpu/smt.hh" +#include "cpu/static_inst.hh" +//#include "kern/kernel_stats.hh" +#include "mem/base_mem.hh" +#include "mem/mem_interface.hh" +#include "sim/byteswap.hh" +#include "sim/builder.hh" +//#include "sim/debug.hh" +//#include "sim/host.hh" +//#include "sim/sim_events.hh" +#include "sim/sim_object.hh" +#include "sim/stats.hh" + +#include "cpu/o3/alpha_dyn_inst.hh" +#include "cpu/o3/alpha_impl.hh" + +#include "cpu/ozone/dyn_inst.hh" +#include "cpu/ozone/ozone_impl.hh" +#include "cpu/ozone/simple_impl.hh" + +#if FULL_SYSTEM +#include "base/remote_gdb.hh" +#include "mem/functional/memory_control.hh" +#include "mem/functional/physical.hh" +#include "sim/system.hh" +#include "arch/tlb.hh" +#include "arch/stacktrace.hh" +#include "arch/vtophys.hh" +#else // !FULL_SYSTEM +#include "mem/functional/functional.hh" +#endif // FULL_SYSTEM + +using namespace std; +//The CheckerCPU does alpha only +using namespace AlphaISA; + +void +CheckerCPU::init() +{ +/* + BaseCPU::init(); +#if FULL_SYSTEM + for (int i = 0; i < execContexts.size(); ++i) { + ExecContext *xc = execContexts[i]; + + // initialize CPU, including PC + TheISA::initCPU(xc, xc->readCpuId()); + } +#endif +*/ +} + +CheckerCPU::CheckerCPU(Params *p) + : BaseCPU(p), cpuXC(NULL), xcProxy(NULL) +{ + memReq = new MemReq(); + memReq->xc = xcProxy; + memReq->asid = 0; + memReq->data = new uint8_t[64]; + + numInst = 0; + 
startNumInst = 0; + numLoad = 0; + startNumLoad = 0; + youngestSN = 0; + + changedPC = willChangePC = changedNextPC = false; + + exitOnError = p->exitOnError; +#if FULL_SYSTEM + itb = p->itb; + dtb = p->dtb; + systemPtr = NULL; + memPtr = NULL; +#endif +} + +CheckerCPU::~CheckerCPU() +{ +} + +void +CheckerCPU::setMemory(FunctionalMemory *mem) +{ + memPtr = mem; +#if !FULL_SYSTEM + cpuXC = new CPUExecContext(this, /* thread_num */ 0, mem, + /* asid */ 0); + + cpuXC->setStatus(ExecContext::Suspended); + xcProxy = cpuXC->getProxy(); + execContexts.push_back(xcProxy); +#else + if (systemPtr) { + cpuXC = new CPUExecContext(this, 0, systemPtr, itb, dtb, memPtr); + + cpuXC->setStatus(ExecContext::Suspended); + xcProxy = cpuXC->getProxy(); + execContexts.push_back(xcProxy); + memReq->xc = xcProxy; + } +#endif +} + +#if FULL_SYSTEM +void +CheckerCPU::setSystem(System *system) +{ + systemPtr = system; + + if (memPtr) { + cpuXC = new CPUExecContext(this, 0, systemPtr, itb, dtb, memPtr); + + cpuXC->setStatus(ExecContext::Suspended); + xcProxy = cpuXC->getProxy(); + execContexts.push_back(xcProxy); + memReq->xc = xcProxy; + } +} +#endif + +void +CheckerCPU::serialize(ostream &os) +{ +/* + BaseCPU::serialize(os); + SERIALIZE_SCALAR(inst); + nameOut(os, csprintf("%s.xc", name())); + cpuXC->serialize(os); + cacheCompletionEvent.serialize(os); +*/ +} + +void +CheckerCPU::unserialize(Checkpoint *cp, const string §ion) +{ +/* + BaseCPU::unserialize(cp, section); + UNSERIALIZE_SCALAR(inst); + cpuXC->unserialize(cp, csprintf("%s.xc", section)); +*/ +} + +Fault +CheckerCPU::copySrcTranslate(Addr src) +{ + static bool no_warn = true; + int blk_size = 64; + // Only support block sizes of 64 atm. 
+ assert(blk_size == 64); + int offset = src & (blk_size - 1); + + // Make sure block doesn't span page + if (no_warn && + (src & PageMask) != ((src + blk_size) & PageMask) && + (src >> 40) != 0xfffffc) { + warn("Copied block source spans pages %x.", src); + no_warn = false; + } + + memReq->reset(src & ~(blk_size - 1), blk_size); + + // translate to physical address + Fault fault = cpuXC->translateDataReadReq(memReq); + + if (fault == NoFault) { + cpuXC->copySrcAddr = src; + cpuXC->copySrcPhysAddr = memReq->paddr + offset; + } else { + assert(!fault->isAlignmentFault()); + + cpuXC->copySrcAddr = 0; + cpuXC->copySrcPhysAddr = 0; + } + return fault; +} + +Fault +CheckerCPU::copy(Addr dest) +{ + static bool no_warn = true; + int blk_size = 64; + // Only support block sizes of 64 atm. + assert(blk_size == 64); + uint8_t data[blk_size]; + //assert(cpuXC->copySrcAddr); + int offset = dest & (blk_size - 1); + + // Make sure block doesn't span page + if (no_warn && + (dest & PageMask) != ((dest + blk_size) & PageMask) && + (dest >> 40) != 0xfffffc) { + no_warn = false; + warn("Copied block destination spans pages %x. ", dest); + } + + memReq->reset(dest & ~(blk_size -1), blk_size); + // translate to physical address + Fault fault = cpuXC->translateDataWriteReq(memReq); + + if (fault == NoFault) { + Addr dest_addr = memReq->paddr + offset; + // Need to read straight from memory since we have more than 8 bytes. 
+ memReq->paddr = cpuXC->copySrcPhysAddr; + cpuXC->mem->read(memReq, data); + memReq->paddr = dest_addr; + cpuXC->mem->write(memReq, data); + memReq->cmd = Copy; + memReq->completionEvent = NULL; + memReq->paddr = cpuXC->copySrcPhysAddr; + memReq->dest = dest_addr; + memReq->size = 64; + memReq->time = curTick; + memReq->flags &= ~INST_READ; + } + else + assert(!fault->isAlignmentFault()); + + return fault; +} + +// precise architected memory state accessor macros +template +Fault +CheckerCPU::read(Addr addr, T &data, unsigned flags) +{ + memReq->reset(addr, sizeof(T), flags); + + // translate to physical address + // Should I probe the DTB? Or should I just take the physical address + // and assume correct translation? + translateDataReadReq(memReq); + + // if we have a cache, do cache access too + memReq->cmd = Read; + memReq->completionEvent = NULL; + memReq->time = curTick; + memReq->flags &= ~INST_READ; + + if (!(memReq->flags & UNCACHEABLE)) { + cpuXC->read(memReq, data); + } else { + // Assume the data is correct if it's an uncached access + memcpy(&data, &unverifiedResult.integer, sizeof(T)); + } + + return NoFault; +} + +#ifndef DOXYGEN_SHOULD_SKIP_THIS + +template +Fault +CheckerCPU::read(Addr addr, uint64_t &data, unsigned flags); + +template +Fault +CheckerCPU::read(Addr addr, uint32_t &data, unsigned flags); + +template +Fault +CheckerCPU::read(Addr addr, uint16_t &data, unsigned flags); + +template +Fault +CheckerCPU::read(Addr addr, uint8_t &data, unsigned flags); + +#endif //DOXYGEN_SHOULD_SKIP_THIS + +template<> +Fault +CheckerCPU::read(Addr addr, double &data, unsigned flags) +{ + return read(addr, *(uint64_t*)&data, flags); +} + +template<> +Fault +CheckerCPU::read(Addr addr, float &data, unsigned flags) +{ + return read(addr, *(uint32_t*)&data, flags); +} + +template<> +Fault +CheckerCPU::read(Addr addr, int32_t &data, unsigned flags) +{ + return read(addr, (uint32_t&)data, flags); +} + +template +Fault +CheckerCPU::write(T data, Addr addr, 
unsigned flags, uint64_t *res) +{ + memReq->reset(addr, sizeof(T), flags); + + // translate to physical address + cpuXC->translateDataWriteReq(memReq); + + if ((!(unverifiedReq->flags & LOCKED) || + ((unverifiedReq->flags & LOCKED) && + unverifiedReq->result == 1)) && + !(unverifiedReq->flags & UNCACHEABLE)) { + // do functional access +// cpuXC->read(memReq, data); + + memReq->cmd = Write; +// memcpy(memReq->data,(uint8_t *)&data,memReq->size); + T inst_data; + memcpy(&inst_data, unverifiedReq->data, sizeof(T)); + memReq->completionEvent = NULL; + memReq->time = curTick; + memReq->flags &= ~INST_READ; + + // Hard to verify this as the data writes back after the + // instruction commits. May only be able to check that the + // value produced from execute() matches the value produced + // from the instruction's first execution. + if (data != inst_data) { + warn("Store value does not match value in memory! " + "Instruction: %#x, memory: %#x", + inst_data, data); + handleError(); + } + } + + // Assume the result was the same as the one passed in. This checker + // doesn't check if the SC should succeed or fail, it just checks the + // value. 
+ if (res) + *res = unverifiedReq->result; + + return NoFault; +} + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS +template +Fault +CheckerCPU::write(uint64_t data, Addr addr, unsigned flags, uint64_t *res); + +template +Fault +CheckerCPU::write(uint32_t data, Addr addr, unsigned flags, uint64_t *res); + +template +Fault +CheckerCPU::write(uint16_t data, Addr addr, unsigned flags, uint64_t *res); + +template +Fault +CheckerCPU::write(uint8_t data, Addr addr, unsigned flags, uint64_t *res); + +#endif //DOXYGEN_SHOULD_SKIP_THIS + +template<> +Fault +CheckerCPU::write(double data, Addr addr, unsigned flags, uint64_t *res) +{ + return write(*(uint64_t*)&data, addr, flags, res); +} + +template<> +Fault +CheckerCPU::write(float data, Addr addr, unsigned flags, uint64_t *res) +{ + return write(*(uint32_t*)&data, addr, flags, res); +} + +template<> +Fault +CheckerCPU::write(int32_t data, Addr addr, unsigned flags, uint64_t *res) +{ + return write((uint32_t)data, addr, flags, res); +} + + +#if FULL_SYSTEM +Addr +CheckerCPU::dbg_vtophys(Addr addr) +{ + return vtophys(xcProxy, addr); +} +#endif // FULL_SYSTEM + +#if FULL_SYSTEM +void +CheckerCPU::post_interrupt(int int_num, int index) +{ + BaseCPU::post_interrupt(int_num, index); + + if (cpuXC->status() == ExecContext::Suspended) { + DPRINTF(IPI,"Suspended Processor awoke\n"); + cpuXC->activate(); + } +} +#endif // FULL_SYSTEM + +bool +CheckerCPU::translateInstReq(MemReqPtr &req) +{ +#if FULL_SYSTEM + return (cpuXC->translateInstReq(req) == NoFault); +#else + cpuXC->translateInstReq(req); + return true; +#endif +} + +void +CheckerCPU::translateDataReadReq(MemReqPtr &req) +{ + cpuXC->translateDataReadReq(req); + + if (req->vaddr != unverifiedReq->vaddr) { + warn("Request virtual addresses do not match! Inst: %#x, checker:" + " %#x", + unverifiedReq->vaddr, req->vaddr); + } + req->paddr = unverifiedReq->paddr; + + if (checkFlags(req)) { + warn("Request flags do not match! 
Inst: %#x, checker: %#x", + unverifiedReq->flags, req->flags); + handleError(); + } +} + +void +CheckerCPU::translateDataWriteReq(MemReqPtr &req) +{ + cpuXC->translateDataWriteReq(req); + + if (req->vaddr != unverifiedReq->vaddr) { + warn("Request virtual addresses do not match! Inst: %#x, checker:" + " %#x", + unverifiedReq->vaddr, req->vaddr); + } + req->paddr = unverifiedReq->paddr; + + if (checkFlags(req)) { + warn("Request flags do not match! Inst: %#x, checker: %#x", + unverifiedReq->flags, req->flags); + handleError(); + } +} + +bool +CheckerCPU::checkFlags(MemReqPtr &req) +{ + // Remove any dynamic flags that don't have to do with the request itself. + unsigned flags = unverifiedReq->flags; + unsigned mask = LOCKED | PHYSICAL | VPTE | ALTMODE | UNCACHEABLE | NO_FAULT; + flags = flags & (mask); + if (flags == req->flags) { + return false; + } else { + return true; + } +} + +/* start simulation, program loaded, processor precise state initialized */ +template +void +Checker::tick(DynInstPtr &completed_inst) +{ + DynInstPtr inst; + + if (!instList.empty()) { + if (youngestSN < completed_inst->seqNum) { + DPRINTF(Checker, "Adding instruction [sn:%lli] PC:%#x to list.\n", + completed_inst->seqNum, completed_inst->readPC()); + instList.push_back(completed_inst); + youngestSN = completed_inst->seqNum; + } + + if (!instList.front()->isCompleted()) { + return; + } else { + inst = instList.front(); + instList.pop_front(); + } + } else { + if (!completed_inst->isCompleted()) { + if (youngestSN < completed_inst->seqNum) { + DPRINTF(Checker, "Adding instruction [sn:%lli] PC:%#x to list.\n", + completed_inst->seqNum, completed_inst->readPC()); + instList.push_back(completed_inst); + youngestSN = completed_inst->seqNum; + } + return; + } else { + if (youngestSN < completed_inst->seqNum) { + inst = completed_inst; + youngestSN = completed_inst->seqNum; + } else { +// panic("SN already seen yet the list is empty!"); + return; + } + } + } + + while (1) { + DPRINTF(Checker, 
"Processing instruction [sn:%lli] PC:%#x.\n", + inst->seqNum, inst->readPC()); +// verifyInst = completed_inst; + unverifiedResult.integer = inst->readIntResult(); + unverifiedReq = inst->req; + numCycles++; + + Fault fault = NoFault; + + // maintain $r0 semantics + cpuXC->setIntReg(ZeroReg, 0); +#ifdef TARGET_ALPHA + cpuXC->setFloatRegDouble(ZeroReg, 0.0); +#endif // TARGET_ALPHA + + // Try to fetch an instruction + + // set up memory request for instruction fetch +#if FULL_SYSTEM +#define IFETCH_FLAGS(pc) ((pc) & 1) ? PHYSICAL : 0 +#else +#define IFETCH_FLAGS(pc) 0 +#endif + + if (changedPC) { + DPRINTF(Checker, "Changed PC recently to %#x\n", + cpuXC->readPC()); + if (willChangePC) { + if (newPC == cpuXC->readPC()) { + DPRINTF(Checker, "Changed PC matches expected PC\n"); + } else { + warn("Changed PC does not match expected PC, changed: %#x, " + "expected: %#x", + cpuXC->readPC(), newPC); + handleError(); + } + willChangePC = false; + } + changedPC = false; + } + if (changedNextPC) { + DPRINTF(Checker, "Changed NextPC recently to %#x\n", + cpuXC->readNextPC()); + changedNextPC = false; + } + + memReq->cmd = Read; + memReq->reset(cpuXC->readPC() & ~3, sizeof(uint32_t), + IFETCH_FLAGS(cpuXC->readPC())); + + bool succeeded = translateInstReq(memReq); + + if (!succeeded) { + warn("Instruction PC %#x was not found in the ITB!", + cpuXC->readPC()); + handleError(); + + // go to the next instruction + cpuXC->setPC(cpuXC->readNextPC()); + cpuXC->setNextPC(cpuXC->readNextPC() + sizeof(MachInst)); + + return; + } + +// if (fault == NoFault) +// fault = cpuXC->mem->read(memReq, machInst); + cpuXC->mem->read(memReq, machInst); + + // If we've got a valid instruction (i.e., no fault on instruction + // fetch), then execute it. + + // keep an instruction count + numInst++; +// numInsts++; + + // decode the instruction + machInst = gtoh(machInst); + // Checks that the instruction matches what we expected it to be. + // Checks both the machine instruction and the PC. 
+ validateInst(inst); + + curStaticInst = StaticInst::decode(makeExtMI(machInst, cpuXC->readPC())); + +#if FULL_SYSTEM + cpuXC->setInst(machInst); +#endif // FULL_SYSTEM + + fault = inst->getFault(); + + // Either the instruction was a fault and we should process the fault, + // or we should just go ahead execute the instruction. This assumes + // that the instruction is properly marked as a fault. + if (fault == NoFault) { + + cpuXC->func_exe_inst++; + + fault = curStaticInst->execute(this, NULL); + + // Checks to make sure instrution results are correct. + validateExecution(inst); + +// if (curStaticInst->isMemRef()) { +// numMemRefs++; +// } + + if (curStaticInst->isLoad()) { + ++numLoad; + } + } + + if (fault != NoFault) { +#if FULL_SYSTEM + fault->invoke(xcProxy); + willChangePC = true; + newPC = cpuXC->readPC(); + DPRINTF(Checker, "Fault, PC is now %#x\n", newPC); +#else // !FULL_SYSTEM + fatal("fault (%d) detected @ PC 0x%08p", fault, cpuXC->readPC()); +#endif // FULL_SYSTEM + } else { +#if THE_ISA != MIPS_ISA + // go to the next instruction + cpuXC->setPC(cpuXC->readNextPC()); + cpuXC->setNextPC(cpuXC->readNextPC() + sizeof(MachInst)); +#else + // go to the next instruction + cpuXC->setPC(cpuXC->readNextPC()); + cpuXC->setNextPC(cpuXC->readNextNPC()); + cpuXC->setNextNPC(cpuXC->readNextNPC() + sizeof(MachInst)); +#endif + + } + +#if FULL_SYSTEM + Addr oldpc; + int count = 0; + do { + oldpc = cpuXC->readPC(); + system->pcEventQueue.service(xcProxy); + count++; + } while (oldpc != cpuXC->readPC()); + if (count > 1) { + willChangePC = true; + newPC = cpuXC->readPC(); + DPRINTF(Checker, "PC Event, PC is now %#x\n", newPC); + } +#endif + + // Checks PC, next PC. Optionally can check all registers. (Or just those + // that have been modified). 
+ validateState(); + + if (instList.empty()) { + break; + } else if (instList.front()->isCompleted()) { + inst = instList.front(); + instList.pop_front(); + } else { + break; + } + } +} + +template +void +Checker::switchOut(Sampler *s) +{ + sampler = s; + instList.clear(); +} + +template +void +Checker::takeOverFrom(BaseCPU *oldCPU) +{ +// BaseCPU::takeOverFrom(oldCPU); + + // if any of this CPU's ExecContexts are active, mark the CPU as + // running and schedule its tick event. +/* + for (int i = 0; i < execContexts.size(); ++i) { + ExecContext *xc = execContexts[i]; + } +*/ +} + +template +void +Checker::validateInst(DynInstPtr &inst) +{ + if (inst->readPC() != cpuXC->readPC()) { + warn("PCs do not match! Inst: %#x, checker: %#x", + inst->readPC(), cpuXC->readPC()); + if (changedPC) { + warn("Changed PCs recently, may not be an error"); + } else { + handleError(); + } + } + + if (static_cast(inst->staticInst->machInst) != + machInst) { + warn("Binary instructions do not match! Inst: %#x, checker: %#x", + static_cast(inst->staticInst->machInst), + machInst); + handleError(); + } +} + +template +void +Checker::validateExecution(DynInstPtr &inst) +{ + if (inst->numDestRegs()) { + if (inst->isUnverifiable()) { + // @todo: Support more destination registers. + // Grab the result from the instruction and write it to the + // register. + RegIndex idx = inst->destRegIdx(0); + if (idx < TheISA::FP_Base_DepTag) { + cpuXC->setIntReg(idx, inst->readIntResult()); + } else if (idx < TheISA::Fpcr_DepTag) { + cpuXC->setFloatRegInt(idx, inst->readIntResult()); + } else { + cpuXC->setMiscReg(idx, inst->readIntResult()); + } + } else if (result.integer != inst->readIntResult()) { + warn("Instruction results do not match! (May not be integer results) " + "Inst: %#x, checker: %#x", + inst->readIntResult(), result.integer); + handleError(); + } + } + + if (inst->readNextPC() != cpuXC->readNextPC()) { + warn("Instruction next PCs do not match! 
Inst: %#x, checker: %#x", + inst->readNextPC(), cpuXC->readNextPC()); + handleError(); + } + + // Checking side effect registers can be difficult if they are not + // checked simultaneously with the execution of the instruction. + // This is because other valid instructions may have modified + // these registers in the meantime, and their values are not + // stored within the DynInst. + while (!miscRegIdxs.empty()) { + int misc_reg_idx = miscRegIdxs.front(); + miscRegIdxs.pop(); + + if (inst->xcBase()->readMiscReg(misc_reg_idx) != + cpuXC->readMiscReg(misc_reg_idx)) { + warn("Misc reg idx %i (side effect) does not match! Inst: %#x, " + "checker: %#x", + misc_reg_idx, inst->xcBase()->readMiscReg(misc_reg_idx), + cpuXC->readMiscReg(misc_reg_idx)); + handleError(); + } + } +} + +template +void +Checker::validateState() +{ +} + +template +void +Checker::dumpInsts() +{ + int num = 0; + + InstListIt inst_list_it = --(instList.end()); + + cprintf("Inst list size: %i\n", instList.size()); + + while (inst_list_it != instList.end()) + { + cprintf("Instruction:%i\n", + num); + + cprintf("PC:%#x\n[sn:%lli]\n[tid:%i]\n" + "Completed:%i\n", + (*inst_list_it)->readPC(), + (*inst_list_it)->seqNum, + (*inst_list_it)->threadNumber, + (*inst_list_it)->isCompleted()); + + cprintf("\n"); + + inst_list_it--; + ++num; + } + +} + +template +class Checker > >; + +template +class Checker > >; diff --git a/cpu/checker/cpu.hh b/cpu/checker/cpu.hh new file mode 100644 index 000000000..678e888df --- /dev/null +++ b/cpu/checker/cpu.hh @@ -0,0 +1,336 @@ +/* + * Copyright (c) 2002-2005 The Regents of The University of Michigan + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __CPU_CHECKER_CPU_HH__ +#define __CPU_CHECKER_CPU_HH__ + +#include +#include +#include + +#include "base/statistics.hh" +#include "config/full_system.hh" +#include "cpu/base.hh" +#include "cpu/base_dyn_inst.hh" +#include "cpu/cpu_exec_context.hh" +#include "cpu/pc_event.hh" +#include "cpu/sampler/sampler.hh" +#include "cpu/static_inst.hh" +#include "sim/eventq.hh" + +// forward declarations +#if FULL_SYSTEM +class Processor; +class AlphaITB; +class AlphaDTB; +class PhysicalMemory; + +class RemoteGDB; +class GDBListener; + +#else + +class Process; + +#endif // FULL_SYSTEM +template +class BaseDynInst; +class ExecContext; +class MemInterface; +class Checkpoint; + +class CheckerCPU : public BaseCPU +{ + protected: + typedef TheISA::MachInst MachInst; + typedef TheISA::MiscReg MiscReg; + public: + // main simulation loop (one cycle) + virtual void init(); + + struct Params : public BaseCPU::Params + { +#if FULL_SYSTEM + AlphaITB *itb; + AlphaDTB *dtb; + FunctionalMemory *mem; +#else + Process *process; +#endif + bool exitOnError; + }; + + public: + void post_interrupt(int int_num, int index); + + CheckerCPU(Params *p); + virtual ~CheckerCPU(); + + void setMemory(FunctionalMemory *mem); + + FunctionalMemory *memPtr; + +#if FULL_SYSTEM + void setSystem(System *system); + + System *systemPtr; +#endif + public: + // execution context + CPUExecContext *cpuXC; + + ExecContext *xcProxy; + + AlphaITB *itb; + AlphaDTB *dtb; + +#if FULL_SYSTEM + Addr dbg_vtophys(Addr addr); + + bool interval_stats; +#endif + + union Result { + uint64_t integer; + float fp; + double dbl; + }; + + Result result; + + // current instruction + MachInst machInst; + + // Refcounted pointer to the one memory request. + MemReqPtr memReq; + + // Pointer to the sampler that is telling us to switchover. 
+ // Used to signal the completion of the pipe drain and schedule + // the next switchover + Sampler *sampler; + + StaticInstPtr curStaticInst; + + // number of simulated instructions + Counter numInst; + Counter startNumInst; + + std::queue miscRegIdxs; + + virtual Counter totalInstructions() const + { + return numInst - startNumInst; + } + + // number of simulated loads + Counter numLoad; + Counter startNumLoad; + + virtual void serialize(std::ostream &os); + virtual void unserialize(Checkpoint *cp, const std::string §ion); + + template + Fault read(Addr addr, T &data, unsigned flags); + + template + Fault write(T data, Addr addr, unsigned flags, uint64_t *res); + + // These functions are only used in CPU models that split + // effective address computation from the actual memory access. + void setEA(Addr EA) { panic("SimpleCPU::setEA() not implemented\n"); } + Addr getEA() { panic("SimpleCPU::getEA() not implemented\n"); } + + void prefetch(Addr addr, unsigned flags) + { + // need to do this... + } + + void writeHint(Addr addr, int size, unsigned flags) + { + // need to do this... + } + + Fault copySrcTranslate(Addr src); + + Fault copy(Addr dest); + + // The register accessor methods provide the index of the + // instruction's operand (e.g., 0 or 1), not the architectural + // register index, to simplify the implementation of register + // renaming. We find the architectural register index by indexing + // into the instruction's own operand index table. Note that a + // raw pointer to the StaticInst is provided instead of a + // ref-counted StaticInstPtr to redice overhead. This is fine as + // long as these methods don't copy the pointer into any long-term + // storage (which is pretty hard to imagine they would have reason + // to do). 
+ + uint64_t readIntReg(const StaticInst *si, int idx) + { + return cpuXC->readIntReg(si->srcRegIdx(idx)); + } + + float readFloatRegSingle(const StaticInst *si, int idx) + { + int reg_idx = si->srcRegIdx(idx) - TheISA::FP_Base_DepTag; + return cpuXC->readFloatRegSingle(reg_idx); + } + + double readFloatRegDouble(const StaticInst *si, int idx) + { + int reg_idx = si->srcRegIdx(idx) - TheISA::FP_Base_DepTag; + return cpuXC->readFloatRegDouble(reg_idx); + } + + uint64_t readFloatRegInt(const StaticInst *si, int idx) + { + int reg_idx = si->srcRegIdx(idx) - TheISA::FP_Base_DepTag; + return cpuXC->readFloatRegInt(reg_idx); + } + + void setIntReg(const StaticInst *si, int idx, uint64_t val) + { + cpuXC->setIntReg(si->destRegIdx(idx), val); + result.integer = val; + } + + void setFloatRegSingle(const StaticInst *si, int idx, float val) + { + int reg_idx = si->destRegIdx(idx) - TheISA::FP_Base_DepTag; + cpuXC->setFloatRegSingle(reg_idx, val); + result.fp = val; + } + + void setFloatRegDouble(const StaticInst *si, int idx, double val) + { + int reg_idx = si->destRegIdx(idx) - TheISA::FP_Base_DepTag; + cpuXC->setFloatRegDouble(reg_idx, val); + result.dbl = val; + } + + void setFloatRegInt(const StaticInst *si, int idx, uint64_t val) + { + int reg_idx = si->destRegIdx(idx) - TheISA::FP_Base_DepTag; + cpuXC->setFloatRegInt(reg_idx, val); + result.integer = val; + } + + uint64_t readPC() { return cpuXC->readPC(); } + void setNextPC(uint64_t val) { + cpuXC->setNextPC(val); + } + + MiscReg readMiscReg(int misc_reg) + { + return cpuXC->readMiscReg(misc_reg); + } + + MiscReg readMiscRegWithEffect(int misc_reg, Fault &fault) + { + return cpuXC->readMiscRegWithEffect(misc_reg, fault); + } + + Fault setMiscReg(int misc_reg, const MiscReg &val) + { + result.integer = val; + miscRegIdxs.push(misc_reg); + return cpuXC->setMiscReg(misc_reg, val); + } + + Fault setMiscRegWithEffect(int misc_reg, const MiscReg &val) + { + miscRegIdxs.push(misc_reg); + return 
cpuXC->setMiscRegWithEffect(misc_reg, val); + } + + void recordPCChange(uint64_t val) { changedPC = true; } + void recordNextPCChange(uint64_t val) { changedNextPC = true; } + + bool translateInstReq(MemReqPtr &req); + void translateDataWriteReq(MemReqPtr &req); + void translateDataReadReq(MemReqPtr &req); + +#if FULL_SYSTEM + Fault hwrei() { return cpuXC->hwrei(); } + int readIntrFlag() { return cpuXC->readIntrFlag(); } + void setIntrFlag(int val) { cpuXC->setIntrFlag(val); } + bool inPalMode() { return cpuXC->inPalMode(); } + void ev5_trap(Fault fault) { fault->invoke(xcProxy); } + bool simPalCheck(int palFunc) { return cpuXC->simPalCheck(palFunc); } +#else + // Assume that the normal CPU's call to syscall was successful. + void syscall() { } +#endif + + void handleError() + { + if (exitOnError) + panic("Checker found error!"); + } + bool checkFlags(MemReqPtr &req); + + ExecContext *xcBase() { return xcProxy; } + CPUExecContext *cpuXCBase() { return cpuXC; } + + Result unverifiedResult; + MemReqPtr unverifiedReq; + + bool changedPC; + bool willChangePC; + uint64_t newPC; + bool changedNextPC; + bool exitOnError; + + InstSeqNum youngestSN; +// std::map storeBuff; +// typedef std::map::iterator map_it; +}; + +template +class Checker : public CheckerCPU +{ + public: + Checker(Params *p) + : CheckerCPU(p) + { } + + void switchOut(Sampler *s); + void takeOverFrom(BaseCPU *oldCPU); + + void tick(DynInstPtr &inst); + + void validateInst(DynInstPtr &inst); + void validateExecution(DynInstPtr &inst); + void validateState(); + + std::list instList; + typedef typename std::list::iterator InstListIt; + void dumpInsts(); +}; + +#endif // __CPU_CHECKER_CPU_HH__ diff --git a/cpu/checker/cpu_builder.cc b/cpu/checker/cpu_builder.cc new file mode 100644 index 000000000..397ccab14 --- /dev/null +++ b/cpu/checker/cpu_builder.cc @@ -0,0 +1,126 @@ + +#include + +#include "cpu/checker/cpu.hh" +#include "cpu/inst_seq.hh" +#include "cpu/ozone/dyn_inst.hh" +#include 
"cpu/ozone/ozone_impl.hh" +#include "mem/base_mem.hh" +#include "sim/builder.hh" +#include "sim/process.hh" +#include "sim/sim_object.hh" + +class OzoneChecker : public Checker > > +{ + public: + OzoneChecker(Params *p) + : Checker > >(p) + { } +}; + +//////////////////////////////////////////////////////////////////////// +// +// CheckerCPU Simulation Object +// +BEGIN_DECLARE_SIM_OBJECT_PARAMS(OzoneChecker) + + Param max_insts_any_thread; + Param max_insts_all_threads; + Param max_loads_any_thread; + Param max_loads_all_threads; + +#if FULL_SYSTEM + SimObjectParam itb; + SimObjectParam dtb; + SimObjectParam mem; + SimObjectParam system; + Param cpu_id; + Param profile; +#else + SimObjectParam workload; +#endif // FULL_SYSTEM + Param clock; + SimObjectParam icache; + SimObjectParam dcache; + + Param defer_registration; + Param exitOnError; + Param function_trace; + Param function_trace_start; + +END_DECLARE_SIM_OBJECT_PARAMS(OzoneChecker) + +BEGIN_INIT_SIM_OBJECT_PARAMS(OzoneChecker) + + INIT_PARAM(max_insts_any_thread, + "terminate when any thread reaches this inst count"), + INIT_PARAM(max_insts_all_threads, + "terminate when all threads have reached this inst count"), + INIT_PARAM(max_loads_any_thread, + "terminate when any thread reaches this load count"), + INIT_PARAM(max_loads_all_threads, + "terminate when all threads have reached this load count"), + +#if FULL_SYSTEM + INIT_PARAM(itb, "Instruction TLB"), + INIT_PARAM(dtb, "Data TLB"), + INIT_PARAM(mem, "memory"), + INIT_PARAM(system, "system object"), + INIT_PARAM(cpu_id, "processor ID"), + INIT_PARAM(profile, ""), +#else + INIT_PARAM(workload, "processes to run"), +#endif // FULL_SYSTEM + + INIT_PARAM(clock, "clock speed"), + INIT_PARAM(icache, "L1 instruction cache object"), + INIT_PARAM(dcache, "L1 data cache object"), + + INIT_PARAM(defer_registration, "defer system registration (for sampling)"), + INIT_PARAM(exitOnError, "exit on error"), + INIT_PARAM(function_trace, "Enable function trace"), + 
INIT_PARAM(function_trace_start, "Cycle to start function trace") + +END_INIT_SIM_OBJECT_PARAMS(OzoneChecker) + + +CREATE_SIM_OBJECT(OzoneChecker) +{ + OzoneChecker::Params *params = new OzoneChecker::Params(); + params->name = getInstanceName(); + params->numberOfThreads = 1; + params->max_insts_any_thread = 0; + params->max_insts_all_threads = 0; + params->max_loads_any_thread = 0; + params->max_loads_all_threads = 0; + params->exitOnError = exitOnError; + params->deferRegistration = defer_registration; + params->functionTrace = function_trace; + params->functionTraceStart = function_trace_start; + params->clock = clock; + // Hack to touch all parameters. Consider not deriving Checker + // from BaseCPU..it's not really a CPU in the end. + Counter temp; + temp = max_insts_any_thread; + temp = max_insts_all_threads; + temp = max_loads_any_thread; + temp = max_loads_all_threads; + BaseMem *cache = icache; + cache = dcache; + +#if FULL_SYSTEM + params->itb = itb; + params->dtb = dtb; + params->mem = mem; + params->system = system; + params->cpu_id = cpu_id; + params->profile = profile; +#else + params->process = workload; +#endif + + OzoneChecker *cpu = new OzoneChecker(params); + return cpu; +} + +REGISTER_SIM_OBJECT("OzoneChecker", OzoneChecker) diff --git a/cpu/checker/exec_context.hh b/cpu/checker/exec_context.hh new file mode 100644 index 000000000..4843d1cf0 --- /dev/null +++ b/cpu/checker/exec_context.hh @@ -0,0 +1,225 @@ +#ifndef __CPU_CHECKER_EXEC_CONTEXT_HH__ +#define __CPU_CHECKER_EXEC_CONTEXT_HH__ + +#include "cpu/checker/cpu.hh" +#include "cpu/cpu_exec_context.hh" +#include "cpu/exec_context.hh" + +class EndQuiesceEvent; + +template +class CheckerExecContext : public ExecContext +{ + public: + CheckerExecContext(XC *actual_xc, + CheckerCPU *checker_cpu) + : actualXC(actual_xc), checkerXC(checker_cpu->cpuXC), checkerCPU(checker_cpu) + { } + + private: + XC *actualXC; + CPUExecContext *checkerXC; + CheckerCPU *checkerCPU; + + public: + + BaseCPU 
*getCpuPtr() { return actualXC->getCpuPtr(); } + + void setCpuId(int id) + { + actualXC->setCpuId(id); + checkerXC->setCpuId(id); + } + + int readCpuId() { return actualXC->readCpuId(); } + + FunctionalMemory *getMemPtr() { return actualXC->getMemPtr(); } + +#if FULL_SYSTEM + System *getSystemPtr() { return actualXC->getSystemPtr(); } + + PhysicalMemory *getPhysMemPtr() { return actualXC->getPhysMemPtr(); } + + AlphaITB *getITBPtr() { return actualXC->getITBPtr(); } + + AlphaDTB *getDTBPtr() { return actualXC->getDTBPtr(); } +#else + Process *getProcessPtr() { return actualXC->getProcessPtr(); } +#endif + + Status status() const { return actualXC->status(); } + + void setStatus(Status new_status) + { actualXC->setStatus(new_status); + checkerXC->setStatus(new_status); } + + /// Set the status to Active. Optional delay indicates number of + /// cycles to wait before beginning execution. + void activate(int delay = 1) { actualXC->activate(delay); } + + /// Set the status to Suspended. + void suspend() { actualXC->suspend(); } + + /// Set the status to Unallocated. + void deallocate() { actualXC->deallocate(); } + + /// Set the status to Halted. 
+ void halt() { actualXC->halt(); } + +#if FULL_SYSTEM + void dumpFuncProfile() { actualXC->dumpFuncProfile(); } +#endif + + void takeOverFrom(ExecContext *oldContext) + { + actualXC->takeOverFrom(oldContext); + checkerXC->takeOverFrom(oldContext); + } + + void regStats(const std::string &name) { actualXC->regStats(name); } + + void serialize(std::ostream &os) { actualXC->serialize(os); } + void unserialize(Checkpoint *cp, const std::string §ion) + { actualXC->unserialize(cp, section); } + +#if FULL_SYSTEM + EndQuiesceEvent *getQuiesceEvent() { return actualXC->getQuiesceEvent(); } + + Tick readLastActivate() { return actualXC->readLastActivate(); } + Tick readLastSuspend() { return actualXC->readLastSuspend(); } + + void profileClear() { return actualXC->profileClear(); } + void profileSample() { return actualXC->profileSample(); } +#endif + + int getThreadNum() { return actualXC->getThreadNum(); } + + // @todo: Do I need this? + MachInst getInst() { return actualXC->getInst(); } + + // @todo: Do I need this? + void copyArchRegs(ExecContext *xc) + { + actualXC->copyArchRegs(xc); + checkerXC->copyArchRegs(xc); + } + + void clearArchRegs() + { + actualXC->clearArchRegs(); + checkerXC->clearArchRegs(); + } + + // + // New accessors for new decoder. 
+ // + uint64_t readIntReg(int reg_idx) + { return actualXC->readIntReg(reg_idx); } + + float readFloatRegSingle(int reg_idx) + { return actualXC->readFloatRegSingle(reg_idx); } + + double readFloatRegDouble(int reg_idx) + { return actualXC->readFloatRegDouble(reg_idx); } + + uint64_t readFloatRegInt(int reg_idx) + { return actualXC->readFloatRegInt(reg_idx); } + + void setIntReg(int reg_idx, uint64_t val) + { + actualXC->setIntReg(reg_idx, val); + checkerXC->setIntReg(reg_idx, val); + } + + void setFloatRegSingle(int reg_idx, float val) + { + actualXC->setFloatRegSingle(reg_idx, val); + checkerXC->setFloatRegSingle(reg_idx, val); + } + + void setFloatRegDouble(int reg_idx, double val) + { + actualXC->setFloatRegDouble(reg_idx, val); + checkerXC->setFloatRegSingle(reg_idx, val); + } + + void setFloatRegInt(int reg_idx, uint64_t val) + { + actualXC->setFloatRegInt(reg_idx, val); + checkerXC->setFloatRegInt(reg_idx, val); + } + + uint64_t readPC() { return actualXC->readPC(); } + + void setPC(uint64_t val) + { + actualXC->setPC(val); + checkerXC->setPC(val); + checkerCPU->recordPCChange(val); + } + + uint64_t readNextPC() { return actualXC->readNextPC(); } + + void setNextPC(uint64_t val) + { + actualXC->setNextPC(val); + checkerXC->setNextPC(val); + checkerCPU->recordNextPCChange(val); + } + + MiscReg readMiscReg(int misc_reg) + { return actualXC->readMiscReg(misc_reg); } + + MiscReg readMiscRegWithEffect(int misc_reg, Fault &fault) + { return actualXC->readMiscRegWithEffect(misc_reg, fault); } + + Fault setMiscReg(int misc_reg, const MiscReg &val) + { + checkerXC->setMiscReg(misc_reg, val); + return actualXC->setMiscReg(misc_reg, val); + } + + Fault setMiscRegWithEffect(int misc_reg, const MiscReg &val) + { + checkerXC->setMiscRegWithEffect(misc_reg, val); + return actualXC->setMiscRegWithEffect(misc_reg, val); + } + + unsigned readStCondFailures() + { return actualXC->readStCondFailures(); } + + void setStCondFailures(unsigned sc_failures) + { + 
checkerXC->setStCondFailures(sc_failures); + actualXC->setStCondFailures(sc_failures); + } +#if FULL_SYSTEM + bool inPalMode() { return actualXC->inPalMode(); } +#endif + + // @todo: Fix this! + bool misspeculating() { return actualXC->misspeculating(); } + +#if !FULL_SYSTEM + IntReg getSyscallArg(int i) { return actualXC->getSyscallArg(i); } + + // used to shift args for indirect syscall + void setSyscallArg(int i, IntReg val) + { + checkerXC->setSyscallArg(i, val); + actualXC->setSyscallArg(i, val); + } + + void setSyscallReturn(SyscallReturn return_value) + { + checkerXC->setSyscallReturn(return_value); + actualXC->setSyscallReturn(return_value); + } + +// void syscall() { actualXC->syscall(); } + + Counter readFuncExeInst() { return actualXC->readFuncExeInst(); } +#endif +}; + +#endif // __CPU_CHECKER_EXEC_CONTEXT_HH__ diff --git a/cpu/checker/o3_cpu_builder.cc b/cpu/checker/o3_cpu_builder.cc new file mode 100644 index 000000000..125bfa398 --- /dev/null +++ b/cpu/checker/o3_cpu_builder.cc @@ -0,0 +1,126 @@ + +#include + +#include "cpu/checker/cpu.hh" +#include "cpu/inst_seq.hh" +#include "cpu/o3/alpha_dyn_inst.hh" +#include "cpu/o3/alpha_impl.hh" +#include "mem/base_mem.hh" +#include "sim/builder.hh" +#include "sim/process.hh" +#include "sim/sim_object.hh" + +class O3Checker : public Checker > > +{ + public: + O3Checker(Params *p) + : Checker > >(p) + { } +}; + +//////////////////////////////////////////////////////////////////////// +// +// CheckerCPU Simulation Object +// +BEGIN_DECLARE_SIM_OBJECT_PARAMS(O3Checker) + + Param max_insts_any_thread; + Param max_insts_all_threads; + Param max_loads_any_thread; + Param max_loads_all_threads; + +#if FULL_SYSTEM + SimObjectParam itb; + SimObjectParam dtb; + SimObjectParam mem; + SimObjectParam system; + Param cpu_id; + Param profile; +#else + SimObjectParam workload; +#endif // FULL_SYSTEM + Param clock; + SimObjectParam icache; + SimObjectParam dcache; + + Param defer_registration; + Param exitOnError; + Param 
function_trace; + Param function_trace_start; + +END_DECLARE_SIM_OBJECT_PARAMS(O3Checker) + +BEGIN_INIT_SIM_OBJECT_PARAMS(O3Checker) + + INIT_PARAM(max_insts_any_thread, + "terminate when any thread reaches this inst count"), + INIT_PARAM(max_insts_all_threads, + "terminate when all threads have reached this inst count"), + INIT_PARAM(max_loads_any_thread, + "terminate when any thread reaches this load count"), + INIT_PARAM(max_loads_all_threads, + "terminate when all threads have reached this load count"), + +#if FULL_SYSTEM + INIT_PARAM(itb, "Instruction TLB"), + INIT_PARAM(dtb, "Data TLB"), + INIT_PARAM(mem, "memory"), + INIT_PARAM(system, "system object"), + INIT_PARAM(cpu_id, "processor ID"), + INIT_PARAM(profile, ""), +#else + INIT_PARAM(workload, "processes to run"), +#endif // FULL_SYSTEM + + INIT_PARAM(clock, "clock speed"), + INIT_PARAM(icache, "L1 instruction cache object"), + INIT_PARAM(dcache, "L1 data cache object"), + + INIT_PARAM(defer_registration, "defer system registration (for sampling)"), + INIT_PARAM(exitOnError, "exit on error"), + INIT_PARAM(function_trace, "Enable function trace"), + INIT_PARAM(function_trace_start, "Cycle to start function trace") + +END_INIT_SIM_OBJECT_PARAMS(O3Checker) + + +CREATE_SIM_OBJECT(O3Checker) +{ + O3Checker::Params *params = new O3Checker::Params(); + params->name = getInstanceName(); + params->numberOfThreads = 1; + params->max_insts_any_thread = 0; + params->max_insts_all_threads = 0; + params->max_loads_any_thread = 0; + params->max_loads_all_threads = 0; + params->exitOnError = exitOnError; + params->deferRegistration = defer_registration; + params->functionTrace = function_trace; + params->functionTraceStart = function_trace_start; + params->clock = clock; + // Hack to touch all parameters. Consider not deriving Checker + // from BaseCPU..it's not really a CPU in the end. 
+ Counter temp; + temp = max_insts_any_thread; + temp = max_insts_all_threads; + temp = max_loads_any_thread; + temp = max_loads_all_threads; + BaseMem *cache = icache; + cache = dcache; + +#if FULL_SYSTEM + params->itb = itb; + params->dtb = dtb; + params->mem = mem; + params->system = system; + params->cpu_id = cpu_id; + params->profile = profile; +#else + params->process = workload; +#endif + + O3Checker *cpu = new O3Checker(params); + return cpu; +} + +REGISTER_SIM_OBJECT("O3Checker", O3Checker) diff --git a/cpu/cpu_models.py b/cpu/cpu_models.py index 8912673f7..2b1ae6277 100644 --- a/cpu/cpu_models.py +++ b/cpu/cpu_models.py @@ -74,4 +74,7 @@ CpuModel('OzoneSimpleCPU', 'ozone_simple_exec.cc', CpuModel('OzoneCPU', 'ozone_exec.cc', '#include "cpu/ozone/dyn_inst.hh"', { 'CPU_exec_context': 'OzoneDynInst' }) +CpuModel('CheckerCPU', 'checker_cpu_exec.cc', + '#include "cpu/checker/cpu.hh"', + { 'CPU_exec_context': 'CheckerCPU' }) From ef6e2eb3c4dbf337df7380ae93360c13140f11f6 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Tue, 16 May 2006 14:06:35 -0400 Subject: [PATCH 28/50] Updates for sampler, checker, and general correctness. cpu/o3/alpha_cpu.hh: Update for sampler to work properly. Also code cleanup. cpu/o3/alpha_cpu_builder.cc: cpu/o3/alpha_dyn_inst.hh: Updates to support the checker. cpu/o3/alpha_cpu_impl.hh: Updates to support the checker. Also general code cleanup. cpu/o3/alpha_dyn_inst_impl.hh: Code cleanup. cpu/o3/alpha_params.hh: Updates to support the checker. Also supports trap latencies set through the parameters. cpu/o3/commit.hh: Supports sampler, checker. Code cleanup. cpu/o3/commit_impl.hh: Updates to support the sampler and checker, as well as general code cleanup. cpu/o3/cpu.cc: cpu/o3/cpu.hh: Support sampler and checker. cpu/o3/decode_impl.hh: Supports sampler. cpu/o3/fetch.hh: Supports sampler. Also update to hold the youngest valid SN fetch has seen to ensure that the entire pipeline has been drained. cpu/o3/fetch_impl.hh: Sampler updates. 
Also be sure to not fetches to uncached space (bad path). cpu/o3/iew.hh: cpu/o3/iew_impl.hh: Sampler updates. cpu/o3/lsq_unit_impl.hh: Supports checker. cpu/o3/regfile.hh: No need for accessing xcProxies directly. cpu/o3/rename.hh: cpu/o3/rename_impl.hh: Sampler support. --HG-- extra : convert_revision : 03881885dd50ebbca13ef31f31492fd4ef59121c --- cpu/o3/alpha_cpu.hh | 79 ++--- cpu/o3/alpha_cpu_builder.cc | 16 +- cpu/o3/alpha_cpu_impl.hh | 189 +++++------- cpu/o3/alpha_dyn_inst.hh | 30 +- cpu/o3/alpha_dyn_inst_impl.hh | 8 +- cpu/o3/alpha_params.hh | 6 +- cpu/o3/commit.hh | 103 ++++--- cpu/o3/commit_impl.hh | 536 +++++++++++++++------------------- cpu/o3/cpu.cc | 86 +++--- cpu/o3/cpu.hh | 23 +- cpu/o3/decode_impl.hh | 1 + cpu/o3/fetch.hh | 7 + cpu/o3/fetch_impl.hh | 20 +- cpu/o3/iew.hh | 2 + cpu/o3/iew_impl.hh | 14 +- cpu/o3/lsq_unit_impl.hh | 9 + cpu/o3/regfile.hh | 4 +- cpu/o3/rename.hh | 2 + cpu/o3/rename_impl.hh | 7 + 19 files changed, 547 insertions(+), 595 deletions(-) diff --git a/cpu/o3/alpha_cpu.hh b/cpu/o3/alpha_cpu.hh index dfdf092ed..f70793aaa 100644 --- a/cpu/o3/alpha_cpu.hh +++ b/cpu/o3/alpha_cpu.hh @@ -34,6 +34,8 @@ #include "cpu/o3/cpu.hh" #include "sim/byteswap.hh" +class EndQuiesceEvent; + template class AlphaFullCPU : public FullO3CPU { @@ -61,7 +63,7 @@ class AlphaFullCPU : public FullO3CPU Tick lastActivate; Tick lastSuspend; - Event *quiesceEvent; + EndQuiesceEvent *quiesceEvent; virtual BaseCPU *getCpuPtr() { return cpu; } @@ -112,10 +114,8 @@ class AlphaFullCPU : public FullO3CPU virtual void unserialize(Checkpoint *cp, const std::string §ion); #if FULL_SYSTEM - virtual Event *getQuiesceEvent(); + virtual EndQuiesceEvent *getQuiesceEvent(); - // Not necessarily the best location for these... 
- // Having an extra function just to read these is obnoxious virtual Tick readLastActivate(); virtual Tick readLastSuspend(); @@ -125,17 +125,12 @@ class AlphaFullCPU : public FullO3CPU virtual int getThreadNum() { return thread->tid; } - // Also somewhat obnoxious. Really only used for the TLB fault. - // However, may be quite useful in SPARC. virtual TheISA::MachInst getInst(); virtual void copyArchRegs(ExecContext *xc); virtual void clearArchRegs(); - // - // New accessors for new decoder. - // virtual uint64_t readIntReg(int reg_idx); virtual float readFloatRegSingle(int reg_idx); @@ -172,9 +167,7 @@ class AlphaFullCPU : public FullO3CPU virtual Fault setMiscRegWithEffect(int misc_reg, const MiscReg &val); - // Also not necessarily the best location for these two. - // Hopefully will go away once we decide upon where st cond - // failures goes. + // @todo: Figure out where these store cond failures should go. virtual unsigned readStCondFailures() { return thread->storeCondFailures; } virtual void setStCondFailures(unsigned sc_failures) { thread->storeCondFailures = sc_failures; } @@ -183,27 +176,27 @@ class AlphaFullCPU : public FullO3CPU virtual bool inPalMode() { return TheISA::PcPAL(cpu->readPC(thread->tid)); } #endif - // Only really makes sense for old CPU model. Still could be useful though. + // Only really makes sense for old CPU model. Lots of code + // outside the CPU still checks this function, so it will + // always return false to keep everything working. virtual bool misspeculating() { return false; } #if !FULL_SYSTEM virtual IntReg getSyscallArg(int i); - // used to shift args for indirect syscall virtual void setSyscallArg(int i, IntReg val); virtual void setSyscallReturn(SyscallReturn return_value); virtual void syscall() { return cpu->syscall(thread->tid); } - // Same with st cond failures. 
virtual Counter readFuncExeInst() { return thread->funcExeInst; } #endif }; - friend class AlphaXC; +// friend class AlphaXC; - std::vector xcProxies; +// std::vector xcProxies; #if FULL_SYSTEM /** ITB pointer. */ @@ -216,13 +209,6 @@ class AlphaFullCPU : public FullO3CPU void regStats(); #if FULL_SYSTEM - //Note that the interrupt stuff from the base CPU might be somewhat - //ISA specific (ie NumInterruptLevels). These functions might not - //be needed in FullCPU though. -// void post_interrupt(int int_num, int index); -// void clear_interrupt(int int_num, int index); -// void clear_interrupts(); - /** Translates instruction requestion. */ Fault translateInstReq(MemReqPtr &req) { @@ -273,11 +259,6 @@ class AlphaFullCPU : public FullO3CPU } #endif - - // Later on may want to remove this misc stuff from the regfile and - // have it handled at this level. This would be similar to moving certain - // IPRs into the devices themselves. Might prove to be an issue when - // trying to rename source/destination registers... MiscReg readMiscReg(int misc_reg, unsigned tid); MiscReg readMiscRegWithEffect(int misc_reg, Fault &fault, unsigned tid); @@ -302,18 +283,21 @@ class AlphaFullCPU : public FullO3CPU /** Traps to handle given fault. */ void trap(Fault fault, unsigned tid); - bool simPalCheck(int palFunc); + bool simPalCheck(int palFunc, unsigned tid); /** Processes any interrupts. */ void processInterrupts(); + + /** Halts the CPU. */ + void halt() { panic("Halt not implemented!\n"); } #endif #if !FULL_SYSTEM - // Need to change these into regfile calls that directly set a certain - // register. Actually, these functions should handle most of this - // functionality by themselves; should look up the rename and then - // set the register. + /** Executes a syscall. + * @todo: Determine if this needs to be virtual. + */ + void syscall(int thread_num); /** Gets a syscall argument. 
*/ IntReg getSyscallArg(int i, int tid); @@ -322,25 +306,12 @@ class AlphaFullCPU : public FullO3CPU /** Sets the return value of a syscall. */ void setSyscallReturn(SyscallReturn return_value, int tid); - - /** Executes a syscall. - * @todo: Determine if this needs to be virtual. - */ - virtual void syscall(int thread_num); - #endif - public: -#if FULL_SYSTEM - /** Halts the CPU. */ - void halt() { panic("Halt not implemented!\n"); } -#endif - - /** Old CPU read from memory function. No longer used. */ + /** Read from memory function. */ template Fault read(MemReqPtr &req, T &data) { -// panic("CPU READ NOT IMPLEMENTED W/NEW MEMORY\n"); #if 0 #if FULL_SYSTEM && defined(TARGET_ALPHA) if (req->flags & LOCKED) { @@ -350,10 +321,14 @@ class AlphaFullCPU : public FullO3CPU #endif #endif Fault error; + +#if FULL_SYSTEM + // @todo: Fix this LL/SC hack. if (req->flags & LOCKED) { lockAddr = req->paddr; lockFlag = true; } +#endif error = this->mem->read(req, data); data = gtoh(data); @@ -367,7 +342,7 @@ class AlphaFullCPU : public FullO3CPU return this->iew.ldstQueue.read(req, data, load_idx); } - /** Old CPU write to memory function. No longer used. */ + /** Write to memory function. */ template Fault write(MemReqPtr &req, T &data) { @@ -420,11 +395,13 @@ class AlphaFullCPU : public FullO3CPU #endif #endif +#if FULL_SYSTEM + // @todo: Fix this LL/SC hack. 
if (req->flags & LOCKED) { if (req->flags & UNCACHEABLE) { req->result = 2; } else { - if (this->lockFlag/* && this->lockAddr == req->paddr*/) { + if (this->lockFlag) { req->result = 1; } else { req->result = 0; @@ -432,6 +409,7 @@ class AlphaFullCPU : public FullO3CPU } } } +#endif return this->mem->write(req, (T)htog(data)); } @@ -444,6 +422,7 @@ class AlphaFullCPU : public FullO3CPU } Addr lockAddr; + bool lockFlag; }; diff --git a/cpu/o3/alpha_cpu_builder.cc b/cpu/o3/alpha_cpu_builder.cc index d676a69c1..0f9116d71 100644 --- a/cpu/o3/alpha_cpu_builder.cc +++ b/cpu/o3/alpha_cpu_builder.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -61,6 +61,8 @@ SimObjectVectorParam workload; SimObjectParam mem; +SimObjectParam checker; + Param max_insts_any_thread; Param max_insts_all_threads; Param max_loads_any_thread; @@ -103,6 +105,8 @@ Param iewToCommitDelay; Param renameToROBDelay; Param commitWidth; Param squashWidth; +Param trapLatency; +Param fetchTrapLatency; Param localPredictorSize; Param localCtrBits; @@ -165,6 +169,8 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivAlphaFullCPU) INIT_PARAM_DFLT(mem, "Memory", NULL), + INIT_PARAM_DFLT(checker, "Checker CPU", NULL), + INIT_PARAM_DFLT(max_insts_any_thread, "Terminate when any thread reaches this inst count", 0), @@ -223,6 +229,8 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivAlphaFullCPU) INIT_PARAM(renameToROBDelay, "Rename to reorder buffer delay"), INIT_PARAM(commitWidth, "Commit width"), INIT_PARAM(squashWidth, "Squash width"), + INIT_PARAM_DFLT(trapLatency, "Number of cycles before the trap is handled", 6), + INIT_PARAM_DFLT(fetchTrapLatency, "Number of cycles before the fetch trap is handled", 12), INIT_PARAM(localPredictorSize, "Size of local predictor"), INIT_PARAM(localCtrBits, "Bits per counter"), @@ -301,12 +309,13 @@ 
CREATE_SIM_OBJECT(DerivAlphaFullCPU) params->dtb = dtb; #else params->workload = workload; - //@todo: change to pageTable // params->pTable = page_table; #endif // FULL_SYSTEM params->mem = mem; + params->checker = checker; + params->max_insts_any_thread = max_insts_any_thread; params->max_insts_all_threads = max_insts_all_threads; params->max_loads_any_thread = max_loads_any_thread; @@ -351,7 +360,8 @@ CREATE_SIM_OBJECT(DerivAlphaFullCPU) params->renameToROBDelay = renameToROBDelay; params->commitWidth = commitWidth; params->squashWidth = squashWidth; - + params->trapLatency = trapLatency; + params->fetchTrapLatency = fetchTrapLatency; params->localPredictorSize = localPredictorSize; params->localCtrBits = localCtrBits; diff --git a/cpu/o3/alpha_cpu_impl.hh b/cpu/o3/alpha_cpu_impl.hh index 7a2d5d2b9..856fcb1c8 100644 --- a/cpu/o3/alpha_cpu_impl.hh +++ b/cpu/o3/alpha_cpu_impl.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -30,10 +30,9 @@ #include "base/cprintf.hh" #include "base/statistics.hh" #include "base/timebuf.hh" +#include "cpu/checker/exec_context.hh" #include "cpu/quiesce_event.hh" -#include "mem/cache/cache.hh" // for dynamic cast #include "mem/mem_interface.hh" -#include "sim/builder.hh" #include "sim/sim_events.hh" #include "sim/stats.hh" @@ -63,11 +62,9 @@ AlphaFullCPU::AlphaFullCPU(Params *params) for (int i = 0; i < this->numThreads; ++i) { #if FULL_SYSTEM - assert(i == 0); + assert(this->numThreads == 1); this->thread[i] = new Thread(this, 0, params->mem); -// this->system->execContexts[i] = this->thread[i]->getXCProxy(); this->thread[i]->setStatus(ExecContext::Suspended); - #else if (i < params->workload.size()) { DPRINTF(FullCPU, "FullCPU: Workload[%i]'s starting PC is %#x, " @@ -91,19 +88,27 @@ AlphaFullCPU::AlphaFullCPU(Params *params) this->thread[i]->numInst = 0; - xcProxies.push_back(new AlphaXC); + ExecContext *xc_proxy; - xcProxies[i]->cpu = this; - xcProxies[i]->thread = this->thread[i]; + AlphaXC *alpha_xc_proxy = new AlphaXC; - xcProxies[i]->quiesceEvent = new EndQuiesceEvent(xcProxies[i]); - xcProxies[i]->lastActivate = 0; - xcProxies[i]->lastSuspend = 0; + if (params->checker) { + xc_proxy = new CheckerExecContext(alpha_xc_proxy, this->checker); + } else { + xc_proxy = alpha_xc_proxy; + } + alpha_xc_proxy->cpu = this; + alpha_xc_proxy->thread = this->thread[i]; - this->thread[i]->xcProxy = xcProxies[i]; + alpha_xc_proxy->quiesceEvent = + new EndQuiesceEvent(xc_proxy); + alpha_xc_proxy->lastActivate = 0; + alpha_xc_proxy->lastSuspend = 0; - this->execContexts.push_back(this->thread[i]->getXCProxy()); + this->thread[i]->xcProxy = xc_proxy; + + this->execContexts.push_back(xc_proxy); } @@ -144,6 +149,7 @@ template void AlphaFullCPU::AlphaXC::dumpFuncProfile() { + // Currently not supported } #endif @@ -167,6 +173,18 @@ AlphaFullCPU::AlphaXC::takeOverFrom(ExecContext *old_context) 
thread->funcExeInst = old_context->readFuncExeInst(); #endif + EndQuiesceEvent *other_quiesce = old_context->getQuiesceEvent(); + if (other_quiesce) { + // Point the quiesce event's XC at this XC so that it wakes up + // the proper CPU. + other_quiesce->xc = this; + } + if (thread->quiesceEvent) { + thread->quiesceEvent->xc = this; + } +// storeCondFailures = 0; + cpu->lockFlag = false; + old_context->setStatus(ExecContext::Unallocated); thread->inSyscall = false; @@ -178,7 +196,7 @@ void AlphaFullCPU::AlphaXC::activate(int delay) { DPRINTF(FullCPU, "Calling activate on AlphaXC\n"); -// warn("Calling activate on AlphaXC"); + if (thread->status() == ExecContext::Active) return; @@ -200,7 +218,7 @@ void AlphaFullCPU::AlphaXC::suspend() { DPRINTF(FullCPU, "Calling suspend on AlphaXC\n"); -// warn("Calling suspend on AlphaXC"); + if (thread->status() == ExecContext::Suspended) return; @@ -224,7 +242,7 @@ void AlphaFullCPU::AlphaXC::deallocate() { DPRINTF(FullCPU, "Calling deallocate on AlphaXC\n"); -// warn("Calling deallocate on AlphaXC"); + if (thread->status() == ExecContext::Unallocated) return; @@ -237,7 +255,7 @@ void AlphaFullCPU::AlphaXC::halt() { DPRINTF(FullCPU, "Calling halt on AlphaXC\n"); -// warn("Calling halt on AlphaXC"); + if (thread->status() == ExecContext::Halted) return; @@ -254,6 +272,7 @@ template void AlphaFullCPU::AlphaXC::serialize(std::ostream &os) {} + template void AlphaFullCPU::AlphaXC::unserialize(Checkpoint *cp, const std::string §ion) @@ -261,7 +280,7 @@ AlphaFullCPU::AlphaXC::unserialize(Checkpoint *cp, const std::string § #if FULL_SYSTEM template -Event * +EndQuiesceEvent * AlphaFullCPU::AlphaXC::getQuiesceEvent() { return quiesceEvent; @@ -345,9 +364,6 @@ void AlphaFullCPU::AlphaXC::clearArchRegs() {} -// -// New accessors for new decoder. 
-// template uint64_t AlphaFullCPU::AlphaXC::readIntReg(int reg_idx) @@ -503,26 +519,6 @@ AlphaFullCPU::AlphaXC::setSyscallReturn(SyscallReturn return_value) cpu->setSyscallReturn(return_value, thread->tid); } -template -void -AlphaFullCPU::syscall(int tid) -{ - DPRINTF(FullCPU, "AlphaFullCPU: [tid:%i] Executing syscall().\n\n", tid); - - DPRINTF(Activity,"Activity: syscall() called.\n"); - - // Temporarily increase this by one to account for the syscall - // instruction. - ++(this->thread[tid]->funcExeInst); - - // Execute the actual syscall. - this->thread[tid]->syscall(); - - // Decrease funcExeInst by one as the normal commit will handle - // incrementing it. - --(this->thread[tid]->funcExeInst); -} - #endif // FULL_SYSTEM template @@ -544,14 +540,7 @@ template Fault AlphaFullCPU::setMiscReg(int misc_reg, const MiscReg &val, unsigned tid) { - // I think that these registers should always be set, regardless of what - // mode the thread is in. The main difference is if the thread needs to - // squash as a result of the write, which is controlled by the AlphaXC. 
-// if (!this->thread[tid]->trapPending) { - return this->regFile.setMiscReg(misc_reg, val, tid); -// } else { -// return NoFault; -// } + return this->regFile.setMiscReg(misc_reg, val, tid); } template @@ -559,18 +548,13 @@ Fault AlphaFullCPU::setMiscRegWithEffect(int misc_reg, const MiscReg &val, unsigned tid) { -// if (!this->thread[tid]->trapPending) { - return this->regFile.setMiscRegWithEffect(misc_reg, val, tid); -// } else { -// return NoFault; -// } + return this->regFile.setMiscRegWithEffect(misc_reg, val, tid); } template void AlphaFullCPU::squashFromXC(unsigned tid) { -// this->thread[tid]->trapPending = true; this->thread[tid]->inSyscall = true; this->commit.generateXCEvent(tid); } @@ -585,7 +569,8 @@ AlphaFullCPU::post_interrupt(int int_num, int index) if (this->thread[0]->status() == ExecContext::Suspended) { DPRINTF(IPI,"Suspended Processor awoke\n"); - xcProxies[0]->activate(); +// xcProxies[0]->activate(); + this->execContexts[0]->activate(); } } @@ -607,31 +592,24 @@ template Fault AlphaFullCPU::hwrei(unsigned tid) { -#if 0 - if (!inPalMode(this->readPC(tid))) - return new AlphaISA::UnimplementedOpcodeFault; - - setNextPC(cpu->readMiscReg(AlphaISA::IPR_EXC_ADDR, tid), tid); - - cpu->kernelStats->hwrei(); - -// if ((this->regFile.miscRegs[tid].readReg(AlphaISA::IPR_EXC_ADDR) & 1) == 0) -// AlphaISA::swap_palshadow(®s, false); - - cpu->checkInterrupts = true; -#endif -// panic("Do not call this function!"); // Need to clear the lock flag upon returning from an interrupt. this->lockFlag = false; + + this->kernelStats->hwrei(); + + this->checkInterrupts = true; + // FIXME: XXX check for interrupts? 
XXX return NoFault; } template bool -AlphaFullCPU::simPalCheck(int palFunc) +AlphaFullCPU::simPalCheck(int palFunc, unsigned tid) { -// kernelStats.callpal(palFunc); + if (this->kernelStats) + this->kernelStats->callpal(palFunc, + this->execContexts[tid]); switch (palFunc) { case PAL::halt: @@ -650,47 +628,11 @@ AlphaFullCPU::simPalCheck(int palFunc) return true; } -// Probably shouldn't be able to switch to the trap handler as quickly as -// this. Also needs to get the exception restart address from the commit -// stage. template void AlphaFullCPU::trap(Fault fault, unsigned tid) { - - fault->invoke(this->xcProxies[tid]); -/* // Keep in mind that a trap may be initiated by fetch if there's a TLB - // miss - uint64_t PC = this->commit.readCommitPC(); - - DPRINTF(Fault, "Fault %s\n", fault->name()); - this->recordEvent(csprintf("Fault %s", fault->name())); - - //kernelStats.fault(fault); - - if (fault->isA()) - panic("Arithmetic traps are unimplemented!"); - - // exception restart address - Get the commit PC - if (!fault->isA() || !inPalMode(PC)) - this->regFile.miscRegs.setReg(AlphaISA::IPR_EXC_ADDR, PC); - - if (fault->isA() || fault->isA()) - // || fault == InterruptFault && !PC_PAL(regs.pc) - { - // traps... skip faulting instruction - AlphaISA::MiscReg ipr_exc_addr = - this->regFile.miscRegs.readReg(AlphaISA::IPR_EXC_ADDR); - this->regFile.miscRegs.setReg(AlphaISA::IPR_EXC_ADDR, - ipr_exc_addr + 4); - } - - if (!inPalMode(PC)) - swapPALShadow(true); - - this->regFile.setPC(this->regFile.miscRegs.readReg(AlphaISA::IPR_PAL_BASE) + - (dynamic_cast(fault.get()))->vect(), 0); - this->regFile.setNextPC(PC + sizeof(MachInst), 0);*/ + fault->invoke(this->execContexts[tid]); } template @@ -700,6 +642,8 @@ AlphaFullCPU::processInterrupts() // Check for interrupts here. For now can copy the code that // exists within isa_fullsys_traits.hh. Also assume that thread 0 // is the one that handles the interrupts. + // @todo: Possibly consolidate the interrupt checking code. 
+ // @todo: Allow other threads to handle interrupts. // Check if there are any outstanding interrupts //Handle the interrupts @@ -738,6 +682,10 @@ AlphaFullCPU::processInterrupts() if (ipl && ipl > this->readMiscReg(IPR_IPLR, 0)) { this->setMiscReg(IPR_ISR, summary, 0); this->setMiscReg(IPR_INTID, ipl, 0); + if (this->checker) { + this->checker->cpuXCBase()->setMiscReg(IPR_ISR, summary); + this->checker->cpuXCBase()->setMiscReg(IPR_INTID, ipl); + } this->trap(Fault(new InterruptFault), 0); DPRINTF(Flow, "Interrupt! IPLR=%d ipl=%d summary=%x\n", this->readMiscReg(IPR_IPLR, 0), ipl, summary); @@ -747,6 +695,27 @@ AlphaFullCPU::processInterrupts() #endif // FULL_SYSTEM #if !FULL_SYSTEM + +template +void +AlphaFullCPU::syscall(int tid) +{ + DPRINTF(FullCPU, "AlphaFullCPU: [tid:%i] Executing syscall().\n\n", tid); + + DPRINTF(Activity,"Activity: syscall() called.\n"); + + // Temporarily increase this by one to account for the syscall + // instruction. + ++(this->thread[tid]->funcExeInst); + + // Execute the actual syscall. + this->thread[tid]->syscall(); + + // Decrease funcExeInst by one as the normal commit will handle + // incrementing it. + --(this->thread[tid]->funcExeInst); +} + template TheISA::IntReg AlphaFullCPU::getSyscallArg(int i, int tid) diff --git a/cpu/o3/alpha_dyn_inst.hh b/cpu/o3/alpha_dyn_inst.hh index 24774bd0a..1c5b738aa 100644 --- a/cpu/o3/alpha_dyn_inst.hh +++ b/cpu/o3/alpha_dyn_inst.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -35,14 +35,11 @@ #include "cpu/o3/alpha_impl.hh" /** - * Mostly implementation & ISA specific AlphaDynInst. As with most other classes - * in the new CPU model, it is templated on the Impl to allow for passing in of - * all types, such as the CPU type and the ISA type. 
The AlphaDynInst serves - * as the primary interface to the CPU; it plays the role that the ExecContext - * does for the old CPU and the SimpleCPU. The goal is to abstract ExecContext - * purely into an interface, and have it forward calls to the appropriate - * CPU interface, which in the new CPU model's case would be this AlphaDynInst, - * or any other high level implementation specific DynInst. + * Mostly implementation & ISA specific AlphaDynInst. As with most + * other classes in the new CPU model, it is templated on the Impl to + * allow for passing in of all types, such as the CPU type and the ISA + * type. The AlphaDynInst serves as the primary interface to the CPU + * for instructions that are executing. */ template class AlphaDynInst : public BaseDynInst @@ -78,8 +75,10 @@ class AlphaDynInst : public BaseDynInst /** Executes the instruction.*/ Fault execute(); + /** Initiates the access. Only valid for memory operations. */ Fault initiateAcc(); + /** Completes the access. Only valid for memory operations. */ Fault completeAcc(); private: @@ -100,6 +99,7 @@ class AlphaDynInst : public BaseDynInst Fault setMiscReg(int misc_reg, const MiscReg &val) { + this->instResult.integer = val; return this->cpu->setMiscReg(misc_reg, val, this->threadNumber); } @@ -126,8 +126,6 @@ class AlphaDynInst : public BaseDynInst void syscall(); #endif - - private: /** Physical register index of the destination registers of this * instruction. @@ -247,9 +245,9 @@ class AlphaDynInst : public BaseDynInst } public: - /** Calculates EA part of a memory instruction. Currently unused, though - * it may be useful in the future when memory instructions aren't - * executed with the EA calculation and the memory access being atomic. + /** Calculates EA part of a memory instruction. Currently unused, + * though it may be useful in the future if we want to split + * memory operations into EA calculation and memory access parts. 
*/ Fault calcEA() { @@ -257,8 +255,8 @@ class AlphaDynInst : public BaseDynInst } /** Does the memory access part of a memory instruction. Currently unused, - * though it may be useful in the future when memory instructions aren't - * executed with the EA calculation and the memory access being atomic. + * though it may be useful in the future if we want to split + * memory operations into EA calculation and memory access parts. */ Fault memAccess() { diff --git a/cpu/o3/alpha_dyn_inst_impl.hh b/cpu/o3/alpha_dyn_inst_impl.hh index b5999f8d1..541d5ab82 100644 --- a/cpu/o3/alpha_dyn_inst_impl.hh +++ b/cpu/o3/alpha_dyn_inst_impl.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -124,13 +124,9 @@ AlphaDynInst::hwrei() this->setNextPC(this->cpu->readMiscReg(AlphaISA::IPR_EXC_ADDR, this->threadNumber)); - this->cpu->kernelStats->hwrei(); - // Tell CPU to clear any state it needs to if a hwrei is taken. this->cpu->hwrei(this->threadNumber); - this->cpu->checkInterrupts = true; - // FIXME: XXX check for interrupts? XXX return NoFault; } @@ -167,7 +163,7 @@ template bool AlphaDynInst::simPalCheck(int palFunc) { - return this->cpu->simPalCheck(palFunc); + return this->cpu->simPalCheck(palFunc, this->threadNumber); } #else template diff --git a/cpu/o3/alpha_params.hh b/cpu/o3/alpha_params.hh index 04b790815..b8ebae21e 100644 --- a/cpu/o3/alpha_params.hh +++ b/cpu/o3/alpha_params.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -62,6 +62,8 @@ class AlphaSimpleParams : public BaseFullCPU::Params FunctionalMemory *mem; + BaseCPU *checker; + // // Caches // @@ -117,6 +119,8 @@ class AlphaSimpleParams : public BaseFullCPU::Params unsigned renameToROBDelay; unsigned commitWidth; unsigned squashWidth; + Tick trapLatency; + Tick fetchTrapLatency; // // Branch predictor (BP & BTB) diff --git a/cpu/o3/commit.hh b/cpu/o3/commit.hh index 028bd5295..73eccd2b0 100644 --- a/cpu/o3/commit.hh +++ b/cpu/o3/commit.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -40,25 +40,27 @@ template class O3ThreadState; /** - * DefaultCommit handles single threaded and SMT commit. Its width is specified - * by the parameters; each cycle it tries to commit that many instructions. The - * SMT policy decides which thread it tries to commit instructions from. Non- - * speculative instructions must reach the head of the ROB before they are - * ready to execute; once they reach the head, commit will broadcast the - * instruction's sequence number to the previous stages so that they can issue/ - * execute the instruction. Only one non-speculative instruction is handled per - * cycle. Commit is responsible for handling all back-end initiated redirects. - * It receives the redirect, and then broadcasts it to all stages, indicating - * the sequence number they should squash until, and any necessary branch mis- - * prediction information as well. It priortizes redirects by instruction's age, - * only broadcasting a redirect if it corresponds to an instruction that should - * currently be in the ROB. 
This is done by tracking the sequence number of the - * youngest instruction in the ROB, which gets updated to any squashing - * instruction's sequence number, and only broadcasting a redirect if it - * corresponds to an older instruction. Commit also supports multiple cycle - * squashing, to model a ROB that can only remove a certain number of - * instructions per cycle. Eventually traps and interrupts will most likely - * be handled here as well. + * DefaultCommit handles single threaded and SMT commit. Its width is + * specified by the parameters; each cycle it tries to commit that + * many instructions. The SMT policy decides which thread it tries to + * commit instructions from. Non- speculative instructions must reach + * the head of the ROB before they are ready to execute; once they + * reach the head, commit will broadcast the instruction's sequence + * number to the previous stages so that they can issue/ execute the + * instruction. Only one non-speculative instruction is handled per + * cycle. Commit is responsible for handling all back-end initiated + * redirects. It receives the redirect, and then broadcasts it to all + * stages, indicating the sequence number they should squash until, + * and any necessary branch misprediction information as well. It + * priortizes redirects by instruction's age, only broadcasting a + * redirect if it corresponds to an instruction that should currently + * be in the ROB. This is done by tracking the sequence number of the + * youngest instruction in the ROB, which gets updated to any + * squashing instruction's sequence number, and only broadcasting a + * redirect if it corresponds to an older instruction. Commit also + * supports multiple cycle squashing, to model a ROB that can only + * remove a certain number of instructions per cycle. Eventually traps + * and interrupts will most likely be handled here as well. 
*/ template class DefaultCommit @@ -78,6 +80,7 @@ class DefaultCommit typedef typename CPUPol::IEWStruct IEWStruct; typedef typename CPUPol::RenameStruct RenameStruct; + typedef typename CPUPol::Fetch Fetch; typedef typename CPUPol::IEW IEW; typedef O3ThreadState Thread; @@ -155,11 +158,16 @@ class DefaultCommit /** Sets the pointer to the queue coming from IEW. */ void setIEWQueue(TimeBuffer *iq_ptr); + void setFetchStage(Fetch *fetch_stage); + + Fetch *fetchStage; + /** Sets the poitner to the IEW stage. */ void setIEWStage(IEW *iew_stage); - /** The pointer to the IEW stage. Used solely to ensure that syscalls do - * not execute until all stores have written back. + /** The pointer to the IEW stage. Used solely to ensure that + * various events (traps, interrupts, syscalls) do not occur until + * all stores have written back. */ IEW *iewStage; @@ -177,6 +185,8 @@ class DefaultCommit void switchOut(); + void doSwitchOut(); + void takeOverFrom(); /** Ticks the commit stage, which tries to commit instructions. */ @@ -213,13 +223,12 @@ class DefaultCommit */ bool changedROBEntries(); + void squashAll(unsigned tid); + void squashFromTrap(unsigned tid); void squashFromXC(unsigned tid); - void squashInFlightInsts(unsigned tid); - - private: /** Commits as many instructions as possible. */ void commitInsts(); @@ -246,8 +255,10 @@ class DefaultCommit int oldestReady(); public: - /** Returns the PC of the head instruction of the ROB. */ - uint64_t readPC(); + /** Returns the PC of the head instruction of the ROB. + * @todo: Probably remove this function as it returns only thread 0. + */ + uint64_t readPC() { return PC[0]; } uint64_t readPC(unsigned tid) { return PC[tid]; } @@ -257,9 +268,6 @@ class DefaultCommit void setNextPC(uint64_t val, unsigned tid) { nextPC[tid] = val; } - /** Sets that the ROB is currently squashing. */ - void setSquashing(unsigned tid); - private: /** Time buffer interface. 
*/ TimeBuffer *timeBuffer; @@ -299,10 +307,10 @@ class DefaultCommit std::vector thread; - private: Fault fetchFault; - InstSeqNum fetchFaultSN; + int fetchTrapWait; + /** Records that commit has written to the time buffer this cycle. Used for * the CPU to determine if it can deschedule itself if there is no activity. */ @@ -355,11 +363,13 @@ class DefaultCommit /** Number of Active Threads */ unsigned numThreads; + bool switchPending; bool switchedOut; Tick trapLatency; Tick fetchTrapLatency; + Tick fetchFaultTick; Addr PC[Impl::MaxThreads]; @@ -390,27 +400,26 @@ class DefaultCommit * speculative instruction reaching the head of the ROB. */ Stats::Scalar<> commitNonSpecStalls; - /** Stat for the total number of committed branches. */ -// Stats::Scalar<> commitCommittedBranches; - /** Stat for the total number of committed loads. */ -// Stats::Scalar<> commitCommittedLoads; - /** Stat for the total number of committed memory references. */ -// Stats::Scalar<> commitCommittedMemRefs; /** Stat for the total number of branch mispredicts that caused a squash. */ Stats::Scalar<> branchMispredicts; /** Distribution of the number of committed instructions each cycle. */ Stats::Distribution<> numCommittedDist; - // total number of instructions committed - Stats::Vector<> stat_com_inst; - Stats::Vector<> stat_com_swp; - Stats::Vector<> stat_com_refs; - Stats::Vector<> stat_com_loads; - Stats::Vector<> stat_com_membars; - Stats::Vector<> stat_com_branches; + /** Total number of instructions committed. */ + Stats::Vector<> statComInst; + /** Total number of software prefetches committed. */ + Stats::Vector<> statComSwp; + /** Stat for the total number of committed memory references. */ + Stats::Vector<> statComRefs; + /** Stat for the total number of committed loads. */ + Stats::Vector<> statComLoads; + /** Total number of committed memory barriers. */ + Stats::Vector<> statComMembars; + /** Total number of committed branches. 
*/ + Stats::Vector<> statComBranches; - Stats::Scalar<> commit_eligible_samples; - Stats::Vector<> commit_eligible; + Stats::Scalar<> commitEligibleSamples; + Stats::Vector<> commitEligible; }; #endif // __CPU_O3_COMMIT_HH__ diff --git a/cpu/o3/commit_impl.hh b/cpu/o3/commit_impl.hh index 034565f90..170f5b01f 100644 --- a/cpu/o3/commit_impl.hh +++ b/cpu/o3/commit_impl.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -36,6 +36,7 @@ #include "base/loader/symtab.hh" #include "base/timebuf.hh" +#include "cpu/checker/cpu.hh" #include "cpu/exetrace.hh" #include "cpu/o3/commit.hh" #include "cpu/o3/thread_state.hh" @@ -54,7 +55,8 @@ template void DefaultCommit::TrapEvent::process() { - // This will get reset if it was switched out. + // This will get reset by commit if it was switched out at the + // time of this event processing. commit->trapSquash[tid] = true; } @@ -77,7 +79,9 @@ DefaultCommit::DefaultCommit(Params *params) iewWidth(params->executeWidth), commitWidth(params->commitWidth), numThreads(params->numberOfThreads), - switchedOut(false) + switchedOut(false), + trapLatency(params->trapLatency), + fetchTrapLatency(params->fetchTrapLatency) { _status = Active; _nextStatus = Inactive; @@ -117,9 +121,6 @@ DefaultCommit::DefaultCommit(Params *params) xcSquash[i] = false; } - // Hardcoded trap latency. 
- trapLatency = 6; - fetchTrapLatency = 12; fetchFaultTick = 0; fetchTrapWait = 0; } @@ -153,20 +154,6 @@ DefaultCommit::regStats() .desc("The number of times commit has been forced to stall to " "communicate backwards") .prereq(commitNonSpecStalls); -/* - commitCommittedBranches - .name(name() + ".commitCommittedBranches") - .desc("The number of committed branches") - .prereq(commitCommittedBranches); - commitCommittedLoads - .name(name() + ".commitCommittedLoads") - .desc("The number of committed loads") - .prereq(commitCommittedLoads); - commitCommittedMemRefs - .name(name() + ".commitCommittedMemRefs") - .desc("The number of committed memory references") - .prereq(commitCommittedMemRefs); -*/ branchMispredicts .name(name() + ".branchMispredicts") .desc("The number of times a branch was mispredicted") @@ -178,42 +165,42 @@ DefaultCommit::regStats() .flags(Stats::pdf) ; - stat_com_inst + statComInst .init(cpu->number_of_threads) .name(name() + ".COM:count") .desc("Number of instructions committed") .flags(total) ; - stat_com_swp + statComSwp .init(cpu->number_of_threads) .name(name() + ".COM:swp_count") .desc("Number of s/w prefetches committed") .flags(total) ; - stat_com_refs + statComRefs .init(cpu->number_of_threads) .name(name() + ".COM:refs") .desc("Number of memory references committed") .flags(total) ; - stat_com_loads + statComLoads .init(cpu->number_of_threads) .name(name() + ".COM:loads") .desc("Number of loads committed") .flags(total) ; - stat_com_membars + statComMembars .init(cpu->number_of_threads) .name(name() + ".COM:membars") .desc("Number of memory barriers committed") .flags(total) ; - stat_com_branches + statComBranches .init(cpu->number_of_threads) .name(name() + ".COM:branches") .desc("Number of branches committed") @@ -233,14 +220,14 @@ DefaultCommit::regStats() // -> The standard deviation is computed only over cycles where // we reached the BW limit // - commit_eligible + commitEligible .init(cpu->number_of_threads) .name(name() + 
".COM:bw_limited") .desc("number of insts not committed due to BW limits") .flags(total) ; - commit_eligible_samples + commitEligibleSamples .name(name() + ".COM:bw_lim_events") .desc("number cycles where commit BW limit reached") ; @@ -257,8 +244,8 @@ DefaultCommit::setCPU(FullCPU *cpu_ptr) // the simulation, so it starts as active. cpu->activateStage(FullCPU::CommitIdx); - trapLatency = cpu->cycles(6); - fetchTrapLatency = cpu->cycles(12); + trapLatency = cpu->cycles(trapLatency); + fetchTrapLatency = cpu->cycles(fetchTrapLatency); } template @@ -315,6 +302,13 @@ DefaultCommit::setIEWQueue(TimeBuffer *iq_ptr) fromIEW = iewQueue->getWire(-iewToCommitDelay); } +template +void +DefaultCommit::setFetchStage(Fetch *fetch_stage) +{ + fetchStage = fetch_stage; +} + template void DefaultCommit::setIEWStage(IEW *iew_stage) @@ -369,6 +363,15 @@ template void DefaultCommit::switchOut() { + switchPending = true; +} + +template +void +DefaultCommit::doSwitchOut() +{ + switchedOut = true; + switchPending = false; rob->switchOut(); } @@ -376,6 +379,7 @@ template void DefaultCommit::takeOverFrom() { + switchedOut = false; _status = Active; _nextStatus = Inactive; for (int i=0; i < numThreads; i++) { @@ -392,9 +396,17 @@ template void DefaultCommit::updateStatus() { - if (commitStatus[0] == TrapPending || - commitStatus[0] == FetchTrapPending) { - _nextStatus = Active; + // reset ROB changed variable + list::iterator threads = (*activeThreads).begin(); + while (threads != (*activeThreads).end()) { + unsigned tid = *threads++; + changedROBNumEntries[tid] = false; + + // Also check if any of the threads has a trap pending + if (commitStatus[tid] == TrapPending || + commitStatus[tid] == FetchTrapPending) { + _nextStatus = Active; + } } if (_nextStatus == Inactive && _status == Active) { @@ -406,13 +418,6 @@ DefaultCommit::updateStatus() } _status = _nextStatus; - - // reset ROB changed variable - list::iterator threads = (*activeThreads).begin(); - while (threads != 
(*activeThreads).end()) { - unsigned tid = *threads++; - changedROBNumEntries[tid] = false; - } } template @@ -488,64 +493,8 @@ DefaultCommit::generateXCEvent(unsigned tid) template void -DefaultCommit::squashFromTrap(unsigned tid) +DefaultCommit::squashAll(unsigned tid) { - // If we want to include the squashing instruction in the squash, - // then use one older sequence number. - // Hopefully this doesn't mess things up. Basically I want to squash - // all instructions of this thread. - InstSeqNum squashed_inst = rob->isEmpty() ? - 0 : rob->readHeadInst(tid)->seqNum - 1; - - // All younger instructions will be squashed. Set the sequence - // number as the youngest instruction in the ROB (0 in this case. - // Hopefully nothing breaks.) - youngestSeqNum[tid] = 0; - - rob->squash(squashed_inst, tid); - changedROBNumEntries[tid] = true; - - // Send back the sequence number of the squashed instruction. - toIEW->commitInfo[tid].doneSeqNum = squashed_inst; - - // Send back the squash signal to tell stages that they should - // squash. - toIEW->commitInfo[tid].squash = true; - - // Send back the rob squashing signal so other stages know that - // the ROB is in the process of squashing. - toIEW->commitInfo[tid].robSquashing = true; - - toIEW->commitInfo[tid].branchMispredict = false; - -// toIEW->commitInfo[tid].branchTaken = fromIEW->branchTaken[tid]; - - toIEW->commitInfo[tid].nextPC = PC[tid]; - - DPRINTF(Commit, "Squashing from trap, restarting at PC %#x\n", PC[tid]); - // Hopefully nobody tries to use the mispredPC becuase I said there - // wasn't a branch mispredict. -// toIEW->commitInfo[tid].mispredPC = fromIEW->mispredPC[tid]; - - thread[tid]->trapPending = false; - thread[tid]->inSyscall = false; - - trapSquash[tid] = false; - - // Not sure what to set this to... - commitStatus[tid] = ROBSquashing; - cpu->activityThisCycle(); - - ++squashCounter; -} - -template -void -DefaultCommit::squashFromXC(unsigned tid) -{ - // For now these are identical. 
In the future, the squash from trap - // might execute the trap prior to the squash. - // If we want to include the squashing instruction in the squash, // then use one older sequence number. // Hopefully this doesn't mess things up. Basically I want to squash @@ -574,18 +523,39 @@ DefaultCommit::squashFromXC(unsigned tid) toIEW->commitInfo[tid].branchMispredict = false; -// toIEW->commitInfo[tid].branchTaken = fromIEW->branchTaken[tid]; - toIEW->commitInfo[tid].nextPC = PC[tid]; +} + +template +void +DefaultCommit::squashFromTrap(unsigned tid) +{ + squashAll(tid); + + DPRINTF(Commit, "Squashing from trap, restarting at PC %#x\n", PC[tid]); + + thread[tid]->trapPending = false; + thread[tid]->inSyscall = false; + + trapSquash[tid] = false; + + commitStatus[tid] = ROBSquashing; + cpu->activityThisCycle(); + + ++squashCounter; +} + +template +void +DefaultCommit::squashFromXC(unsigned tid) +{ + squashAll(tid); DPRINTF(Commit, "Squashing from XC, restarting at PC %#x\n", PC[tid]); - // Hopefully nobody tries to use the mispredPC becuase I said there - // wasn't a branch mispredict. -// toIEW->commitInfo[tid].mispredPC = fromIEW->mispredPC[tid]; thread[tid]->inSyscall = false; assert(!thread[tid]->trapPending); - // Not sure what to set this to... + commitStatus[tid] = ROBSquashing; cpu->activityThisCycle(); @@ -594,22 +564,6 @@ DefaultCommit::squashFromXC(unsigned tid) ++squashCounter; } -template -void -DefaultCommit::squashInFlightInsts(unsigned tid) -{ - // @todo: Fix this hardcoded number. - for (int i = 0; i < -5; ++i) { - for (int j = 0; j < (*iewQueue)[i].size; ++j) { - DynInstPtr inst = (*iewQueue)[i].insts[j]; - if (inst->threadNumber == tid && - !inst->isSquashed()) { - inst->setSquashed(); - } - } - } -} - template void DefaultCommit::tick() @@ -617,13 +571,15 @@ DefaultCommit::tick() wroteToTimeBuffer = false; _nextStatus = Inactive; - // If the ROB is currently in its squash sequence, then continue - // to squash. 
In this case, commit does not do anything. Otherwise - // run commit. + if (switchPending && rob->isEmpty() && !iewStage->hasStoresToWB()) { + cpu->signalSwitched(); + return; + } + list::iterator threads = (*activeThreads).begin(); - // Maybe this should be dependent upon any of the commits actually - // squashing. + // Check if any of the threads are done squashing. Change the + // status if they are done. while (threads != (*activeThreads).end()) { unsigned tid = *threads++; @@ -673,7 +629,7 @@ DefaultCommit::tick() if (wroteToTimeBuffer) { - DPRINTF(Activity,"Activity This Cycle.\n"); + DPRINTF(Activity, "Activity This Cycle.\n"); cpu->activityThisCycle(); } @@ -689,28 +645,23 @@ DefaultCommit::commit() // Check for interrupts ////////////////////////////////////// - // Process interrupts if interrupts are enabled and not in PAL mode. - // Take the PC from commit and write it to the IPR, then squash. The - // interrupt completing will take care of restoring the PC from that value - // in the IPR. Look at IPR[EXC_ADDR]; - // hwrei() is what resets the PC to the place where instruction execution - // beings again. #if FULL_SYSTEM -//#if 0 + // Process interrupts if interrupts are enabled, not in PAL mode, + // and no other traps or external squashes are currently pending. + // @todo: Allow other threads to handle interrupts. if (cpu->checkInterrupts && cpu->check_interrupts() && !cpu->inPalMode(readPC()) && !trapSquash[0] && !xcSquash[0]) { -// commitStatus[0] = TrapPending; + // Tell fetch that there is an interrupt pending. This will + // make fetch wait until it sees a non PAL-mode PC, at which + // point it stops fetching instructions. toIEW->commitInfo[0].interruptPending = true; - if (rob->isEmpty() && !iewStage->hasStoresToWB()) { - // Will need to squash all instructions currently in flight and have - // the interrupt handler restart at the last non-committed inst. - // Most of that can be handled through the trap() function. 
The - // processInterrupts() function really just checks for interrupts - // and then calls trap() if there is an interrupt present. + // Wait until the ROB is empty and all stores have drained in + // order to enter the interrupt. + if (rob->isEmpty() && !iewStage->hasStoresToWB()) { // Not sure which thread should be the one to interrupt. For now // always do thread 0. assert(!thread[0]->inSyscall); @@ -738,26 +689,27 @@ DefaultCommit::commit() #endif // FULL_SYSTEM //////////////////////////////////// - // Check for squash signal, handle that first + // Check for any possible squashes, handle them first //////////////////////////////////// - // Check if the IEW stage is telling the ROB to squash. list::iterator threads = (*activeThreads).begin(); while (threads != (*activeThreads).end()) { unsigned tid = *threads++; if (fromFetch->fetchFault && commitStatus[0] != TrapPending) { - // Record the fault. Wait until it's empty in the ROB. Then handle the trap. - // Ignore it if there's already a trap pending as fetch will be redirected. + // Record the fault. Wait until it's empty in the ROB. + // Then handle the trap. Ignore it if there's already a + // trap pending as fetch will be redirected. fetchFault = fromFetch->fetchFault; - fetchFaultSN = fromFetch->fetchFaultSN; fetchFaultTick = curTick + fetchTrapLatency; commitStatus[0] = FetchTrapPending; DPRINTF(Commit, "Fault from fetch recorded. Will trap if the " "ROB empties without squashing the fault.\n"); fetchTrapWait = 0; } + + // Fetch may tell commit to clear the trap if it's been squashed. 
if (fromFetch->clearFetchFault) { DPRINTF(Commit, "Received clear fetch fault signal\n"); fetchTrapWait = 0; @@ -783,10 +735,6 @@ DefaultCommit::commit() commitStatus[tid] != TrapPending && fromIEW->squashedSeqNum[tid] <= youngestSeqNum[tid]) { - DPRINTF(Commit, "[tid:%u]: Squashing instructions in the " - "ROB.\n", - tid); - DPRINTF(Commit, "[tid:%i]: Squashing due to PC %#x [sn:%i]\n", tid, fromIEW->mispredPC[tid], @@ -814,11 +762,8 @@ DefaultCommit::commit() rob->squash(squashed_inst, tid); changedROBNumEntries[tid] = true; - // Send back the sequence number of the squashed instruction. toIEW->commitInfo[tid].doneSeqNum = squashed_inst; - // Send back the squash signal to tell stages that they should - // squash. toIEW->commitInfo[tid].squash = true; // Send back the rob squashing signal so other stages know that @@ -833,11 +778,7 @@ DefaultCommit::commit() toIEW->commitInfo[tid].nextPC = fromIEW->nextPC[tid]; - DPRINTF(Commit, "Squashing from IEW, restarting at PC %#x\n", - fromIEW->nextPC[tid]); - - toIEW->commitInfo[tid].mispredPC = - fromIEW->mispredPC[tid]; + toIEW->commitInfo[tid].mispredPC = fromIEW->mispredPC[tid]; if (toIEW->commitInfo[tid].branchMispredict) { ++branchMispredicts; @@ -882,10 +823,11 @@ DefaultCommit::commitInsts() { //////////////////////////////////// // Handle commit - // Note that commit will be handled prior to the ROB so that the ROB - // only tries to commit instructions it has in this current cycle, and - // not instructions it is writing in during this cycle. - // Can't commit and squash things at the same time... + // Note that commit will be handled prior to putting new + // instructions in the ROB so that the ROB only tries to commit + // instructions it has in this current cycle, and not instructions + // it is writing in during this cycle. Can't commit and squash + // things at the same time... 
//////////////////////////////////// DPRINTF(Commit, "Trying to commit instructions in the ROB.\n"); @@ -894,51 +836,58 @@ DefaultCommit::commitInsts() DynInstPtr head_inst; #if FULL_SYSTEM - if (commitStatus[0] == FetchTrapPending) { + // Not the best way to check if the front end is empty, but it should + // work. + // @todo: Try to avoid directly accessing fetch. + if (commitStatus[0] == FetchTrapPending && rob->isEmpty()) { DPRINTF(Commit, "Fault from fetch is pending.\n"); - if (rob->isEmpty()) { - fetchTrapWait++; - if (fetchTrapWait > 10000000) { - panic("Fetch trap has been pending for a long time!"); - } - if (fetchFaultTick > curTick) { - DPRINTF(Commit, "Not enough cycles since fault, fault will " - "happen on %lli\n", - fetchFaultTick); - cpu->activityThisCycle(); - return; - } else if (iewStage->hasStoresToWB()) { - DPRINTF(Commit, "IEW still has stores to WB. Waiting until " - "they are completed. fetchTrapWait:%i\n", - fetchTrapWait); - cpu->activityThisCycle(); - return; - } else if (cpu->inPalMode(readPC())) { - DPRINTF(Commit, "In pal mode right now. fetchTrapWait:%i\n", - fetchTrapWait); - return; - } - fetchTrapWait = 0; - DPRINTF(Commit, "ROB is empty, handling fetch trap.\n"); - assert(!thread[0]->inSyscall); - - thread[0]->inSyscall = true; - - // Consider holding onto the trap and waiting until the trap event - // happens for this to be executed. - cpu->trap(fetchFault, 0); - - // Exit state update mode to avoid accidental updating. - thread[0]->inSyscall = false; - - commitStatus[0] = TrapPending; - // Set it up so that we squash next cycle - trapSquash[0] = true; + fetchTrapWait++; + if (fetchTrapWait > 10000000) { + panic("Fetch trap has been pending for a long time!"); + } + if (fetchFaultTick > curTick) { + DPRINTF(Commit, "Not enough cycles since fault, fault will " + "happen on %lli\n", + fetchFaultTick); + cpu->activityThisCycle(); + return; + } else if (iewStage->hasStoresToWB()) { + DPRINTF(Commit, "IEW still has stores to WB. 
Waiting until " + "they are completed. fetchTrapWait:%i\n", + fetchTrapWait); + cpu->activityThisCycle(); + return; + } else if (cpu->inPalMode(readPC())) { + DPRINTF(Commit, "In pal mode right now. fetchTrapWait:%i\n", + fetchTrapWait); + return; + } else if (fetchStage->getYoungestSN() > youngestSeqNum[0]) { + DPRINTF(Commit, "Waiting for front end to drain. fetchTrapWait:%i\n", + fetchTrapWait); return; } + fetchTrapWait = 0; + DPRINTF(Commit, "ROB is empty, handling fetch trap.\n"); + + assert(!thread[0]->inSyscall); + + thread[0]->inSyscall = true; + + // Consider holding onto the trap and waiting until the trap event + // happens for this to be executed. + cpu->trap(fetchFault, 0); + + // Exit state update mode to avoid accidental updating. + thread[0]->inSyscall = false; + + commitStatus[0] = TrapPending; + // Set it up so that we squash next cycle + trapSquash[0] = true; + return; } #endif + // Commit as many instructions as possible until the commit bandwidth // limit is reached, or it becomes impossible to commit any more. while (num_committed < commitWidth) { @@ -956,16 +905,13 @@ DefaultCommit::commitInsts() DPRINTF(Commit, "Trying to commit head instruction, [sn:%i] [tid:%i]\n", head_inst->seqNum, tid); - // If the head instruction is squashed, it is ready to retire at any - // time. However, we need to avoid updating any other state - // incorrectly if it's already been squashed. + // If the head instruction is squashed, it is ready to retire + // (be removed from the ROB) at any time. if (head_inst->isSquashed()) { DPRINTF(Commit, "Retiring squashed instruction from " "ROB.\n"); - // Tell ROB to retire head instruction. This retires the head - // inst in the ROB without affecting any other stages. rob->retireHead(commit_thread); ++commitSquashedInsts; @@ -989,7 +935,6 @@ DefaultCommit::commitInsts() if (commit_success) { ++num_committed; - // Record that the number of ROB entries has changed. 
changedROBNumEntries[tid] = true; // Set the doneSeqNum to the youngest committed instruction. @@ -1009,8 +954,11 @@ DefaultCommit::commitInsts() int count = 0; Addr oldpc; do { + // Debug statement. Checks to make sure we're not + // currently updating state while handling PC events. if (count == 0) - assert(!thread[tid]->inSyscall && !thread[tid]->trapPending); + assert(!thread[tid]->inSyscall && + !thread[tid]->trapPending); oldpc = PC[tid]; cpu->system->pcEventQueue.service( thread[tid]->getXCProxy()); @@ -1034,7 +982,7 @@ DefaultCommit::commitInsts() numCommittedDist.sample(num_committed); if (num_committed == commitWidth) { - commit_eligible[0]++; + commitEligible[0]++; } } @@ -1042,13 +990,12 @@ template bool DefaultCommit::commitHead(DynInstPtr &head_inst, unsigned inst_num) { - // Make sure instruction is valid assert(head_inst); int tid = head_inst->threadNumber; - // If the instruction is not executed yet, then it is a non-speculative - // or store inst. Signal backwards that it should be executed. + // If the instruction is not executed yet, then it will need extra + // handling. Signal backwards that it should be executed. if (!head_inst->isExecuted()) { // Keep this number correct. We have not yet actually executed // and committed this instruction. @@ -1059,10 +1006,16 @@ DefaultCommit::commitHead(DynInstPtr &head_inst, unsigned inst_num) if (head_inst->isNonSpeculative() || head_inst->isMemBarrier() || head_inst->isWriteBarrier()) { + + DPRINTF(Commit, "Encountered a barrier or non-speculative " + "instruction [sn:%lli] at the head of the ROB, PC %#x.\n", + head_inst->seqNum, head_inst->readPC()); + #if !FULL_SYSTEM - // Hack to make sure syscalls aren't executed until all stores - // write back their data. This direct communication shouldn't - // be used for anything other than this. + // Hack to make sure syscalls/memory barriers/quiesces + // aren't executed until all stores write back their data. 
+ // This direct communication shouldn't be used for + // anything other than this. if (inst_num > 0 || iewStage->hasStoresToWB()) #else if ((head_inst->isMemBarrier() || head_inst->isWriteBarrier() || @@ -1074,11 +1027,6 @@ DefaultCommit::commitHead(DynInstPtr &head_inst, unsigned inst_num) return false; } - DPRINTF(Commit, "Encountered a barrier or non-speculative " - "instruction [sn:%lli] at the head of the ROB, PC %#x.\n", - head_inst->seqNum, head_inst->readPC()); - - // Send back the non-speculative instruction's sequence number. toIEW->commitInfo[tid].nonSpecSeqNum = head_inst->seqNum; // Change the instruction so it won't try to commit again until @@ -1093,7 +1041,7 @@ DefaultCommit::commitHead(DynInstPtr &head_inst, unsigned inst_num) head_inst->seqNum, head_inst->readPC()); // Send back the non-speculative instruction's sequence - // number. Maybe just tell the lsq to re-execute the load. + // number. Tell the lsq to re-execute the load. toIEW->commitInfo[tid].nonSpecSeqNum = head_inst->seqNum; toIEW->commitInfo[tid].uncached = true; toIEW->commitInfo[tid].uncachedLoad = head_inst; @@ -1107,76 +1055,77 @@ DefaultCommit::commitHead(DynInstPtr &head_inst, unsigned inst_num) } } - // Now check if it's one of the special trap or barrier or - // serializing instructions. - if (head_inst->isThreadSync())/* || -// head_inst->isMemBarrier() || -head_inst->isWriteBarrier())*/ - { + if (head_inst->isThreadSync()) { // Not handled for now. - panic("Barrier instructions are not handled yet.\n"); + panic("Thread sync instructions are not handled yet.\n"); } + // Stores mark themselves as completed. if (!head_inst->isStore()) { head_inst->setCompleted(); } + // Use checker prior to updating anything due to traps or PC + // based events. + if (cpu->checker) { + cpu->checker->tick(head_inst); + } + // Check if the instruction caused a fault. If so, trap. 
Fault inst_fault = head_inst->getFault(); if (inst_fault != NoFault) { - if (!head_inst->isNop()) { + head_inst->setCompleted(); #if FULL_SYSTEM - DPRINTF(Commit, "Inst [sn:%lli] PC %#x has a fault\n", - head_inst->seqNum, head_inst->readPC()); - - if (iewStage->hasStoresToWB()) { - DPRINTF(Commit, "Stores outstanding, fault must wait.\n"); - return false; - } - - assert(!thread[tid]->inSyscall); - - thread[tid]->inSyscall = true; - - // Hack for now; DTB will sometimes need the machine instruction - // for when faults happen. So we will set it here, prior to the - // DTB possibly needing it for this translation. - thread[tid]->setInst( - static_cast(head_inst->staticInst->machInst)); - - // Consider holding onto the trap and waiting until the trap event - // happens for this to be executed. - cpu->trap(inst_fault, tid); - - // Exit state update mode to avoid accidental updating. - thread[tid]->inSyscall = false; - - commitStatus[tid] = TrapPending; - - // Generate trap squash event. - generateTrapEvent(tid); + DPRINTF(Commit, "Inst [sn:%lli] PC %#x has a fault\n", + head_inst->seqNum, head_inst->readPC()); + if (iewStage->hasStoresToWB() || inst_num > 0) { + DPRINTF(Commit, "Stores outstanding, fault must wait.\n"); return false; -#else // !FULL_SYSTEM - panic("fault (%d) detected @ PC %08p", inst_fault, - head_inst->PC); -#endif // FULL_SYSTEM } - } - // Check if we're really ready to commit. If not then return false. - // I'm pretty sure all instructions should be able to commit if they've - // reached this far. For now leave this in as a check. - if (!rob->isHeadReady(tid)) { - panic("Unable to commit head instruction!\n"); + if (cpu->checker && head_inst->isStore()) { + cpu->checker->tick(head_inst); + } + + assert(!thread[tid]->inSyscall); + + // Mark that we're in state update mode so that the trap's + // execution doesn't generate extra squashes. 
+ thread[tid]->inSyscall = true; + + // DTB will sometimes need the machine instruction for when + // faults happen. So we will set it here, prior to the DTB + // possibly needing it for its fault. + thread[tid]->setInst( + static_cast(head_inst->staticInst->machInst)); + + // Execute the trap. Although it's slightly unrealistic in + // terms of timing (as it doesn't wait for the full timing of + // the trap event to complete before updating state), it's + // needed to update the state as soon as possible. This + // prevents external agents from changing any specific state + // that the trap need. + cpu->trap(inst_fault, tid); + + // Exit state update mode to avoid accidental updating. + thread[tid]->inSyscall = false; + + commitStatus[tid] = TrapPending; + + // Generate trap squash event. + generateTrapEvent(tid); + return false; +#else // !FULL_SYSTEM + panic("fault (%d) detected @ PC %08p", inst_fault, + head_inst->PC); +#endif // FULL_SYSTEM } updateComInstStats(head_inst); - // Now that the instruction is going to be committed, finalize its - // trace data. if (head_inst->traceData) { head_inst->traceData->setFetchSeq(head_inst->seqNum); head_inst->traceData->setCPSeq(thread[tid]->numInst); @@ -1201,13 +1150,7 @@ template void DefaultCommit::getInsts() { - ////////////////////////////////////// - // Handle ROB functions - ////////////////////////////////////// - - // Read any renamed instructions and place them into the ROB. Do this - // prior to squashing to avoid having instructions in the ROB that - // don't get squashed properly. + // Read any renamed instructions and place them into the ROB. 
int insts_to_process = min((int)renameWidth, fromRename->size); for (int inst_num = 0; inst_num < insts_to_process; ++inst_num) @@ -1246,7 +1189,8 @@ DefaultCommit::markCompletedInsts() ++inst_num) { if (!fromIEW->insts[inst_num]->isSquashed()) { - DPRINTF(Commit, "[tid:%i]: Marking PC %#x, SN %i ready within ROB.\n", + DPRINTF(Commit, "[tid:%i]: Marking PC %#x, [sn:%lli] ready " + "within ROB.\n", fromIEW->insts[inst_num]->threadNumber, fromIEW->insts[inst_num]->readPC(), fromIEW->insts[inst_num]->seqNum); @@ -1257,30 +1201,6 @@ DefaultCommit::markCompletedInsts() } } -template -uint64_t -DefaultCommit::readPC() -{ - // @todo: Fix this single thread hack. - return PC[0]; -} - -template -void -DefaultCommit::setSquashing(unsigned tid) -{ - if (_status == Inactive) { - DPRINTF(Activity, "Activating stage.\n"); - _status = Active; - cpu->activateStage(FullCPU::CommitIdx); - } - - if (commitStatus[tid] != ROBSquashing) { - commitStatus[tid] = ROBSquashing; - ++squashCounter; - } -} - template bool DefaultCommit::robDoneSquashing() @@ -1308,39 +1228,39 @@ DefaultCommit::updateComInstStats(DynInstPtr &inst) // #ifdef TARGET_ALPHA if (inst->isDataPrefetch()) { - stat_com_swp[thread]++; + statComSwp[thread]++; } else { - stat_com_inst[thread]++; + statComInst[thread]++; } #else - stat_com_inst[thread]++; + statComInst[thread]++; #endif // // Control Instructions // if (inst->isControl()) - stat_com_branches[thread]++; + statComBranches[thread]++; // // Memory references // if (inst->isMemRef()) { - stat_com_refs[thread]++; + statComRefs[thread]++; if (inst->isLoad()) { - stat_com_loads[thread]++; + statComLoads[thread]++; } } if (inst->isMemBarrier()) { - stat_com_membars[thread]++; + statComMembars[thread]++; } } //////////////////////////////////////// // // -// SMT COMMIT POLICY MAITAINED HERE // +// SMT COMMIT POLICY MAINTAINED HERE // // // //////////////////////////////////////// template diff --git a/cpu/o3/cpu.cc b/cpu/o3/cpu.cc index 59308d6a9..9a46f2e7c 100644 
--- a/cpu/o3/cpu.cc +++ b/cpu/o3/cpu.cc @@ -35,6 +35,7 @@ #endif #include "sim/root.hh" +#include "cpu/checker/cpu.hh" #include "cpu/cpu_exec_context.hh" #include "cpu/exec_context.hh" #include "cpu/o3/alpha_dyn_inst.hh" @@ -76,7 +77,6 @@ FullO3CPU::TickEvent::description() return "FullO3CPU tick event"; } -//Call constructor to all the pipeline stages here template FullO3CPU::FullO3CPU(Params *params) : BaseFullCPU(params), @@ -126,13 +126,25 @@ FullO3CPU::FullO3CPU(Params *params) // pTable(params->pTable), mem(params->workload[0]->getMemory()), #endif // FULL_SYSTEM - + switchCount(0), icacheInterface(params->icacheInterface), dcacheInterface(params->dcacheInterface), - deferRegistration(params->deferRegistration) + deferRegistration(params->deferRegistration), + numThreads(number_of_threads) { _status = Idle; + if (params->checker) { + BaseCPU *temp_checker = params->checker; + checker = dynamic_cast *>(temp_checker); + checker->setMemory(mem); +#if FULL_SYSTEM + checker->setSystem(params->system); +#endif + } else { + checker = NULL; + } + #if !FULL_SYSTEM thread.resize(number_of_threads); tids.resize(number_of_threads); @@ -168,20 +180,18 @@ FullO3CPU::FullO3CPU(Params *params) commit.setIEWQueue(&iewQueue); commit.setRenameQueue(&renameQueue); + commit.setFetchStage(&fetch); commit.setIEWStage(&iew); rename.setIEWStage(&iew); rename.setCommitStage(&commit); - //Make Sure That this a Valid Architeture - //@todo: move this up in constructor - numThreads = number_of_threads; - #if !FULL_SYSTEM int active_threads = params->workload.size(); #else int active_threads = 1; #endif + //Make Sure That this a Valid Architeture assert(params->numPhysIntRegs >= numThreads * TheISA::NumIntRegs); assert(params->numPhysFloatRegs >= numThreads * TheISA::NumFloatRegs); @@ -357,7 +367,7 @@ FullO3CPU::tick() cleanUpRemovedInsts(); } - if (activityCount && !tickEvent.scheduled()) { + if (_status != SwitchedOut && activityCount && !tickEvent.scheduled()) { 
tickEvent.schedule(curTick + cycles(1)); } @@ -380,13 +390,7 @@ FullO3CPU::init() for (int i = 0; i < number_of_threads; ++i) thread[i]->inSyscall = true; - - // Need to do a copy of the xc->regs into the CPU's regfile so - // that it can start properly. - for (int tid=0; tid < number_of_threads; tid++) { - // Need to do a copy of the xc->regs into the CPU's regfile so - // that it can start properly. #if FULL_SYSTEM ExecContext *src_xc = execContexts[tid]; #else @@ -406,8 +410,7 @@ FullO3CPU::init() for (int i = 0; i < number_of_threads; ++i) thread[i]->inSyscall = false; - // Probably should just make a call to all the stages to init stage, - // regardless of whether or not they need it. Keeps it more independent. + // Initialize stages. fetch.initStage(); iew.initStage(); rename.initStage(); @@ -570,7 +573,6 @@ template void FullO3CPU::activateContext(int tid, int delay) { - // Needs to set each stage to running as well. list::iterator isActive = find( activeThreads.begin(), activeThreads.end(), tid); @@ -658,30 +660,46 @@ FullO3CPU::haltContext(int tid) template void -FullO3CPU::switchOut(Sampler *sampler) +FullO3CPU::switchOut(Sampler *_sampler) { -// panic("FullO3CPU does not have a switch out function.\n"); + sampler = _sampler; + switchCount = 0; fetch.switchOut(); decode.switchOut(); rename.switchOut(); iew.switchOut(); commit.switchOut(); +} - instList.clear(); - while (!removeList.empty()) { - removeList.pop(); +template +void +FullO3CPU::signalSwitched() +{ + if (++switchCount == 5) { + fetch.doSwitchOut(); + rename.doSwitchOut(); + commit.doSwitchOut(); + instList.clear(); + while (!removeList.empty()) { + removeList.pop(); + } + + if (checker) + checker->switchOut(sampler); + + if (tickEvent.scheduled()) + tickEvent.squash(); + sampler->signalSwitched(); + _status = SwitchedOut; } - - if (tickEvent.scheduled()) - tickEvent.squash(); - sampler->signalSwitched(); - _status = SwitchedOut; + assert(switchCount <= 5); } template void 
FullO3CPU::takeOverFrom(BaseCPU *oldCPU) { + // Flush out any old data from the activity buffers. for (int i = 0; i < 6; ++i) { timeBuffer.advance(); fetchQueue.advance(); @@ -733,13 +751,6 @@ FullO3CPU::takeOverFrom(BaseCPU *oldCPU) tickEvent.schedule(curTick); } -template -InstSeqNum -FullO3CPU::getAndIncrementInstSeq() -{ - return globalSeqNum++; -} - template uint64_t FullO3CPU::readIntReg(int reg_idx) @@ -982,14 +993,9 @@ FullO3CPU::removeInstsNotInROB(unsigned tid) while (inst_it != end_it) { assert(!instList.empty()); - bool break_loop = (inst_it == instList.begin()); - squashInstIt(inst_it, tid); inst_it--; - - if (break_loop) - break; } // If the ROB was empty, then we actually need to remove the first @@ -1095,8 +1101,6 @@ FullO3CPU::dumpInsts() inst_list_it++; ++num; } - - } template diff --git a/cpu/o3/cpu.hh b/cpu/o3/cpu.hh index 621ddf541..789729e61 100644 --- a/cpu/o3/cpu.hh +++ b/cpu/o3/cpu.hh @@ -46,6 +46,8 @@ #include "cpu/o3/thread_state.hh" #include "sim/process.hh" +template +class Checker; class ExecContext; class MemInterface; class Process; @@ -199,13 +201,16 @@ class FullO3CPU : public BaseFullCPU */ void switchOut(Sampler *sampler); + void signalSwitched(); + /** Takes over from another CPU. * @todo: Implement this. */ void takeOverFrom(BaseCPU *oldCPU); /** Get the current instruction sequence number, and increment it. */ - InstSeqNum getAndIncrementInstSeq(); + InstSeqNum getAndIncrementInstSeq() + { return globalSeqNum++; } #if FULL_SYSTEM /** Check if this address is a valid instruction address. */ @@ -333,9 +338,9 @@ class FullO3CPU : public BaseFullCPU */ std::queue removeList; -#ifdef DEBUG +//#ifdef DEBUG std::set snList; -#endif +//#endif /** Records if instructions need to be removed this cycle due to being * retired or squashed. @@ -474,6 +479,8 @@ class FullO3CPU : public BaseFullCPU /** The global sequence number counter. */ InstSeqNum globalSeqNum; + Checker *checker; + #if FULL_SYSTEM /** Pointer to the system. 
*/ System *system; @@ -484,12 +491,16 @@ class FullO3CPU : public BaseFullCPU PhysicalMemory *physmem; #endif - // List of all ExecContexts. - std::vector thread; - /** Pointer to memory. */ FunctionalMemory *mem; + Sampler *sampler; + + int switchCount; + + // List of all ExecContexts. + std::vector thread; + #if 0 /** Page table pointer. */ PageTable *pTable; diff --git a/cpu/o3/decode_impl.hh b/cpu/o3/decode_impl.hh index caa97067b..a419a8932 100644 --- a/cpu/o3/decode_impl.hh +++ b/cpu/o3/decode_impl.hh @@ -166,6 +166,7 @@ template void DefaultDecode::switchOut() { + cpu->signalSwitched(); } template diff --git a/cpu/o3/fetch.hh b/cpu/o3/fetch.hh index 6074831c6..b03d4afe3 100644 --- a/cpu/o3/fetch.hh +++ b/cpu/o3/fetch.hh @@ -165,6 +165,8 @@ class DefaultFetch void switchOut(); + void doSwitchOut(); + void takeOverFrom(); bool isSwitchedOut() { return switchedOut; } @@ -371,6 +373,11 @@ class DefaultFetch bool switchedOut; + public: + InstSeqNum &getYoungestSN() { return youngestSN; } + private: + InstSeqNum youngestSN; + #if !FULL_SYSTEM /** Page table pointer. */ // PageTable *pTable; diff --git a/cpu/o3/fetch_impl.hh b/cpu/o3/fetch_impl.hh index 92f923c65..b4ff69d89 100644 --- a/cpu/o3/fetch_impl.hh +++ b/cpu/o3/fetch_impl.hh @@ -372,6 +372,13 @@ void DefaultFetch::switchOut() { switchedOut = true; + cpu->signalSwitched(); +} + +template +void +DefaultFetch::doSwitchOut() +{ branchPred.switchOut(); } @@ -474,7 +481,7 @@ DefaultFetch::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid unsigned flags = 0; #endif // FULL_SYSTEM - if (interruptPending && flags == 0) { + if (interruptPending && flags == 0 || switchedOut) { // Hold off fetch from getting new instructions while an interrupt // is pending. return false; @@ -508,7 +515,8 @@ DefaultFetch::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid // instruction. 
if (fault == NoFault) { #if FULL_SYSTEM - if (cpu->system->memctrl->badaddr(memReq[tid]->paddr)) { + if (cpu->system->memctrl->badaddr(memReq[tid]->paddr) || + memReq[tid]->flags & UNCACHEABLE) { DPRINTF(Fetch, "Fetch: Bad address %#x (hopefully on a " "misspeculating path!", memReq[tid]->paddr); @@ -625,8 +633,8 @@ DefaultFetch::doSquash(const Addr &new_PC, unsigned tid) template void DefaultFetch::squashFromDecode(const Addr &new_PC, - const InstSeqNum &seq_num, - unsigned tid) + const InstSeqNum &seq_num, + unsigned tid) { DPRINTF(Fetch, "[tid:%i]: Squashing from decode.\n",tid); @@ -635,6 +643,7 @@ DefaultFetch::squashFromDecode(const Addr &new_PC, // Tell the CPU to remove any instructions that are in flight between // fetch and decode. cpu->removeInstsUntil(seq_num, tid); + youngestSN = seq_num; } template @@ -820,6 +829,7 @@ DefaultFetch::checkSignalsAndUpdate(unsigned tid) // In any case, squash. squash(fromCommit->commitInfo[tid].nextPC,tid); + youngestSN = fromCommit->commitInfo[tid].doneSeqNum; // Also check if there's a mispredict that happened. if (fromCommit->commitInfo[tid].branchMispredict) { @@ -999,6 +1009,8 @@ DefaultFetch::fetch(bool &status_change) // Get a sequence number. inst_seq = cpu->getAndIncrementInstSeq(); + youngestSN = inst_seq; + // Make sure this is a valid index. 
assert(offset <= cacheBlkSize - instSize); diff --git a/cpu/o3/iew.hh b/cpu/o3/iew.hh index ae0ba6a21..72be25668 100644 --- a/cpu/o3/iew.hh +++ b/cpu/o3/iew.hh @@ -159,6 +159,8 @@ class DefaultIEW void switchOut(); + void doSwitchOut(); + void takeOverFrom(); bool isSwitchedOut() { return switchedOut; } diff --git a/cpu/o3/iew_impl.hh b/cpu/o3/iew_impl.hh index 42d83ee72..cbd7396f7 100644 --- a/cpu/o3/iew_impl.hh +++ b/cpu/o3/iew_impl.hh @@ -55,7 +55,11 @@ DefaultIEW::LdWritebackEvent::process() //iewStage->ldstQueue.removeMSHR(inst->threadNumber,inst->seqNum); - if (inst->isSquashed() || iewStage->isSwitchedOut()) { + if (iewStage->isSwitchedOut()) { + inst = NULL; + return; + } else if (inst->isSquashed()) { + iewStage->wakeCPU(); inst = NULL; return; } @@ -440,8 +444,16 @@ DefaultIEW::setPageTable(PageTable *pt_ptr) template void DefaultIEW::switchOut() +{ + cpu->signalSwitched(); +} + +template +void +DefaultIEW::doSwitchOut() { switchedOut = true; + instQueue.switchOut(); ldstQueue.switchOut(); fuPool->switchOut(); diff --git a/cpu/o3/lsq_unit_impl.hh b/cpu/o3/lsq_unit_impl.hh index 3bb9a81f8..dca808ac9 100644 --- a/cpu/o3/lsq_unit_impl.hh +++ b/cpu/o3/lsq_unit_impl.hh @@ -26,6 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#include "cpu/checker/cpu.hh" #include "cpu/o3/lsq_unit.hh" #include "base/str.hh" @@ -690,6 +691,9 @@ LSQUnit::writebackStores() } if (!(req->flags & LOCKED)) { storeQueue[storeWBIdx].inst->setCompleted(); + if (cpu->checker) { + cpu->checker->tick(storeQueue[storeWBIdx].inst); + } } if (dcacheInterface) { @@ -937,6 +941,11 @@ LSQUnit::completeStore(int store_idx) stallingStoreIsn = 0; iewStage->replayMemInst(loadQueue[stallingLoadIdx]); } + + storeQueue[store_idx].inst->setCompleted(); + if (cpu->checker) { + cpu->checker->tick(storeQueue[store_idx].inst); + } } template diff --git a/cpu/o3/regfile.hh b/cpu/o3/regfile.hh index 78674c32c..ed1238d36 100644 --- a/cpu/o3/regfile.hh +++ b/cpu/o3/regfile.hh @@ -200,7 +200,7 @@ class PhysRegFile unsigned thread_id) { return miscRegs[thread_id].readRegWithEffect(misc_reg, fault, - cpu->xcProxies[thread_id]); + cpu->xcBase(thread_id)); } Fault setMiscReg(int misc_reg, const MiscReg &val, unsigned thread_id) @@ -212,7 +212,7 @@ class PhysRegFile unsigned thread_id) { return miscRegs[thread_id].setRegWithEffect(misc_reg, val, - cpu->xcProxies[thread_id]); + cpu->xcBase(thread_id)); } #if FULL_SYSTEM diff --git a/cpu/o3/rename.hh b/cpu/o3/rename.hh index 4c5c46356..dd2cb0c18 100644 --- a/cpu/o3/rename.hh +++ b/cpu/o3/rename.hh @@ -155,6 +155,8 @@ class DefaultRename void switchOut(); + void doSwitchOut(); + void takeOverFrom(); /** Squashes all instructions in a thread. 
*/ diff --git a/cpu/o3/rename_impl.hh b/cpu/o3/rename_impl.hh index d41058deb..db4bb2ffe 100644 --- a/cpu/o3/rename_impl.hh +++ b/cpu/o3/rename_impl.hh @@ -261,6 +261,13 @@ DefaultRename::setScoreboard(Scoreboard *_scoreboard) template void DefaultRename::switchOut() +{ + cpu->signalSwitched(); +} + +template +void +DefaultRename::doSwitchOut() { for (int i = 0; i < numThreads; i++) { typename list::iterator hb_it = historyBuffer[i].begin(); From 52383ca7cc2b4698109b71a968cde16e9f7dc6e0 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Tue, 16 May 2006 14:09:04 -0400 Subject: [PATCH 29/50] Sampler updates. cpu/ozone/cpu.hh: Updates for sampler. cpu/ozone/cpu_impl.hh: Updates for sampler, checker. cpu/ozone/inorder_back_end.hh: Sampler updates. Also support old memory system. --HG-- extra : convert_revision : 33ebe38e4c08d49c6af84032b819533b784b4fe8 --- cpu/ozone/cpu.hh | 8 ++- cpu/ozone/cpu_impl.hh | 122 +++++++++++----------------------- cpu/ozone/front_end.hh | 2 + cpu/ozone/front_end_impl.hh | 10 +++ cpu/ozone/inorder_back_end.hh | 44 ++++++++++-- cpu/ozone/lw_back_end.hh | 3 +- cpu/ozone/lw_back_end_impl.hh | 19 ++++++ cpu/ozone/lw_lsq_impl.hh | 9 ++- 8 files changed, 122 insertions(+), 95 deletions(-) diff --git a/cpu/ozone/cpu.hh b/cpu/ozone/cpu.hh index eec8902d8..1d522b2fa 100644 --- a/cpu/ozone/cpu.hh +++ b/cpu/ozone/cpu.hh @@ -64,6 +64,7 @@ class Process; #endif // FULL_SYSTEM class Checkpoint; +class EndQuiesceEvent; class MemInterface; namespace Trace { @@ -149,7 +150,7 @@ class OzoneCPU : public BaseCPU void unserialize(Checkpoint *cp, const std::string §ion); #if FULL_SYSTEM - Event *getQuiesceEvent(); + EndQuiesceEvent *getQuiesceEvent(); Tick readLastActivate(); Tick readLastSuspend(); @@ -330,8 +331,13 @@ class OzoneCPU : public BaseCPU int cpuId; void switchOut(Sampler *sampler); + void signalSwitched(); void takeOverFrom(BaseCPU *oldCPU); + Sampler *sampler; + + int switchCount; + #if FULL_SYSTEM Addr dbg_vtophys(Addr addr); diff --git 
a/cpu/ozone/cpu_impl.hh b/cpu/ozone/cpu_impl.hh index 4f3fdf521..b085f077f 100644 --- a/cpu/ozone/cpu_impl.hh +++ b/cpu/ozone/cpu_impl.hh @@ -329,15 +329,30 @@ OzoneCPU::copyToXC() */ template void -OzoneCPU::switchOut(Sampler *sampler) +OzoneCPU::switchOut(Sampler *_sampler) { + sampler = _sampler; + switchCount = 0; // Front end needs state from back end, so switch out the back end first. backEnd->switchOut(); frontEnd->switchOut(); - _status = SwitchedOut; - if (tickEvent.scheduled()) - tickEvent.squash(); - sampler->signalSwitched(); +} + +template +void +OzoneCPU::signalSwitched() +{ + if (++switchCount == 2) { + backEnd->doSwitchOut(); + frontEnd->doSwitchOut(); + if (checker) + checker->switchOut(sampler); + _status = SwitchedOut; + if (tickEvent.scheduled()) + tickEvent.squash(); + sampler->signalSwitched(); + } + assert(switchCount <= 2); } template @@ -366,6 +381,11 @@ OzoneCPU::takeOverFrom(BaseCPU *oldCPU) tickEvent.schedule(curTick); } } + // Nothing running, change status to reflect that we're no longer + // switched out. + if (_status == SwitchedOut) { + _status = Idle; + } } template @@ -666,83 +686,6 @@ OzoneCPU::tick() thread.renameTable[ZeroReg+TheISA::FP_Base_DepTag]-> setDoubleResult(0.0); - // General code flow: - // Check for any interrupts. Handle them if I do have one. - // Check if I have a need to fetch a new cache block. Either a bit could be - // set by functions indicating that I need to fetch a new block, or I could - // hang onto the last PC of the last cache block I fetched and compare the - // current PC to that. Setting a bit seems nicer but may be more error - // prone. - // Scan through the IQ to figure out if there's anything I can issue/execute - // Might need something close to the FU Pools to tell what instructions - // I can issue. How to handle loads and stores vs other insts? - // Extremely slow way: find first inst that can possibly issue; if it's a - // load or a store, then iterate through load/store queue. 
- // If I can't find instructions to execute and I've got room in the IQ - // (which is just a counter), then grab a few instructions out of the cache - // line buffer until I either run out or can execute up until my limit. - - numCycles++; - - traceData = NULL; - -// Fault fault = NoFault; - -#if 0 // FULL_SYSTEM - if (checkInterrupts && check_interrupts() && !inPalMode() && - status() != IcacheMissComplete) { - int ipl = 0; - int summary = 0; - checkInterrupts = false; - - if (readMiscReg(IPR_SIRR)) { - for (int i = INTLEVEL_SOFTWARE_MIN; - i < INTLEVEL_SOFTWARE_MAX; i++) { - if (readMiscReg(IPR_SIRR) & (ULL(1) << i)) { - // See table 4-19 of 21164 hardware reference - ipl = (i - INTLEVEL_SOFTWARE_MIN) + 1; - summary |= (ULL(1) << i); - } - } - } - - // Is this method so that if the interrupts are switched over from - // another CPU they'll still be handled? -// uint64_t interrupts = cpuXC->cpu->intr_status(); - uint64_t interrupts = intr_status(); - for (int i = INTLEVEL_EXTERNAL_MIN; - i < INTLEVEL_EXTERNAL_MAX; i++) { - if (interrupts & (ULL(1) << i)) { - // See table 4-19 of 21164 hardware reference - ipl = i; - summary |= (ULL(1) << i); - } - } - - if (readMiscReg(IPR_ASTRR)) - panic("asynchronous traps not implemented\n"); - - if (ipl && ipl > readMiscReg(IPR_IPLR)) { - setMiscReg(IPR_ISR, summary); - setMiscReg(IPR_INTID, ipl); - - Fault(new InterruptFault)->invoke(xc); - - DPRINTF(Flow, "Interrupt! 
IPLR=%d ipl=%d summary=%x\n", - readMiscReg(IPR_IPLR), ipl, summary); - } - } -#endif - - // Make call to ISA to ensure 0 register semantics...actually because the - // DynInsts will generally be the register file, this should only have to - // happen when the xc is actually written to (during a syscall or something) - // maintain $r0 semantics -// assert(renameTable[ZeroReg]->readIntResult() == 0); -#ifdef TARGET_ALPHA -// assert(renameTable[ZeroReg]->readDoubleResult() == 0); -#endif // TARGET_ALPHA - comm.advance(); frontEnd->tick(); backEnd->tick(); @@ -876,8 +819,8 @@ OzoneCPU::processInterrupts() thread.setMiscReg(IPR_INTID, ipl); // @todo: Make this more transparent if (checker) { - checkerXC->setMiscReg(IPR_ISR, summary); - checkerXC->setMiscReg(IPR_INTID, ipl); + checker->cpuXCBase()->setMiscReg(IPR_ISR, summary); + checker->cpuXCBase()->setMiscReg(IPR_INTID, ipl); } Fault fault = new InterruptFault; fault->invoke(thread.getXCProxy()); @@ -993,6 +936,15 @@ OzoneCPU::OzoneXC::takeOverFrom(ExecContext *old_context) setFuncExeInst(old_context->readFuncExeInst()); #endif + EndQuiesceEvent *other_quiesce = old_context->getQuiesceEvent(); + if (other_quiesce) { + // Point the quiesce event's XC at this XC so that it wakes up + // the proper CPU. 
+ other_quiesce->xc = this; + } + if (thread->quiesceEvent) { + thread->quiesceEvent->xc = this; + } // storeCondFailures = 0; cpu->lockFlag = false; @@ -1016,7 +968,7 @@ OzoneCPU::OzoneXC::unserialize(Checkpoint *cp, const std::string §ion) #if FULL_SYSTEM template -Event * +EndQuiesceEvent * OzoneCPU::OzoneXC::getQuiesceEvent() { return thread->quiesceEvent; diff --git a/cpu/ozone/front_end.hh b/cpu/ozone/front_end.hh index 188925ae5..f9db9ea5c 100644 --- a/cpu/ozone/front_end.hh +++ b/cpu/ozone/front_end.hh @@ -68,6 +68,8 @@ class FrontEnd void switchOut(); + void doSwitchOut(); + void takeOverFrom(ExecContext *old_xc = NULL); bool isSwitchedOut() { return switchedOut; } diff --git a/cpu/ozone/front_end_impl.hh b/cpu/ozone/front_end_impl.hh index a3eb809d0..8ae9ec696 100644 --- a/cpu/ozone/front_end_impl.hh +++ b/cpu/ozone/front_end_impl.hh @@ -240,6 +240,9 @@ template void FrontEnd::tick() { + if (switchedOut) + return; + // @todo: Maybe I want to just have direct communication... if (fromCommit->doneSeqNum) { branchPred.update(fromCommit->doneSeqNum, 0); @@ -828,6 +831,13 @@ void FrontEnd::switchOut() { switchedOut = true; + cpu->signalSwitched(); +} + +template +void +FrontEnd::doSwitchOut() +{ memReq = NULL; squash(0, 0); instBuffer.clear(); diff --git a/cpu/ozone/inorder_back_end.hh b/cpu/ozone/inorder_back_end.hh index 6519b79e5..4039d8384 100644 --- a/cpu/ozone/inorder_back_end.hh +++ b/cpu/ozone/inorder_back_end.hh @@ -97,6 +97,10 @@ class InorderBackEnd Addr commitPC; + void switchOut() { panic("Not implemented!"); } + void doSwitchOut() { panic("Not implemented!"); } + void takeOverFrom(ExecContext *old_xc = NULL) { panic("Not implemented!"); } + public: FullCPU *cpu; @@ -330,14 +334,17 @@ InorderBackEnd::read(MemReqPtr &req, T &data, int load_idx) // translate to physical address // Fault fault = cpu->translateDataReadReq(req); + req->cmd = Read; + req->completionEvent = NULL; + req->time = curTick; + assert(!req->data); + req->data = new uint8_t[64]; 
+ req->flags &= ~INST_READ; + Fault fault = cpu->read(req, data); + memcpy(req->data, &data, sizeof(T)); // if we have a cache, do cache access too if (dcacheInterface) { - req->cmd = Read; - req->completionEvent = NULL; - req->data = new uint8_t[64]; - req->time = curTick; - req->flags &= ~INST_READ; MemAccessResult result = dcacheInterface->access(req); // Ugly hack to get an event scheduled *only* if the access is @@ -372,6 +379,30 @@ InorderBackEnd::write(MemReqPtr &req, T &data, int store_idx) // translate to physical address // Fault fault = cpu->translateDataWriteReq(req); + req->cmd = Write; + req->completionEvent = NULL; + req->time = curTick; + assert(!req->data); + req->data = new uint8_t[64]; + memcpy(req->data, (uint8_t *)&data, req->size); + + switch(req->size) { + case 1: + cpu->write(req, (uint8_t &)data); + break; + case 2: + cpu->write(req, (uint16_t &)data); + break; + case 4: + cpu->write(req, (uint32_t &)data); + break; + case 8: + cpu->write(req, (uint64_t &)data); + break; + default: + panic("Unexpected store size!\n"); + } + if (dcacheInterface) { req->cmd = Write; req->data = new uint8_t[64]; @@ -395,7 +426,7 @@ InorderBackEnd::write(MemReqPtr &req, T &data, int store_idx) } } - +/* if (req->flags & LOCKED) { if (req->flags & UNCACHEABLE) { // Don't update result register (see stq_c in isa_desc) @@ -404,6 +435,7 @@ InorderBackEnd::write(MemReqPtr &req, T &data, int store_idx) req->result = 1; } } +*/ /* if (res && (fault == NoFault)) *res = req->result; diff --git a/cpu/ozone/lw_back_end.hh b/cpu/ozone/lw_back_end.hh index 028fdaf8c..770b66ad5 100644 --- a/cpu/ozone/lw_back_end.hh +++ b/cpu/ozone/lw_back_end.hh @@ -187,7 +187,7 @@ class LWBackEnd void instToCommit(DynInstPtr &inst); void switchOut(); - + void doSwitchOut(); void takeOverFrom(ExecContext *old_xc = NULL); bool isSwitchedOut() { return switchedOut; } @@ -314,6 +314,7 @@ class LWBackEnd bool fetchHasFault; bool switchedOut; + bool switchPending; DynInstPtr memBarrier; diff 
--git a/cpu/ozone/lw_back_end_impl.hh b/cpu/ozone/lw_back_end_impl.hh index d4829629d..a82dd5b70 100644 --- a/cpu/ozone/lw_back_end_impl.hh +++ b/cpu/ozone/lw_back_end_impl.hh @@ -192,6 +192,7 @@ LWBackEnd::LWBackEnd(Params *params) numWaitingMemOps = 0; waitingInsts = 0; switchedOut = false; + switchPending = false; // IQ.setBE(this); LSQ.setBE(this); @@ -631,6 +632,11 @@ LWBackEnd::tick() { DPRINTF(BE, "Ticking back end\n"); + if (switchPending && robEmpty() && !LSQ.hasStoresToWB()) { + cpu->signalSwitched(); + return; + } + ROB_count[0]+= numInsts; wbCycle = 0; @@ -682,6 +688,7 @@ LWBackEnd::tick() assert(numInsts == instList.size()); assert(waitingInsts == waitingList.size()); assert(numWaitingMemOps == waitingMemOps.size()); + assert(!switchedOut); #endif } @@ -1440,12 +1447,24 @@ LWBackEnd::fetchFault(Fault &fault) template void LWBackEnd::switchOut() +{ + switchPending = true; +} + +template +void +LWBackEnd::doSwitchOut() { switchedOut = true; + switchPending = false; // Need to get rid of all committed, non-speculative state and write it // to memory/XC. In this case this is stores that have committed and not // yet written back. + assert(robEmpty()); + assert(!LSQ.hasStoresToWB()); + LSQ.switchOut(); + squash(0); } diff --git a/cpu/ozone/lw_lsq_impl.hh b/cpu/ozone/lw_lsq_impl.hh index 9b7e48f96..fdf6bff07 100644 --- a/cpu/ozone/lw_lsq_impl.hh +++ b/cpu/ozone/lw_lsq_impl.hh @@ -791,6 +791,8 @@ template void OzoneLWLSQ::switchOut() { +// assert(loads == 0); + assert(storesToWB == 0); switchedOut = true; SQIt sq_it = --(storeQueue.end()); while (storesToWB > 0 && @@ -810,10 +812,13 @@ OzoneLWLSQ::switchOut() // Store conditionals don't complete until *after* they have written // back. If it's here and not yet sent to memory, then don't bother // as it's not part of committed state. 
- if (inst->isDataPrefetch() || (*sq_it).committed || - (*sq_it).req->flags & LOCKED) { + if (inst->isDataPrefetch() || (*sq_it).committed) { sq_it--; continue; + } else if ((*sq_it).req->flags & LOCKED) { + sq_it--; + assert(!(*sq_it).canWB || ((*sq_it).canWB && (*sq_it).req->flags & LOCKED)); + continue; } assert((*sq_it).req); From abe14c253b64eb3c991309bf24db60103095c70d Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Tue, 16 May 2006 14:47:09 -0400 Subject: [PATCH 30/50] Include checker and trap latency parameters. --HG-- extra : convert_revision : 148c59f430874e8425952db6960ca4f5e57e2a42 --- python/m5/objects/AlphaFullCPU.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/m5/objects/AlphaFullCPU.py b/python/m5/objects/AlphaFullCPU.py index 284398b0e..1541b9494 100644 --- a/python/m5/objects/AlphaFullCPU.py +++ b/python/m5/objects/AlphaFullCPU.py @@ -9,6 +9,8 @@ class DerivAlphaFullCPU(BaseCPU): if not build_env['FULL_SYSTEM']: mem = Param.FunctionalMemory(NULL, "memory") + checker = Param.BaseCPU(NULL, "checker") + cachePorts = Param.Unsigned("Cache Ports") decodeToFetchDelay = Param.Unsigned("Decode to fetch delay") @@ -50,6 +52,8 @@ class DerivAlphaFullCPU(BaseCPU): renameToROBDelay = Param.Unsigned("Rename to reorder buffer delay") commitWidth = Param.Unsigned("Commit width") squashWidth = Param.Unsigned("Squash width") + trapLatency = Param.Tick("Trap latency") + fetchTrapLatency = Param.Tick("Fetch trap latency") localPredictorSize = Param.Unsigned("Size of local predictor") localCtrBits = Param.Unsigned("Bits per counter") From 343bff3b7dadfe9f6e6062610a086dea0783722a Mon Sep 17 00:00:00 2001 From: Steve Reinhardt Date: Wed, 17 May 2006 07:05:27 -0400 Subject: [PATCH 32/50] Backport ISA scanner fix from newmem to work with scons 0.96.9* versions. arch/SConscript: Backport ISA scanner fix from newmem. 
--HG-- extra : convert_revision : 96be75660f85900fd26badef36fb4109b36d8394 --- arch/SConscript | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/arch/SConscript b/arch/SConscript index 0533261a2..92547c0ae 100644 --- a/arch/SConscript +++ b/arch/SConscript @@ -96,18 +96,12 @@ for hdr in isa_switch_hdrs: # import SCons.Scanner -def ISAScan(): - return SCons.Scanner.Classic("ISAScan", - "$ISASUFFIXES", - "SRCDIR", - '^[ \t]*##[ \t]*include[ \t]*"([^>"]+)"') +isa_scanner = SCons.Scanner.Classic("ISAScan", + [".isa", ".ISA"], + "SRCDIR", + r'^\s*##include\s+"([\w/.-]*)"') -def ISAPath(env, dir, target=None, source=None, a=None): - return (Dir(env['SRCDIR']), Dir('.')) - -iscan = Scanner(function = ISAScan().scan, skeys = [".isa", ".ISA"], - path_function = ISAPath) -env.Append(SCANNERS = iscan) +env.Append(SCANNERS = isa_scanner) # # Now create a Builder object that uses isa_parser.py to generate C++ @@ -134,8 +128,7 @@ def isa_desc_emitter(target, source, env): return (isa_desc_gen_files, [isa_parser, cpu_models_file] + source) # Pieces are in place, so create the builder. -isa_desc_builder = Builder(action='$SOURCES $TARGET.dir $CPU_MODELS', - source_scanner = iscan, +isa_desc_builder = Builder(action='python $SOURCES $TARGET.dir $CPU_MODELS', emitter = isa_desc_emitter) env.Append(BUILDERS = { 'ISADesc' : isa_desc_builder }) From 36581a534240c322e1fc28b8bd6e8f13f2b0fefd Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Wed, 17 May 2006 14:25:10 -0400 Subject: [PATCH 33/50] Faults generated at fetch are passed to the backend by creating a dummy nop instruction and giving it the fault. This unifies front end faults and normal instruction faults. cpu/checker/cpu.cc: Fixups for fetch fault being sent with the instruction. cpu/o3/fetch_impl.hh: cpu/ozone/front_end_impl.hh: Send any faults generated at fetch along with a fake nop instruction to the back end. 
This avoids having to use direct communication to check if the entire front end has drained; it is naturally handled through the nop's fault being handled when it reaches the head of commit. cpu/ozone/front_end.hh: Add extra status TrapPending. cpu/ozone/lw_back_end_impl.hh: Fetch fault handled through a dummy nop carrying the fetch fault. Avoid putting Nops on the exeList. --HG-- extra : convert_revision : 8d9899748b34c204763a49c48a9b5113864f5789 --- cpu/checker/cpu.cc | 45 +++++++------- cpu/o3/fetch_impl.hh | 107 ++++++++++++---------------------- cpu/ozone/front_end.hh | 1 + cpu/ozone/front_end_impl.hh | 36 +++++++++--- cpu/ozone/lw_back_end_impl.hh | 10 +++- 5 files changed, 99 insertions(+), 100 deletions(-) diff --git a/cpu/checker/cpu.cc b/cpu/checker/cpu.cc index f1b43f601..f76f1e063 100644 --- a/cpu/checker/cpu.cc +++ b/cpu/checker/cpu.cc @@ -607,41 +607,46 @@ Checker::tick(DynInstPtr &completed_inst) bool succeeded = translateInstReq(memReq); if (!succeeded) { - warn("Instruction PC %#x was not found in the ITB!", - cpuXC->readPC()); - handleError(); + if (inst->getFault() == NoFault) { + warn("Instruction PC %#x was not found in the ITB!", + cpuXC->readPC()); + handleError(); - // go to the next instruction - cpuXC->setPC(cpuXC->readNextPC()); - cpuXC->setNextPC(cpuXC->readNextPC() + sizeof(MachInst)); + // go to the next instruction + cpuXC->setPC(cpuXC->readNextPC()); + cpuXC->setNextPC(cpuXC->readNextPC() + sizeof(MachInst)); - return; + return; + } else { + fault = inst->getFault(); + } } -// if (fault == NoFault) + if (fault == NoFault) { // fault = cpuXC->mem->read(memReq, machInst); - cpuXC->mem->read(memReq, machInst); + cpuXC->mem->read(memReq, machInst); - // If we've got a valid instruction (i.e., no fault on instruction - // fetch), then execute it. + // If we've got a valid instruction (i.e., no fault on instruction + // fetch), then execute it. 
// keep an instruction count - numInst++; + numInst++; // numInsts++; - // decode the instruction - machInst = gtoh(machInst); - // Checks that the instruction matches what we expected it to be. - // Checks both the machine instruction and the PC. - validateInst(inst); + // decode the instruction + machInst = gtoh(machInst); + // Checks that the instruction matches what we expected it to be. + // Checks both the machine instruction and the PC. + validateInst(inst); - curStaticInst = StaticInst::decode(makeExtMI(machInst, cpuXC->readPC())); + curStaticInst = StaticInst::decode(makeExtMI(machInst, cpuXC->readPC())); #if FULL_SYSTEM - cpuXC->setInst(machInst); + cpuXC->setInst(machInst); #endif // FULL_SYSTEM - fault = inst->getFault(); + fault = inst->getFault(); + } // Either the instruction was a fault and we should process the fault, // or we should just go ahead execute the instruction. This assumes diff --git a/cpu/o3/fetch_impl.hh b/cpu/o3/fetch_impl.hh index b4ff69d89..523719945 100644 --- a/cpu/o3/fetch_impl.hh +++ b/cpu/o3/fetch_impl.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -27,22 +27,21 @@ */ #include "arch/isa_traits.hh" -#include "sim/byteswap.hh" #include "cpu/exetrace.hh" #include "cpu/o3/fetch.hh" #include "mem/base_mem.hh" #include "mem/mem_interface.hh" #include "mem/mem_req.hh" - +#include "sim/byteswap.hh" #include "sim/root.hh" #if FULL_SYSTEM +#include "arch/tlb.hh" +#include "arch/vtophys.hh" #include "base/remote_gdb.hh" #include "mem/functional/memory_control.hh" #include "mem/functional/physical.hh" #include "sim/system.hh" -#include "arch/tlb.hh" -#include "arch/vtophys.hh" #else // !FULL_SYSTEM #include "mem/functional/functional.hh" #endif // FULL_SYSTEM @@ -136,14 +135,7 @@ DefaultFetch::DefaultFetch(Params *params) // Create a new memory request. memReq[tid] = NULL; -// memReq[tid] = new MemReq(); -/* - // Need a way of setting this correctly for parallel programs - // @todo: Figure out how to properly set asid vs thread_num. - memReq[tid]->asid = tid; - memReq[tid]->thread_num = tid; - memReq[tid]->data = new uint8_t[64]; -*/ + // Create space to store a cache line. cacheData[tid] = new uint8_t[cacheBlkSize]; @@ -261,10 +253,6 @@ DefaultFetch::setCPU(FullCPU *cpu_ptr) DPRINTF(Fetch, "Setting the CPU pointer.\n"); cpu = cpu_ptr; - // Set ExecContexts for Memory Requests -// for (int tid=0; tid < numThreads; tid++) -// memReq[tid]->xc = cpu->xcBase(tid); - // Fetch needs to start fetching instructions at the very beginning, // so it must start up in active state. switchToActive(); @@ -362,9 +350,8 @@ DefaultFetch::processCacheCompletion(MemReqPtr &req) // memcpy(cacheData[tid], memReq[tid]->data, memReq[tid]->size); - // Reset the completion event to NULL. + // Reset the mem req to NULL. memReq[tid] = NULL; -// memReq[tid]->completionEvent = NULL; } template @@ -468,10 +455,6 @@ template bool DefaultFetch::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid) { - // Check if the instruction exists within the cache. 
- // If it does, then proceed on to read the instruction and the rest - // of the instructions in the cache line until either the end of the - // cache line or a predicted taken branch is encountered. Fault fault = NoFault; #if FULL_SYSTEM @@ -509,7 +492,7 @@ DefaultFetch::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid //#endif // In the case of faults, the fetch stage may need to stall and wait - // on what caused the fetch (ITB or Icache miss). + // for the ITB miss to be handled. // If translation was successful, attempt to read the first // instruction. @@ -518,7 +501,7 @@ DefaultFetch::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid if (cpu->system->memctrl->badaddr(memReq[tid]->paddr) || memReq[tid]->flags & UNCACHEABLE) { DPRINTF(Fetch, "Fetch: Bad address %#x (hopefully on a " - "misspeculating path!", + "misspeculating path)!", memReq[tid]->paddr); ret_fault = TheISA::genMachineCheckFault(); return false; @@ -587,44 +570,9 @@ DefaultFetch::doSquash(const Addr &new_PC, unsigned tid) if (fetchStatus[tid] == IcacheMissStall && icacheInterface) { DPRINTF(Fetch, "[tid:%i]: Squashing outstanding Icache miss.\n", tid); -// icacheInterface->squash(tid); -/* - if (memReq[tid]->completionEvent) { - if (memReq[tid]->completionEvent->scheduled()) { - memReq[tid]->completionEvent->squash(); - } else { - delete memReq[tid]->completionEvent; - memReq[tid]->completionEvent = NULL; - } - } -*/ memReq[tid] = NULL; } - if (fetchStatus[tid] == TrapPending) { - // @todo: Hardcoded number here - - // This is only effective if communication to and from commit - // is identical. If it's faster to commit than it is from - // commit to here, then it causes problems. 
- - bool found_fault = false; - for (int i = 0; i > -5; --i) { - if (fetchQueue->access(i)->fetchFault) { - DPRINTF(Fetch, "[tid:%i]: Fetch used to be in a trap, " - "clearing it.\n", - tid); - fetchQueue->access(i)->fetchFault = NoFault; - found_fault = true; - } - } - if (!found_fault) { - warn("%lli Fault from fetch not found in time buffer!", - curTick); - } - toDecode->clearFetchFault = true; - } - fetchStatus[tid] = Squashing; ++fetchSquashCycles; @@ -643,7 +591,6 @@ DefaultFetch::squashFromDecode(const Addr &new_PC, // Tell the CPU to remove any instructions that are in flight between // fetch and decode. cpu->removeInstsUntil(seq_num, tid); - youngestSN = seq_num; } template @@ -829,7 +776,6 @@ DefaultFetch::checkSignalsAndUpdate(unsigned tid) // In any case, squash. squash(fromCommit->commitInfo[tid].nextPC,tid); - youngestSN = fromCommit->commitInfo[tid].doneSeqNum; // Also check if there's a mispredict that happened. if (fromCommit->commitInfo[tid].branchMispredict) { @@ -1009,8 +955,6 @@ DefaultFetch::fetch(bool &status_change) // Get a sequence number. inst_seq = cpu->getAndIncrementInstSeq(); - youngestSN = inst_seq; - // Make sure this is a valid index. assert(offset <= cacheBlkSize - instSize); @@ -1095,14 +1039,37 @@ DefaultFetch::fetch(bool &status_change) // This stage will not be able to continue until all the ROB // slots are empty, at which point the fault can be handled. // The only other way it can wake up is if a squash comes along - // and changes the PC. Not sure how to handle that case...perhaps - // have it handled by the upper level CPU class which peeks into the - // time buffer and sees if a squash comes along, in which case it - // changes the status. + // and changes the PC. #if FULL_SYSTEM + assert(numInst != fetchWidth); + // Get a sequence number. + inst_seq = cpu->getAndIncrementInstSeq(); + // We will use a nop in order to carry the fault. + ext_inst = TheISA::NoopMachInst; + + // Create a new DynInst from the dummy nop. 
+ DynInstPtr instruction = new DynInst(ext_inst, fetch_PC, + next_PC, + inst_seq, cpu); + instruction->setPredTarg(next_PC + instSize); + instruction->setThread(tid); + + instruction->setASID(tid); + + instruction->setState(cpu->thread[tid]); + + instruction->traceData = NULL; + + instruction->setInstListIt(cpu->addInst(instruction)); + + instruction->fault = fault; + + toDecode->insts[numInst] = instruction; + toDecode->size++; + // Tell the commit stage the fault we had. - toDecode->fetchFault = fault; - toDecode->fetchFaultSN = cpu->globalSeqNum; +// toDecode->fetchFault = fault; +// toDecode->fetchFaultSN = cpu->globalSeqNum; DPRINTF(Fetch, "[tid:%i]: Blocked, need to handle the trap.\n",tid); diff --git a/cpu/ozone/front_end.hh b/cpu/ozone/front_end.hh index f9db9ea5c..326f7d2c9 100644 --- a/cpu/ozone/front_end.hh +++ b/cpu/ozone/front_end.hh @@ -120,6 +120,7 @@ class FrontEnd SerializeComplete, RenameBlocked, QuiescePending, + TrapPending, BEBlocked }; diff --git a/cpu/ozone/front_end_impl.hh b/cpu/ozone/front_end_impl.hh index 8ae9ec696..cd57aeef4 100644 --- a/cpu/ozone/front_end_impl.hh +++ b/cpu/ozone/front_end_impl.hh @@ -268,11 +268,9 @@ FrontEnd::tick() } if (status == RenameBlocked || status == SerializeBlocked || - status == BEBlocked) { - // This might cause the front end to run even though it - // shouldn't, but this should only be a problem for one cycle. - // Also will cause a one cycle bubble between changing state - // and restarting. + status == TrapPending || status == BEBlocked) { + // Will cause a one cycle bubble between changing state and + // restarting. DPRINTF(FE, "In blocked status.\n"); fetchBlockedCycles++; @@ -537,9 +535,32 @@ void FrontEnd::handleFault(Fault &fault) { DPRINTF(FE, "Fault at fetch, telling commit\n"); - backEnd->fetchFault(fault); +// backEnd->fetchFault(fault); // We're blocked on the back end until it handles this fault. - status = BEBlocked; + status = TrapPending; + + // Get a sequence number. 
+ InstSeqNum inst_seq = getAndIncrementInstSeq(); + // We will use a nop in order to carry the fault. + ExtMachInst ext_inst = TheISA::NoopMachInst; + + // Create a new DynInst from the dummy nop. + DynInstPtr instruction = new DynInst(ext_inst, PC, + PC+sizeof(MachInst), + inst_seq, cpu); + instruction->setPredTarg(instruction->readNextPC()); +// instruction->setThread(tid); + +// instruction->setASID(tid); + + instruction->setState(thread); + + instruction->traceData = NULL; + + instruction->fault = fault; + instruction->setCanIssue(); + instBuffer.push_back(instruction); + ++instBufferSize; } template @@ -881,7 +902,6 @@ FrontEnd::dumpInsts() (*buff_it)->isSquashed()); buff_it++; } - } template diff --git a/cpu/ozone/lw_back_end_impl.hh b/cpu/ozone/lw_back_end_impl.hh index a82dd5b70..db0872e52 100644 --- a/cpu/ozone/lw_back_end_impl.hh +++ b/cpu/ozone/lw_back_end_impl.hh @@ -652,7 +652,7 @@ LWBackEnd::tick() squashFromTrap(); } else if (xcSquash) { squashFromXC(); - } else if (fetchHasFault && robEmpty() && frontEnd->isEmpty() && !LSQ.hasStoresToWB()) { + } /*else if (fetchHasFault && robEmpty() && frontEnd->isEmpty() && !LSQ.hasStoresToWB()) { DPRINTF(BE, "ROB and front end empty, handling fetch fault\n"); Fault fetch_fault = frontEnd->getFault(); if (fetch_fault == NoFault) { @@ -662,7 +662,7 @@ LWBackEnd::tick() handleFault(fetch_fault); fetchHasFault = false; } - } + }*/ #endif if (dispatchStatus != Blocked) { @@ -777,6 +777,12 @@ LWBackEnd::dispatchInsts() inst->seqNum); exeList.push(inst); } + } else if (inst->isNop()) { + DPRINTF(BE, "Nop encountered [sn:%lli], skipping exeList.\n", + inst->seqNum); + inst->setIssued(); + inst->setExecuted(); + inst->setCanCommit(); } else { DPRINTF(BE, "Instruction [sn:%lli] ready, addding to exeList.\n", inst->seqNum); From c7e7d07ec395156015e3baf52048c403d28a6442 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Fri, 19 May 2006 14:27:46 -0400 Subject: [PATCH 34/50] Fixes for regression build errors. 
--HG-- extra : convert_revision : 1f59c853cb0e327d7cf586021b5139f1242e4f28 --- cpu/cpu_exec_context.cc | 4 ++-- cpu/o3/alpha_cpu_impl.hh | 4 ++-- cpu/ozone/cpu.hh | 1 - cpu/ozone/cpu_impl.hh | 4 ++-- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/cpu/cpu_exec_context.cc b/cpu/cpu_exec_context.cc index 3d047856a..24de6d450 100644 --- a/cpu/cpu_exec_context.cc +++ b/cpu/cpu_exec_context.cc @@ -157,8 +157,7 @@ CPUExecContext::takeOverFrom(ExecContext *oldContext) cpu_id = oldContext->readCpuId(); #if !FULL_SYSTEM func_exe_inst = oldContext->readFuncExeInst(); -#endif - +#else EndQuiesceEvent *quiesce = oldContext->getQuiesceEvent(); if (quiesce) { // Point the quiesce event's XC at this XC so that it wakes up @@ -168,6 +167,7 @@ CPUExecContext::takeOverFrom(ExecContext *oldContext) if (quiesceEvent) { quiesceEvent->xc = proxy; } +#endif storeCondFailures = 0; diff --git a/cpu/o3/alpha_cpu_impl.hh b/cpu/o3/alpha_cpu_impl.hh index 856fcb1c8..58b2b3548 100644 --- a/cpu/o3/alpha_cpu_impl.hh +++ b/cpu/o3/alpha_cpu_impl.hh @@ -171,8 +171,7 @@ AlphaFullCPU::AlphaXC::takeOverFrom(ExecContext *old_context) setCpuId(old_context->readCpuId()); #if !FULL_SYSTEM thread->funcExeInst = old_context->readFuncExeInst(); -#endif - +#else EndQuiesceEvent *other_quiesce = old_context->getQuiesceEvent(); if (other_quiesce) { // Point the quiesce event's XC at this XC so that it wakes up @@ -184,6 +183,7 @@ AlphaFullCPU::AlphaXC::takeOverFrom(ExecContext *old_context) } // storeCondFailures = 0; cpu->lockFlag = false; +#endif old_context->setStatus(ExecContext::Unallocated); diff --git a/cpu/ozone/cpu.hh b/cpu/ozone/cpu.hh index 1d522b2fa..7e12e75e5 100644 --- a/cpu/ozone/cpu.hh +++ b/cpu/ozone/cpu.hh @@ -89,7 +89,6 @@ class OzoneCPU : public BaseCPU typedef typename Impl::FrontEnd FrontEnd; typedef typename Impl::BackEnd BackEnd; typedef typename Impl::DynInst DynInst; - typedef typename Impl::DynInst DynInst; typedef typename Impl::DynInstPtr DynInstPtr; typedef 
TheISA::MiscReg MiscReg; diff --git a/cpu/ozone/cpu_impl.hh b/cpu/ozone/cpu_impl.hh index b085f077f..031b4b145 100644 --- a/cpu/ozone/cpu_impl.hh +++ b/cpu/ozone/cpu_impl.hh @@ -934,8 +934,7 @@ OzoneCPU::OzoneXC::takeOverFrom(ExecContext *old_context) setCpuId(old_context->readCpuId()); #if !FULL_SYSTEM setFuncExeInst(old_context->readFuncExeInst()); -#endif - +#else EndQuiesceEvent *other_quiesce = old_context->getQuiesceEvent(); if (other_quiesce) { // Point the quiesce event's XC at this XC so that it wakes up @@ -947,6 +946,7 @@ OzoneCPU::OzoneXC::takeOverFrom(ExecContext *old_context) } // storeCondFailures = 0; cpu->lockFlag = false; +#endif old_context->setStatus(ExecContext::Unallocated); } From c4a87f874a69535f70c0f6f2733ea716e32c70cf Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Fri, 19 May 2006 15:37:52 -0400 Subject: [PATCH 35/50] Move activity tracking code into its own class. Now the CPU no longer has to keep track of the activity tracking internals; it just calls advance() on the class and uses it to tell if it should deschedule itself. SConscript: Split off activity/idling code into its own class to do the processing separately. cpu/o3/alpha_cpu_builder.cc: cpu/o3/alpha_params.hh: Activity stuff. This is mostly for debugging and may be removed later on (or changed to enable/disable activity idling). cpu/o3/cpu.cc: Move activity idling stuff mostly into its own class, so it no longer clutters this file. cpu/o3/cpu.hh: Move activity idling stuff into its own class. python/m5/objects/AlphaFullCPU.py: Add parameter for initial activity value. 
--HG-- extra : convert_revision : f32f7cc03895dc07ab57ddba78c5402a1a8b0f1a --- SConscript | 1 + cpu/activity.cc | 122 ++++++++++++++++++++++ cpu/activity.hh | 67 ++++++++++++ cpu/o3/alpha_cpu_builder.cc | 3 + cpu/o3/alpha_params.hh | 2 + cpu/o3/cpu.cc | 167 +++++++++--------------------- cpu/o3/cpu.hh | 84 +++++---------- python/m5/objects/AlphaFullCPU.py | 2 +- 8 files changed, 273 insertions(+), 175 deletions(-) create mode 100644 cpu/activity.cc create mode 100644 cpu/activity.hh diff --git a/SConscript b/SConscript index 5546e6f71..e5ca7c380 100644 --- a/SConscript +++ b/SConscript @@ -80,6 +80,7 @@ base_sources = Split(''' base/stats/visit.cc base/stats/text.cc + cpu/activity.cc cpu/base.cc cpu/base_dyn_inst.cc cpu/cpu_exec_context.cc diff --git a/cpu/activity.cc b/cpu/activity.cc new file mode 100644 index 000000000..6dcb6e341 --- /dev/null +++ b/cpu/activity.cc @@ -0,0 +1,122 @@ + +#include "base/timebuf.hh" +#include "cpu/activity.hh" + +ActivityRecorder::ActivityRecorder(int num_stages, int longest_latency, + int activity) + : activityBuffer(longest_latency, 0), longestLatency(longest_latency), + activityCount(activity), numStages(num_stages) +{ + stageActive = new bool[numStages]; + memset(stageActive, 0, numStages); +} + +void +ActivityRecorder::activity() +{ + if (activityBuffer[0]) { + return; + } + + activityBuffer[0] = true; + + ++activityCount; + + DPRINTF(Activity, "Activity: %i\n", activityCount); +} + +void +ActivityRecorder::advance() +{ + if (activityBuffer[-longestLatency]) { + --activityCount; + + assert(activityCount >= 0); + + DPRINTF(Activity, "Activity: %i\n", activityCount); + + if (activityCount == 0) { + DPRINTF(Activity, "No activity left!\n"); + } + } + + activityBuffer.advance(); +} + +void +ActivityRecorder::activateStage(const int idx) +{ + if (!stageActive[idx]) { + ++activityCount; + + stageActive[idx] = true; + + DPRINTF(Activity, "Activity: %i\n", activityCount); + } else { + DPRINTF(Activity, "Stage %i already active.\n", 
idx); + } + +// assert(activityCount < longestLatency + numStages + 1); +} + +void +ActivityRecorder::deactivateStage(const int idx) +{ + if (stageActive[idx]) { + --activityCount; + + stageActive[idx] = false; + + DPRINTF(Activity, "Activity: %i\n", activityCount); + } else { + DPRINTF(Activity, "Stage %i already inactive.\n", idx); + } + + assert(activityCount >= 0); +} + +void +ActivityRecorder::reset() +{ + activityCount = 0; + memset(stageActive, 0, numStages); + for (int i = 0; i < longestLatency + 1; ++i) + activityBuffer.advance(); +} + +void +ActivityRecorder::dump() +{ + for (int i = 0; i <= longestLatency; ++i) { + cprintf("[Idx:%i %i] ", i, activityBuffer[-i]); + } + + cprintf("\n"); + + for (int i = 0; i < numStages; ++i) { + cprintf("[Stage:%i %i]\n", i, stageActive[i]); + } + + cprintf("\n"); + + cprintf("Activity count: %i\n", activityCount); +} + +void +ActivityRecorder::validate() +{ + int count = 0; + for (int i = 0; i <= longestLatency; ++i) { + if (activityBuffer[-i]) { + count++; + } + } + + for (int i = 0; i < numStages; ++i) { + if (stageActive[i]) { + count++; + } + } + + assert(count == activityCount); +} diff --git a/cpu/activity.hh b/cpu/activity.hh new file mode 100644 index 000000000..2d53dc4bb --- /dev/null +++ b/cpu/activity.hh @@ -0,0 +1,67 @@ + +#ifndef __CPU_ACTIVITY_HH__ +#define __CPU_ACTIVITY_HH__ + +#include "base/timebuf.hh" +#include "base/trace.hh" + +class ActivityRecorder { + public: + ActivityRecorder(int num_stages, int longest_latency, int count); + + /** Records that there is activity this cycle. */ + void activity(); + /** Advances the activity buffer, decrementing the activityCount if active + * communication just left the time buffer, and descheduling the CPU if + * there is no activity. + */ + void advance(); + /** Marks a stage as active. */ + void activateStage(const int idx); + /** Deactivates a stage. 
*/ + void deactivateStage(const int idx); + + int getActivityCount() { return activityCount; } + + void setActivityCount(int count) + { activityCount = count; } + + bool active() { return activityCount; } + + void reset(); + + void dump(); + + void validate(); + + private: + /** Time buffer that tracks if any cycles has active communication + * in them. It should be as long as the longest communication + * latency in the system. Each time any time buffer is written, + * the activity buffer should also be written to. The + * activityBuffer is advanced along with all the other time + * buffers, so it should have a 1 somewhere in it only if there + * is active communication in a time buffer. + */ + TimeBuffer activityBuffer; + + int longestLatency; + + /** Tracks how many stages and cycles of time buffer have + * activity. Stages increment this count when they switch to + * active, and decrement it when they switch to + * inactive. Whenever a cycle that previously had no information + * is written in the time buffer, this is incremented. When a + * cycle that had information exits the time buffer due to age, + * this count is decremented. When the count is 0, there is no + * activity in the CPU, and it can be descheduled. + */ + int activityCount; + + int numStages; + + /** Records which stages are active/inactive. 
*/ + bool *stageActive; +}; + +#endif // __CPU_ACTIVITY_HH__ diff --git a/cpu/o3/alpha_cpu_builder.cc b/cpu/o3/alpha_cpu_builder.cc index 0f9116d71..b0d812edc 100644 --- a/cpu/o3/alpha_cpu_builder.cc +++ b/cpu/o3/alpha_cpu_builder.cc @@ -48,6 +48,7 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(DerivAlphaFullCPU) Param clock; Param numThreads; +Param activity; #if FULL_SYSTEM SimObjectParam system; @@ -156,6 +157,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivAlphaFullCPU) INIT_PARAM(clock, "clock speed"), INIT_PARAM(numThreads, "number of HW thread contexts"), + INIT_PARAM_DFLT(activity, "Initial activity count", 0), #if FULL_SYSTEM INIT_PARAM(system, "System object"), @@ -301,6 +303,7 @@ CREATE_SIM_OBJECT(DerivAlphaFullCPU) params->name = getInstanceName(); params->numberOfThreads = actual_num_threads; + params->activity = activity; #if FULL_SYSTEM params->system = system; diff --git a/cpu/o3/alpha_params.hh b/cpu/o3/alpha_params.hh index b8ebae21e..e3acf2c05 100644 --- a/cpu/o3/alpha_params.hh +++ b/cpu/o3/alpha_params.hh @@ -64,6 +64,8 @@ class AlphaSimpleParams : public BaseFullCPU::Params BaseCPU *checker; + unsigned activity; + // // Caches // diff --git a/cpu/o3/cpu.cc b/cpu/o3/cpu.cc index 9a46f2e7c..8d72bdc41 100644 --- a/cpu/o3/cpu.cc +++ b/cpu/o3/cpu.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -33,8 +33,8 @@ #else #include "sim/process.hh" #endif -#include "sim/root.hh" +#include "cpu/activity.hh" #include "cpu/checker/cpu.hh" #include "cpu/cpu_exec_context.hh" #include "cpu/exec_context.hh" @@ -42,6 +42,7 @@ #include "cpu/o3/alpha_impl.hh" #include "cpu/o3/cpu.hh" +#include "sim/root.hh" #include "sim/stat_control.hh" using namespace std; @@ -104,16 +105,15 @@ FullO3CPU::FullO3CPU(Params *params) TheISA::NumMiscRegs * number_of_threads, TheISA::ZeroReg), - // What to pass to these time buffers? // For now just have these time buffers be pretty big. - // @todo: Make these time buffer sizes parameters. + // @todo: Make these time buffer sizes parameters or derived + // from latencies timeBuffer(5, 5), fetchQueue(5, 5), decodeQueue(5, 5), renameQueue(5, 5), iewQueue(5, 5), - activityBuffer(5, 0), - activityCount(0), + activityRec(NumStages, 10, params->activity), globalSeqNum(1), @@ -150,9 +150,9 @@ FullO3CPU::FullO3CPU(Params *params) tids.resize(number_of_threads); #endif - // The stages also need their CPU pointer setup. However this must be - // done at the upper level CPU because they have pointers to the upper - // level CPU, and not this FullO3CPU. + // The stages also need their CPU pointer setup. However this + // must be done at the upper level CPU because they have pointers + // to the upper level CPU, and not this FullO3CPU. // Set up Pointers to the activeThreads list for each stage fetch.setActiveThreads(&activeThreads); @@ -207,11 +207,11 @@ FullO3CPU::FullO3CPU(Params *params) commitRenameMap[tid].init(TheISA::NumIntRegs, params->numPhysIntRegs, - lreg_idx, //Index for Logical. Regs + lreg_idx, //Index for Logical. 
Regs TheISA::NumFloatRegs, params->numPhysFloatRegs, - freg_idx, //Index for Float Regs + freg_idx, //Index for Float Regs TheISA::NumMiscRegs, @@ -223,11 +223,11 @@ FullO3CPU::FullO3CPU(Params *params) renameMap[tid].init(TheISA::NumIntRegs, params->numPhysIntRegs, - lreg_idx, //Index for Logical. Regs + lreg_idx, //Index for Logical. Regs TheISA::NumFloatRegs, params->numPhysFloatRegs, - freg_idx, //Index for Float Regs + freg_idx, //Index for Float Regs TheISA::NumMiscRegs, @@ -258,10 +258,6 @@ FullO3CPU::FullO3CPU(Params *params) lastRunningCycle = curTick; - for (int i = 0; i < NumStages; ++i) { - stageActive[i] = false; - } - contextSwitch = false; } @@ -336,7 +332,7 @@ FullO3CPU::tick() ++numCycles; - activity = false; +// activity = false; //Tick each of the stages fetch.tick(); @@ -361,14 +357,22 @@ FullO3CPU::tick() renameQueue.advance(); iewQueue.advance(); - advanceActivityBuffer(); + activityRec.advance(); if (removeInstsThisCycle) { cleanUpRemovedInsts(); } - if (_status != SwitchedOut && activityCount && !tickEvent.scheduled()) { - tickEvent.schedule(curTick + cycles(1)); + if (!tickEvent.scheduled()) { + if (_status == SwitchedOut) { + // increment stat + lastRunningCycle = curTick; + } else if (!activityRec.active()) { + lastRunningCycle = curTick; + timesIdled++; + } else { + tickEvent.schedule(curTick + cycles(1)); + } } #if !FULL_SYSTEM @@ -592,7 +596,7 @@ FullO3CPU::activateContext(int tid, int delay) // Be sure to signal that there's some activity so the CPU doesn't // deschedule itself. - activityThisCycle(); + activityRec.activity(); fetch.wakeFromQuiesce(); _status = Running; @@ -669,13 +673,18 @@ FullO3CPU::switchOut(Sampler *_sampler) rename.switchOut(); iew.switchOut(); commit.switchOut(); + + // Wake the CPU and record activity so everything can drain out if + // the CPU is currently idle. 
+ wakeCPU(); + activityRec.activity(); } template void FullO3CPU::signalSwitched() { - if (++switchCount == 5) { + if (++switchCount == NumStages) { fetch.doSwitchOut(); rename.doSwitchOut(); commit.doSwitchOut(); @@ -699,18 +708,16 @@ template void FullO3CPU::takeOverFrom(BaseCPU *oldCPU) { - // Flush out any old data from the activity buffers. - for (int i = 0; i < 6; ++i) { + // Flush out any old data from the time buffers. + for (int i = 0; i < 10; ++i) { timeBuffer.advance(); fetchQueue.advance(); decodeQueue.advance(); renameQueue.advance(); iewQueue.advance(); - activityBuffer.advance(); } - activityCount = 0; - bzero(&stageActive, sizeof(stageActive)); + activityRec.reset(); BaseCPU::takeOverFrom(oldCPU); @@ -722,23 +729,23 @@ FullO3CPU::takeOverFrom(BaseCPU *oldCPU) assert(!tickEvent.scheduled()); - // @todo: Figure out how to properly select the tid to put onto the active threads list. + // @todo: Figure out how to properly select the tid to put onto + // the active threads list. int tid = 0; list::iterator isActive = find( activeThreads.begin(), activeThreads.end(), tid); if (isActive == activeThreads.end()) { - //May Need to Re-code this if the delay variable is the - //delay needed for thread to activate + //May Need to Re-code this if the delay variable is the delay + //needed for thread to activate DPRINTF(FullCPU, "Adding Thread %i to active threads list\n", tid); activeThreads.push_back(tid); } - // Set all status's to active, schedule the - // CPU's tick event. + // Set all statuses to active, schedule the CPU's tick event. 
// @todo: Fix up statuses so this is handled properly for (int i = 0; i < execContexts.size(); ++i) { ExecContext *xc = execContexts[i]; @@ -850,10 +857,6 @@ template void FullO3CPU::setArchIntReg(int reg_idx, uint64_t val, unsigned tid) { - if (reg_idx == TheISA::ZeroReg) { - warn("Setting r31 through ArchIntReg in CPU, cycle %i\n", curTick); - } - PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx); regFile.setIntReg(phys_reg, val); @@ -1049,8 +1052,8 @@ FullO3CPU::squashInstIt(const ListIt &instIt, const unsigned &tid) // Mark it as squashed. (*instIt)->setSquashed(); - //@todo: Formulate a consistent method for deleting - //instructions from the instruction list + // @todo: Formulate a consistent method for deleting + // instructions from the instruction list // Remove the instruction from the list. removeList.push(instIt); } @@ -1074,14 +1077,14 @@ FullO3CPU::cleanUpRemovedInsts() removeInstsThisCycle = false; } - +/* template void FullO3CPU::removeAllInsts() { instList.clear(); } - +*/ template void FullO3CPU::dumpInsts() @@ -1102,98 +1105,30 @@ FullO3CPU::dumpInsts() ++num; } } - +/* template void FullO3CPU::wakeDependents(DynInstPtr &inst) { iew.wakeDependents(inst); } - +*/ template void FullO3CPU::wakeCPU() { - if (activityCount || tickEvent.scheduled()) { + if (activityRec.active() || tickEvent.scheduled()) { + DPRINTF(Activity, "CPU already running.\n"); return; } - idleCycles += curTick - lastRunningCycle; + DPRINTF(Activity, "Waking up CPU\n"); + + idleCycles += (curTick - 1) - lastRunningCycle; tickEvent.schedule(curTick); } -template -void -FullO3CPU::activityThisCycle() -{ - if (activityBuffer[0]) { - return; - } - - activityBuffer[0] = true; - activity = true; - ++activityCount; - - DPRINTF(Activity, "Activity: %i\n", activityCount); -} - -template -void -FullO3CPU::advanceActivityBuffer() -{ - if (activityBuffer[-5]) { - --activityCount; - - assert(activityCount >= 0); - - DPRINTF(Activity, "Activity: %i\n", activityCount); - - if 
(activityCount == 0) { - DPRINTF(FullCPU, "No activity left, going to idle!\n"); - lastRunningCycle = curTick; - timesIdled++; - } - } - - activityBuffer.advance(); -} - -template -void -FullO3CPU::activateStage(const StageIdx idx) -{ - if (!stageActive[idx]) { - ++activityCount; - - stageActive[idx] = true; - - DPRINTF(Activity, "Activity: %i\n", activityCount); - } else { - DPRINTF(Activity, "Stage %i already active.\n", idx); - } - - // @todo: Number is hardcoded for now. Replace with parameter. - assert(activityCount < 15); -} - -template -void -FullO3CPU::deactivateStage(const StageIdx idx) -{ - if (stageActive[idx]) { - --activityCount; - - stageActive[idx] = false; - - DPRINTF(Activity, "Activity: %i\n", activityCount); - } else { - DPRINTF(Activity, "Stage %i already inactive.\n", idx); - } - - assert(activityCount >= 0); -} - template int FullO3CPU::getFreeTid() diff --git a/cpu/o3/cpu.hh b/cpu/o3/cpu.hh index 789729e61..8db65d501 100644 --- a/cpu/o3/cpu.hh +++ b/cpu/o3/cpu.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __CPU_O3_FULL_CPU_HH__ -#define __CPU_O3_FULL_CPU_HH__ +#ifndef __CPU_O3_CPU_HH__ +#define __CPU_O3_CPU_HH__ #include #include @@ -38,6 +38,7 @@ #include "base/statistics.hh" #include "base/timebuf.hh" #include "config/full_system.hh" +#include "cpu/activity.hh" #include "cpu/base.hh" #include "cpu/cpu_exec_context.hh" #include "cpu/o3/comm.hh" @@ -70,7 +71,7 @@ template class FullO3CPU : public BaseFullCPU { public: - //Put typedefs from the Impl here. + // Typedefs from the Impl here. typedef typename Impl::CPUPol CPUPolicy; typedef typename Impl::Params Params; typedef typename Impl::DynInstPtr DynInstPtr; @@ -191,20 +192,18 @@ class FullO3CPU : public BaseFullCPU * Note: this is a virtual function. 
CPU-Specific * functionality defined in derived classes */ - virtual void syscall(int tid) {} + virtual void syscall(int tid) { panic("Unimplemented!"); } /** Check if there are any system calls pending. */ void checkSyscalls(); /** Switches out this CPU. - * @todo: Implement this. */ void switchOut(Sampler *sampler); void signalSwitched(); /** Takes over from another CPU. - * @todo: Implement this. */ void takeOverFrom(BaseCPU *oldCPU); @@ -299,12 +298,8 @@ class FullO3CPU : public BaseFullCPU /** Add Instructions to the CPU Remove List*/ void addToRemoveList(DynInstPtr &inst); - /** Remove an instruction from the front of the list. It is expected - * that there are no instructions in front of it (that is, none are older - * than the instruction being removed). Used when retiring instructions. - * @todo: Remove the argument to this function, and just have it remove - * last instruction once it's verified that commit has the same ordering - * as the instruction list. + /** Remove an instruction from the front end of the list. There's + * no restriction on location of the instruction. */ void removeFrontInst(DynInstPtr &inst); @@ -319,15 +314,15 @@ class FullO3CPU : public BaseFullCPU void cleanUpRemovedInsts(); /** Remove all instructions from the list. */ - void removeAllInsts(); +// void removeAllInsts(); void dumpInsts(); /** Basically a wrapper function so that instructions executed at - * commit can tell the instruction queue that they have completed. - * Eventually this hack should be removed. + * commit can tell the instruction queue that they have + * completed. Eventually this hack should be removed. */ - void wakeDependents(DynInstPtr &inst); +// void wakeDependents(DynInstPtr &inst); public: /** List of all the instructions in flight. 
*/ @@ -338,12 +333,12 @@ class FullO3CPU : public BaseFullCPU */ std::queue removeList; -//#ifdef DEBUG +#ifdef DEBUG std::set snList; -//#endif +#endif - /** Records if instructions need to be removed this cycle due to being - * retired or squashed. + /** Records if instructions need to be removed this cycle due to + * being retired or squashed. */ bool removeInstsThisCycle; @@ -425,46 +420,19 @@ class FullO3CPU : public BaseFullCPU /** The IEW stage's instruction queue. */ TimeBuffer iewQueue; - private: - /** Time buffer that tracks if any cycles has active communication in them. - * It should be as long as the longest communication latency in the system. - * Each time any time buffer is written, the activity buffer should also - * be written to. The activityBuffer is advanced along with all the other - * time buffers, so it should always have a 1 somewhere in it only if there - * is active communication in a time buffer. - */ - TimeBuffer activityBuffer; - - /** Tracks how many stages and cycles of time buffer have activity. Stages - * increment this count when they switch to active, and decrement it when - * they switch to inactive. Whenever a cycle that previously had no - * information is written in the time buffer, this is incremented. When - * a cycle that had information exits the time buffer due to age, this - * count is decremented. When the count is 0, there is no activity in the - * CPU, and it can be descheduled. - */ - int activityCount; - - /** Records if there has been activity this cycle. */ - bool activity; - - /** Records which stages are active/inactive. */ - bool stageActive[NumStages]; - public: + ActivityRecorder activityRec; + + void activityThisCycle() { activityRec.activity(); } + + void activateStage(const StageIdx idx) + { activityRec.activateStage(idx); } + + void deactivateStage(const StageIdx idx) + { activityRec.deactivateStage(idx); } + /** Wakes the CPU, rescheduling the CPU if it's not already active. 
*/ void wakeCPU(); - /** Records that there is activity this cycle. */ - void activityThisCycle(); - /** Advances the activity buffer, decrementing the activityCount if active - * communication just left the time buffer, and descheduling the CPU if - * there is no activity. - */ - void advanceActivityBuffer(); - /** Marks a stage as active. */ - void activateStage(const StageIdx idx); - /** Deactivates a stage. */ - void deactivateStage(const StageIdx idx); /** Gets a free thread id. Use if thread ids change across system. */ int getFreeTid(); @@ -550,4 +518,4 @@ class FullO3CPU : public BaseFullCPU Stats::Formula totalIpc; }; -#endif +#endif // __CPU_O3_CPU_HH__ diff --git a/python/m5/objects/AlphaFullCPU.py b/python/m5/objects/AlphaFullCPU.py index 1541b9494..d719bf783 100644 --- a/python/m5/objects/AlphaFullCPU.py +++ b/python/m5/objects/AlphaFullCPU.py @@ -3,7 +3,7 @@ from BaseCPU import BaseCPU class DerivAlphaFullCPU(BaseCPU): type = 'DerivAlphaFullCPU' - + activity = Param.Unsigned("Initial count") numThreads = Param.Unsigned("number of HW thread contexts") if not build_env['FULL_SYSTEM']: From 5df3e61f168a5dd7d86ba2f81538539622d77bd2 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Fri, 19 May 2006 15:44:03 -0400 Subject: [PATCH 36/50] IEW/IQ code cleanup and reorganization. Dependecy graph code moved into its own class. This requires the changes to the functional units, which is in the next check in. cpu/o3/iew.hh: cpu/o3/iew_impl.hh: IEW and IQ code cleanup and reorganization. cpu/o3/inst_queue.cc: Dependency graph code moved into its own class now. cpu/o3/inst_queue.hh: IEW/IQ code cleanup and reorganization. Dependecy graph code moved into its own class. cpu/o3/inst_queue_impl.hh: IEW/IQ code cleanup and reorganization. Dependecy graph code moved into its own class. Issue loop cleaned up, with completion events for functional units now used more correctly (before they weren't used for multi-cycle ops with pipelined FU's). 
--HG-- extra : convert_revision : 35e50192df6f71dc81d46a73fdd65f7ec07c10e4 --- cpu/o3/dep_graph.hh | 213 +++++++++++++++++ cpu/o3/iew.hh | 57 ++--- cpu/o3/iew_impl.hh | 152 +++++------- cpu/o3/inst_queue.cc | 4 - cpu/o3/inst_queue.hh | 114 +++------ cpu/o3/inst_queue_impl.hh | 480 +++++++++++--------------------------- 6 files changed, 468 insertions(+), 552 deletions(-) create mode 100644 cpu/o3/dep_graph.hh diff --git a/cpu/o3/dep_graph.hh b/cpu/o3/dep_graph.hh new file mode 100644 index 000000000..f8ae38da4 --- /dev/null +++ b/cpu/o3/dep_graph.hh @@ -0,0 +1,213 @@ + +#ifndef __CPU_O3_DEP_GRAPH_HH__ +#define __CPU_O3_DEP_GRAPH_HH__ + +#include "cpu/o3/comm.hh" + +template +class DependencyEntry +{ + public: + DependencyEntry() + : inst(NULL), next(NULL) + { } + + DynInstPtr inst; + //Might want to include data about what arch. register the + //dependence is waiting on. + DependencyEntry *next; +}; + +template +class DependencyGraph +{ + public: + typedef DependencyEntry DepEntry; + + DependencyGraph() + : numEntries(0), memAllocCounter(0), nodesTraversed(0), nodesRemoved(0) + { } + + void resize(int num_entries); + + void reset(); + + void insert(PhysRegIndex idx, DynInstPtr &new_inst); + + void setInst(PhysRegIndex idx, DynInstPtr &new_inst) + { dependGraph[idx].inst = new_inst; } + + void clearInst(PhysRegIndex idx) + { dependGraph[idx].inst = NULL; } + + void remove(PhysRegIndex idx, DynInstPtr &inst_to_remove); + + DynInstPtr pop(PhysRegIndex idx); + + bool empty(PhysRegIndex idx) { return !dependGraph[idx].next; } + + /** Debugging function to dump out the dependency graph. + */ + void dump(); + + private: + /** Array of linked lists. Each linked list is a list of all the + * instructions that depend upon a given register. The actual + * register's index is used to index into the graph; ie all + * instructions in flight that are dependent upon r34 will be + * in the linked list of dependGraph[34]. 
+ */ + DepEntry *dependGraph; + + int numEntries; + + // Debug variable, remove when done testing. + unsigned memAllocCounter; + + public: + uint64_t nodesTraversed; + uint64_t nodesRemoved; +}; + +template +void +DependencyGraph::resize(int num_entries) +{ + numEntries = num_entries; + dependGraph = new DepEntry[numEntries]; +} + +template +void +DependencyGraph::reset() +{ + // Clear the dependency graph + DepEntry *curr; + DepEntry *prev; + + for (int i = 0; i < numEntries; ++i) { + curr = dependGraph[i].next; + + while (curr) { + memAllocCounter--; + + prev = curr; + curr = prev->next; + prev->inst = NULL; + + delete prev; + } + + if (dependGraph[i].inst) { + dependGraph[i].inst = NULL; + } + + dependGraph[i].next = NULL; + } +} + +template +void +DependencyGraph::insert(PhysRegIndex idx, DynInstPtr &new_inst) +{ + //Add this new, dependent instruction at the head of the dependency + //chain. + + // First create the entry that will be added to the head of the + // dependency chain. + DepEntry *new_entry = new DepEntry; + new_entry->next = dependGraph[idx].next; + new_entry->inst = new_inst; + + // Then actually add it to the chain. + dependGraph[idx].next = new_entry; + + ++memAllocCounter; +} + + +template +void +DependencyGraph::remove(PhysRegIndex idx, + DynInstPtr &inst_to_remove) +{ + DepEntry *prev = &dependGraph[idx]; + DepEntry *curr = dependGraph[idx].next; + + // Make sure curr isn't NULL. Because this instruction is being + // removed from a dependency list, it must have been placed there at + // an earlier time. The dependency chain should not be empty, + // unless the instruction dependent upon it is already ready. + if (curr == NULL) { + return; + } + + nodesRemoved++; + + // Find the instruction to remove within the dependency linked list. + while (curr->inst != inst_to_remove) { + prev = curr; + curr = curr->next; + nodesTraversed++; + + assert(curr != NULL); + } + + // Now remove this instruction from the list. 
+ prev->next = curr->next; + + --memAllocCounter; + + // Could push this off to the destructor of DependencyEntry + curr->inst = NULL; + + delete curr; +} + +template +DynInstPtr +DependencyGraph::pop(PhysRegIndex idx) +{ + DepEntry *node; + node = dependGraph[idx].next; + DynInstPtr inst = NULL; + if (node) { + inst = node->inst; + dependGraph[idx].next = node->next; + node->inst = NULL; + memAllocCounter--; + delete node; + } + return inst; +} + +template +void +DependencyGraph::dump() +{ + DepEntry *curr; + + for (int i = 0; i < numEntries; ++i) + { + curr = &dependGraph[i]; + + if (curr->inst) { + cprintf("dependGraph[%i]: producer: %#x [sn:%lli] consumer: ", + i, curr->inst->readPC(), curr->inst->seqNum); + } else { + cprintf("dependGraph[%i]: No producer. consumer: ", i); + } + + while (curr->next != NULL) { + curr = curr->next; + + cprintf("%#x [sn:%lli] ", + curr->inst->readPC(), curr->inst->seqNum); + } + + cprintf("\n"); + } + cprintf("memAllocCounter: %i\n", memAllocCounter); +} + +#endif // __CPU_O3_DEP_GRAPH_HH__ diff --git a/cpu/o3/iew.hh b/cpu/o3/iew.hh index 72be25668..935320628 100644 --- a/cpu/o3/iew.hh +++ b/cpu/o3/iew.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -41,20 +41,23 @@ class FUPool; /** - * DefaultIEW handles both single threaded and SMT IEW(issue/execute/writeback). - * It handles the dispatching of instructions to the LSQ/IQ as part of the issue - * stage, and has the IQ try to issue instructions each cycle. The execute - * latency is actually tied into the issue latency to allow the IQ to be able to + * DefaultIEW handles both single threaded and SMT IEW + * (issue/execute/writeback). 
It handles the dispatching of + * instructions to the LSQ/IQ as part of the issue stage, and has the + * IQ try to issue instructions each cycle. The execute latency is + * actually tied into the issue latency to allow the IQ to be able to * do back-to-back scheduling without having to speculatively schedule - * instructions. This happens by having the IQ have access to the functional - * units, and the IQ gets the execution latencies from the FUs when it issues - * instructions. Instructions reach the execute stage on the last cycle of - * their execution, which is when the IQ knows to wake up any dependent - * instructions, allowing back to back scheduling. The execute portion of IEW - * separates memory instructions from non-memory instructions, either telling - * the LSQ to execute the instruction, or executing the instruction directly. - * The writeback portion of IEW completes the instructions by waking up any - * dependents, and marking the register ready on the scoreboard. + * instructions. This happens by having the IQ have access to the + * functional units, and the IQ gets the execution latencies from the + * FUs when it issues instructions. Instructions reach the execute + * stage on the last cycle of their execution, which is when the IQ + * knows to wake up any dependent instructions, allowing back to back + * scheduling. The execute portion of IEW separates memory + * instructions from non-memory instructions, either telling the LSQ + * to execute the instruction, or executing the instruction directly. + * The writeback portion of IEW completes the instructions by waking + * up any dependents, and marking the register ready on the + * scoreboard. */ template class DefaultIEW @@ -214,10 +217,8 @@ class DefaultIEW /** Tells CPU that the IEW stage is inactive and idle. */ inline void deactivateStage(); -//#if !FULL_SYSTEM /** Returns if the LSQ has any stores to writeback. 
*/ bool hasStoresToWB() { return ldstQueue.hasStoresToWB(); } -//#endif private: /** Sends commit proper information for a squash due to a branch @@ -469,10 +470,10 @@ class DefaultIEW /** Stat for total number of mispredicted branches detected at execute. */ Stats::Formula branchMispredicts; - Stats::Vector<> exe_swp; - Stats::Vector<> exe_nop; - Stats::Vector<> exe_refs; - Stats::Vector<> exe_branches; + Stats::Vector<> exeSwp; + Stats::Vector<> exeNop; + Stats::Vector<> exeRefs; + Stats::Vector<> exeBranches; // Stats::Vector<> issued_ops; /* @@ -481,20 +482,20 @@ class DefaultIEW Stats::Vector<> dist_unissued; Stats::Vector2d<> stat_issued_inst_type; */ - Stats::Formula issue_rate; + Stats::Formula issueRate; Stats::Formula iewExecStoreInsts; // Stats::Formula issue_op_rate; // Stats::Formula fu_busy_rate; Stats::Vector<> iewInstsToCommit; - Stats::Vector<> writeback_count; - Stats::Vector<> producer_inst; - Stats::Vector<> consumer_inst; - Stats::Vector<> wb_penalized; + Stats::Vector<> writebackCount; + Stats::Vector<> producerInst; + Stats::Vector<> consumerInst; + Stats::Vector<> wbPenalized; - Stats::Formula wb_rate; - Stats::Formula wb_fanout; - Stats::Formula wb_penalized_rate; + Stats::Formula wbRate; + Stats::Formula wbFanout; + Stats::Formula wbPenalizedRate; }; #endif // __CPU_O3_IEW_HH__ diff --git a/cpu/o3/iew_impl.hh b/cpu/o3/iew_impl.hh index cbd7396f7..59f4055a6 100644 --- a/cpu/o3/iew_impl.hh +++ b/cpu/o3/iew_impl.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -69,7 +69,7 @@ DefaultIEW::LdWritebackEvent::process() if (!inst->isExecuted()) { inst->setExecuted(); - // Execute again to copy data to proper place. + // Complete access to copy data to proper place. 
if (inst->isStore()) { inst->completeAcc(); } @@ -78,7 +78,6 @@ DefaultIEW::LdWritebackEvent::process() // Need to insert instruction into queue to commit iewStage->instToCommit(inst); - //wroteToTimeBuffer = true; iewStage->activityThisCycle(); inst = NULL; @@ -93,8 +92,7 @@ DefaultIEW::LdWritebackEvent::description() template DefaultIEW::DefaultIEW(Params *params) - : // Just make this time buffer really big for now - // @todo: Make this into a parameter. + : // @todo: Make this into a parameter. issueToExecQueue(5, 5), instQueue(params), ldstQueue(params), @@ -108,7 +106,6 @@ DefaultIEW::DefaultIEW(Params *params) numThreads(params->numberOfThreads), switchedOut(false) { - DPRINTF(IEW, "executeIntWidth: %i.\n", params->executeIntWidth); _status = Active; exeStatus = Running; wbStatus = Idle; @@ -130,7 +127,6 @@ DefaultIEW::DefaultIEW(Params *params) updateLSQNextCycle = false; - // @todo: Make into a parameter skidBufferMax = (3 * (renameToIEWDelay * params->renameWidth)) + issueWidth; } @@ -149,8 +145,6 @@ DefaultIEW::regStats() instQueue.regStats(); - //ldstQueue.regStats(); - iewIdleCycles .name(name() + ".iewIdleCycles") .desc("Number of cycles IEW is idle"); @@ -167,8 +161,6 @@ DefaultIEW::regStats() .name(name() + ".iewUnblockCycles") .desc("Number of cycles IEW is unblocking"); -// iewWBInsts; - iewDispatchedInsts .name(name() + ".iewDispatchedInsts") .desc("Number of instructions dispatched to IQ"); @@ -206,11 +198,7 @@ DefaultIEW::regStats() .name(name() + ".iewExecLoadInsts") .desc("Number of load instructions executed") .flags(total); -/* - iewExecStoreInsts - .name(name() + ".iewExecStoreInsts") - .desc("Number of store instructions executed"); -*/ + iewExecSquashedInsts .name(name() + ".iewExecSquashedInsts") .desc("Number of squashed instructions skipped in execute"); @@ -233,47 +221,47 @@ DefaultIEW::regStats() branchMispredicts = predictedTakenIncorrect + predictedNotTakenIncorrect; - exe_swp + exeSwp .init(cpu->number_of_threads) .name(name() + 
".EXEC:swp") .desc("number of swp insts executed") .flags(total) ; - exe_nop + exeNop .init(cpu->number_of_threads) .name(name() + ".EXEC:nop") .desc("number of nop insts executed") .flags(total) ; - exe_refs + exeRefs .init(cpu->number_of_threads) .name(name() + ".EXEC:refs") .desc("number of memory reference insts executed") .flags(total) ; - exe_branches + exeBranches .init(cpu->number_of_threads) .name(name() + ".EXEC:branches") .desc("Number of branches executed") .flags(total) ; - issue_rate + issueRate .name(name() + ".EXEC:rate") .desc("Inst execution rate") .flags(total) ; - issue_rate = iewExecutedInsts / cpu->numCycles; + issueRate = iewExecutedInsts / cpu->numCycles; iewExecStoreInsts .name(name() + ".EXEC:stores") .desc("Number of stores executed") .flags(total) ; - iewExecStoreInsts = exe_refs - iewExecLoadInsts; + iewExecStoreInsts = exeRefs - iewExecLoadInsts; /* for (int i=0; i::regStats() .flags(total) ; - writeback_count + writebackCount .init(cpu->number_of_threads) .name(name() + ".WB:count") .desc("cumulative count of insts written-back") .flags(total) ; - producer_inst + producerInst .init(cpu->number_of_threads) .name(name() + ".WB:producers") .desc("num instructions producing a value") .flags(total) ; - consumer_inst + consumerInst .init(cpu->number_of_threads) .name(name() + ".WB:consumers") .desc("num instructions consuming a value") .flags(total) ; - wb_penalized + wbPenalized .init(cpu->number_of_threads) .name(name() + ".WB:penalized") .desc("number of instrctions required to write to 'other' IQ") .flags(total) ; - wb_penalized_rate + wbPenalizedRate .name(name() + ".WB:penalized_rate") .desc ("fraction of instructions written-back that wrote to 'other' IQ") .flags(total) ; - wb_penalized_rate = wb_penalized / writeback_count; + wbPenalizedRate = wbPenalized / writebackCount; - wb_fanout + wbFanout .name(name() + ".WB:fanout") .desc("average fanout of values written-back") .flags(total) ; - wb_fanout = producer_inst / consumer_inst; + 
wbFanout = producerInst / consumerInst; - wb_rate + wbRate .name(name() + ".WB:rate") .desc("insts written-back per cycle") .flags(total) ; - wb_rate = writeback_count / cpu->numCycles; + wbRate = writebackCount / cpu->numCycles; } template @@ -507,7 +495,7 @@ DefaultIEW::squash(unsigned tid) instQueue.squash(tid); // Tell the LDSTQ to start squashing. - ldstQueue.squash(fromCommit->commitInfo[tid].doneSeqNum,tid); + ldstQueue.squash(fromCommit->commitInfo[tid].doneSeqNum, tid); updatedQueues = true; @@ -543,18 +531,15 @@ DefaultIEW::squashDueToBranch(DynInstPtr &inst, unsigned tid) DPRINTF(IEW, "[tid:%i]: Squashing from a specific instruction, PC: %#x " "[sn:%i].\n", tid, inst->readPC(), inst->seqNum); - // Tell rename to squash through the time buffer. toCommit->squash[tid] = true; toCommit->squashedSeqNum[tid] = inst->seqNum; toCommit->mispredPC[tid] = inst->readPC(); toCommit->nextPC[tid] = inst->readNextPC(); toCommit->branchMispredict[tid] = true; - // Prediction was incorrect, so send back inverse. toCommit->branchTaken[tid] = inst->readNextPC() != (inst->readPC() + sizeof(TheISA::MachInst)); toCommit->includeSquashInst[tid] = false; - //toCommit->iewSquashNum[tid] = inst->seqNum; wroteToTimeBuffer = true; } @@ -566,13 +551,11 @@ DefaultIEW::squashDueToMemOrder(DynInstPtr &inst, unsigned tid) DPRINTF(IEW, "[tid:%i]: Squashing from a specific instruction, " "PC: %#x [sn:%i].\n", tid, inst->readPC(), inst->seqNum); - // Tell rename to squash through the time buffer. toCommit->squash[tid] = true; toCommit->squashedSeqNum[tid] = inst->seqNum; toCommit->nextPC[tid] = inst->readNextPC(); toCommit->includeSquashInst[tid] = false; - //toCommit->iewSquashNum[tid] = inst->seqNum; wroteToTimeBuffer = true; } @@ -611,7 +594,6 @@ DefaultIEW::block(unsigned tid) // reprocessed when this stage unblocks. skidInsert(tid); - // Set the status to Blocked. dispatchStatus[tid] = Blocked; } @@ -661,10 +643,7 @@ DefaultIEW::instToCommit(DynInstPtr &inst) // to. 
If there are free write ports at the time, then go ahead // and write the instruction to that time. If there are not, // keep looking back to see where's the first time there's a - // free slot. What happens if you run out of free spaces? - // For now naively assume that all instructions take one cycle. - // Otherwise would have to look into the time buffer based on the - // latency of the instruction. + // free slot. while ((*iewQueue)[wbCycle].insts[wbNumInst]) { ++wbNumInst; if (wbNumInst == issueWidth) { @@ -918,10 +897,10 @@ void DefaultIEW::sortInsts() { int insts_from_rename = fromRename->size; - +#ifdef DEBUG for (int i = 0; i < numThreads; i++) assert(insts[i].empty()); - +#endif for (int i = 0; i < insts_from_rename; ++i) { insts[fromRename->insts[i]->threadNumber].push(fromRename->insts[i]); } @@ -1047,9 +1026,6 @@ DefaultIEW::dispatchInsts(unsigned tid) // Be sure to mark these instructions as ready so that the // commit stage can go ahead and execute them, and mark // them as issued so the IQ doesn't reprocess them. - // ------------- - // @TODO: What happens if the ldstqueue is full? - // Do we process the other instructions? // Check for squashed instructions. if (inst->isSquashed()) { @@ -1125,6 +1101,9 @@ DefaultIEW::dispatchInsts(unsigned tid) ++iewDispStoreInsts; if (inst->isNonSpeculative()) { + // Non-speculative stores (namely store conditionals) + // need to be set as "canCommit()" so that commit can + // process them when they reach the head of commit. inst->setCanCommit(); instQueue.insertNonSpec(inst); add_to_iq = false; @@ -1137,6 +1116,7 @@ DefaultIEW::dispatchInsts(unsigned tid) toRename->iewInfo[tid].dispatchedToLSQ++; #if FULL_SYSTEM } else if (inst->isMemBarrier() || inst->isWriteBarrier()) { + // Same as non-speculative stores. 
inst->setCanCommit(); instQueue.insertBarrier(inst); add_to_iq = false; @@ -1145,7 +1125,7 @@ DefaultIEW::dispatchInsts(unsigned tid) DPRINTF(IEW, "[tid:%i]: Issue: Nonspeculative instruction " "encountered, skipping.\n", tid); - // Same hack as with stores. + // Same as non-speculative stores. inst->setCanCommit(); // Specifically insert it as nonspeculative. @@ -1162,9 +1142,9 @@ DefaultIEW::dispatchInsts(unsigned tid) inst->setExecuted(); inst->setCanCommit(); - instQueue.advanceTail(inst); + instQueue.recordProducer(inst); - exe_nop[tid]++; + exeNop[tid]++; add_to_iq = false; } else if (inst->isExecuted()) { @@ -1175,7 +1155,7 @@ DefaultIEW::dispatchInsts(unsigned tid) inst->setIssued(); inst->setCanCommit(); - instQueue.advanceTail(inst); + instQueue.recordProducer(inst); add_to_iq = false; } else { @@ -1237,7 +1217,6 @@ template void DefaultIEW::executeInsts() { - //bool fetch_redirect[(*activeThreads).size()]; wbNumInst = 0; wbCycle = 0; @@ -1254,20 +1233,17 @@ DefaultIEW::executeInsts() // Execute/writeback any instructions that are available. int inst_num = 0; - for ( ; inst_num < issueWidth && /* Haven't exceeded issue bandwidth */ - fromIssue->insts[inst_num]; - ++inst_num) { + for ( ; inst_num < issueWidth && fromIssue->insts[inst_num]; + ++inst_num) { DPRINTF(IEW, "Execute: Executing instructions from IQ.\n"); - // Get instruction from issue's queue. DynInstPtr inst = fromIssue->insts[inst_num]; DPRINTF(IEW, "Execute: Processing PC %#x, [tid:%i] [sn:%i].\n", inst->readPC(), inst->threadNumber,inst->seqNum); // Check if the instruction is squashed; if so then skip it - // and don't count it towards the FU usage. 
if (inst->isSquashed()) { DPRINTF(IEW, "Execute: Instruction was squashed.\n"); @@ -1299,22 +1275,19 @@ DefaultIEW::executeInsts() // Loads will mark themselves as executed, and their writeback // event adds the instruction to the queue to commit fault = ldstQueue.executeLoad(inst); - -// ++iewExecLoadInsts; } else if (inst->isStore()) { ldstQueue.executeStore(inst); -// ++iewExecStoreInsts; - // If the store had a fault then it may not have a mem req if (inst->req && !(inst->req->flags & LOCKED)) { inst->setExecuted(); instToCommit(inst); } - // Store conditionals will mark themselves as executed, and - // their writeback event will add the instruction to the queue - // to commit. + + // Store conditionals will mark themselves as + // executed, and their writeback event will add the + // instruction to the queue to commit. } else { panic("Unexpected memory type!\n"); } @@ -1329,10 +1302,9 @@ DefaultIEW::executeInsts() updateExeInstStats(inst); - // Check if branch was correct. This check happens after the - // instruction is added to the queue because even if the branch - // is mispredicted, the branch instruction itself is still valid. - // Only handle this if there hasn't already been something that + // Check if branch prediction was correct, if not then we need + // to tell commit to squash in flight instructions. Only + // handle this if there hasn't already been something that // redirects fetch in this group of instructions. // This probably needs to prioritize the redirects if a different @@ -1360,7 +1332,8 @@ DefaultIEW::executeInsts() } else if (ldstQueue.violation(tid)) { fetchRedirect[tid] = true; - // Get the DynInst that caused the violation. Note that this + // If there was an ordering violation, then get the + // DynInst that caused the violation. Note that this // clears the violation signal. 
DynInstPtr violator; violator = ldstQueue.getMemDepViolator(tid); @@ -1409,13 +1382,11 @@ template void DefaultIEW::writebackInsts() { - // Loop through the head of the time buffer and wake any dependents. - // These instructions are about to write back. In the simple model - // this loop can really happen within the previous loop, but when - // instructions have actual latencies, this loop must be separate. - // Also mark scoreboard that this instruction is finally complete. - // Either have IEW have direct access to rename map, or have this as - // part of backwards communication. + // Loop through the head of the time buffer and wake any + // dependents. These instructions are about to write back. Also + // mark scoreboard that this instruction is finally complete. + // Either have IEW have direct access to scoreboard, or have this + // as part of backwards communication. for (int inst_num = 0; inst_num < issueWidth && toCommit->insts[inst_num]; inst_num++) { DynInstPtr inst = toCommit->insts[inst_num]; @@ -1441,9 +1412,9 @@ DefaultIEW::writebackInsts() scoreboard->setReg(inst->renamedDestRegIdx(i)); } - producer_inst[tid]++; - consumer_inst[tid]+= dependents; - writeback_count[tid]++; + producerInst[tid]++; + consumerInst[tid]+= dependents; + writebackCount[tid]++; } } } @@ -1452,8 +1423,6 @@ template void DefaultIEW::tick() { - // Try to fill up issue queue with as many instructions as bandwidth - // allows. wbNumInst = 0; wbCycle = 0; @@ -1462,9 +1431,12 @@ DefaultIEW::tick() sortInsts(); + // Free function units marked as being freed this cycle. + fuPool->processFreeUnits(); + list::iterator threads = (*activeThreads).begin(); - // Check stall and squash signals. + // Check stall and squash signals, dispatch any instructions. 
while (threads != (*activeThreads).end()) { unsigned tid = *threads++; @@ -1472,7 +1444,6 @@ DefaultIEW::tick() checkSignalsAndUpdate(tid); dispatch(tid); - } if (exeStatus != Squashing) { @@ -1502,9 +1473,6 @@ DefaultIEW::tick() // Writeback any stores using any leftover bandwidth. ldstQueue.writebackStores(); - // Free function units marked as being freed this cycle. - fuPool->processFreeUnits(); - // Check the committed load/store signals to see if there's a load // or store to commit. Also check if it's being told to execute a // nonspeculative instruction. @@ -1557,8 +1525,6 @@ DefaultIEW::tick() DPRINTF(IEW, "[tid:%i], Dispatch dispatched %i instructions.\n", tid, toRename->iewInfo[tid].dispatched); - - //thread_queue.pop(); } DPRINTF(IEW, "IQ has %i free entries (Can schedule: %i). " @@ -1585,7 +1551,7 @@ DefaultIEW::updateExeInstStats(DynInstPtr &inst) // #ifdef TARGET_ALPHA if (inst->isDataPrefetch()) - exe_swp[thread_number]++; + exeSwp[thread_number]++; else iewExecutedInsts++; #else @@ -1596,13 +1562,13 @@ DefaultIEW::updateExeInstStats(DynInstPtr &inst) // Control operations // if (inst->isControl()) - exe_branches[thread_number]++; + exeBranches[thread_number]++; // // Memory operations // if (inst->isMemRef()) { - exe_refs[thread_number]++; + exeRefs[thread_number]++; if (inst->isLoad()) { iewExecLoadInsts[thread_number]++; diff --git a/cpu/o3/inst_queue.cc b/cpu/o3/inst_queue.cc index 2ff2282b4..95ae2b699 100644 --- a/cpu/o3/inst_queue.cc +++ b/cpu/o3/inst_queue.cc @@ -32,7 +32,3 @@ // Force instantiation of InstructionQueue. 
template class InstructionQueue; - -template<> -unsigned -InstructionQueue::DependencyEntry::mem_alloc_counter = 0; diff --git a/cpu/o3/inst_queue.hh b/cpu/o3/inst_queue.hh index 982294b4f..6bdf4ddc2 100644 --- a/cpu/o3/inst_queue.hh +++ b/cpu/o3/inst_queue.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -37,6 +37,7 @@ #include "base/statistics.hh" #include "base/timebuf.hh" #include "cpu/inst_seq.hh" +#include "cpu/o3/dep_graph.hh" #include "encumbered/cpu/full/op_class.hh" #include "sim/host.hh" @@ -91,6 +92,8 @@ class InstructionQueue /** Pointer back to the instruction queue. */ InstructionQueue *iqPtr; + bool freeFU; + public: /** Construct a FU completion event. */ FUCompletion(DynInstPtr &_inst, int fu_idx, @@ -98,6 +101,7 @@ class InstructionQueue virtual void process(); virtual const char *description(); + void setFreeFU() { freeFU = true; } }; /** Constructs an IQ. */ @@ -114,8 +118,6 @@ class InstructionQueue void resetState(); - void resetDependencyGraph(); - /** Sets CPU pointer. */ void setCPU(FullCPU *_cpu) { cpu = _cpu; } @@ -170,11 +172,11 @@ class InstructionQueue void insertBarrier(DynInstPtr &barr_inst); /** - * Advances the tail of the IQ, used if an instruction is not added to the - * IQ for scheduling. - * @todo: Rename this function. + * Records the instruction as the producer of a register without + * adding it to the rest of the IQ. */ - void advanceTail(DynInstPtr &inst); + void recordProducer(DynInstPtr &inst) + { addToProducers(inst); } /** Process FU completion event. */ void processFUCompletion(DynInstPtr &inst, int fu_idx); @@ -224,9 +226,6 @@ class InstructionQueue /** Returns the number of used entries for a thread. */ unsigned getCount(unsigned tid) { return count[tid]; }; - /** Updates the number of free entries. 
*/ - void updateFreeEntries(int num) { freeEntries += num; } - /** Debug function to print all instructions. */ void printInsts(); @@ -286,15 +285,6 @@ class InstructionQueue } }; - /** - * Struct for an IQ entry. It includes the instruction and an iterator - * to the instruction's spot in the IQ. - */ - struct IQEntry { - DynInstPtr inst; - ListIt iqIt; - }; - typedef std::priority_queue, pqCompare> ReadyInstQueue; @@ -309,7 +299,6 @@ class InstructionQueue * inside of DynInst), when these instructions are woken up only * the sequence number will be available. Thus it is most efficient to be * able to search by the sequence number alone. - * @todo: Maybe change this to a priority queue per thread. */ std::map nonSpecInsts; @@ -324,6 +313,9 @@ class InstructionQueue /** List that contains the age order of the oldest instruction of each * ready queue. Used to select the oldest instruction available * among op classes. + * @todo: Might be better to just move these entries around instead + * of creating new ones every time the position changes due to an + * instruction issuing. Not sure std::list supports this. */ std::list listOrder; @@ -346,6 +338,8 @@ class InstructionQueue */ void moveToYoungerInst(ListOrderIt age_order_it); + DependencyGraph dependGraph; + ////////////////////////////////////// // Various parameters ////////////////////////////////////// @@ -397,57 +391,9 @@ class InstructionQueue bool switchedOut; - ////////////////////////////////// - // Variables needed for squashing - ////////////////////////////////// - /** The sequence number of the squashed instruction. */ InstSeqNum squashedSeqNum[Impl::MaxThreads]; - /** Iterator that points to the last instruction that has been squashed. - * This will not be valid unless the IQ is in the process of squashing. 
- */ - ListIt squashIt[Impl::MaxThreads]; - - /////////////////////////////////// - // Dependency graph stuff - /////////////////////////////////// - - class DependencyEntry - { - public: - DependencyEntry() - : inst(NULL), next(NULL) - { } - - DynInstPtr inst; - //Might want to include data about what arch. register the - //dependence is waiting on. - DependencyEntry *next; - - //This function, and perhaps this whole class, stand out a little - //bit as they don't fit a classification well. I want access - //to the underlying structure of the linked list, yet at - //the same time it feels like this should be something abstracted - //away. So for now it will sit here, within the IQ, until - //a better implementation is decided upon. - // This function probably shouldn't be within the entry... - void insert(DynInstPtr &new_inst); - - void remove(DynInstPtr &inst_to_remove); - - // Debug variable, remove when done testing. - static unsigned mem_alloc_counter; - }; - - /** Array of linked lists. Each linked list is a list of all the - * instructions that depend upon a given register. The actual - * register's index is used to index into the graph; ie all - * instructions in flight that are dependent upon r34 will be - * in the linked list of dependGraph[34]. - */ - DependencyEntry *dependGraph; - /** A cache of the recently woken registers. It is 1 if the register * has been woken up recently, and 0 if the register has been added * to the dependency graph and has not yet received its value. It @@ -456,11 +402,11 @@ class InstructionQueue */ std::vector regScoreboard; - /** Adds an instruction to the dependency graph, as a producer. */ + /** Adds an instruction to the dependency graph, as a consumer. */ bool addToDependents(DynInstPtr &new_inst); - /** Adds an instruction to the dependency graph, as a consumer. */ - void createDependency(DynInstPtr &new_inst); + /** Adds an instruction to the dependency graph, as a producer. 
*/ + void addToProducers(DynInstPtr &new_inst); /** Moves an instruction to the ready queue if it is ready. */ void addIfReady(DynInstPtr &inst); @@ -471,10 +417,6 @@ class InstructionQueue */ int countInsts(); - /** Debugging function to dump out the dependency graph. - */ - void dumpDependGraph(); - /** Debugging function to dump all the list sizes, as well as print * out the list of nonspeculative instructions. Should not be used * in any other capacity, but it has no harmful sideaffects. @@ -490,20 +432,16 @@ class InstructionQueue Stats::Scalar<> iqInstsAdded; /** Stat for number of non-speculative instructions added. */ Stats::Scalar<> iqNonSpecInstsAdded; -// Stats::Scalar<> iqIntInstsAdded; + Stats::Scalar<> iqInstsIssued; /** Stat for number of integer instructions issued. */ Stats::Scalar<> iqIntInstsIssued; -// Stats::Scalar<> iqFloatInstsAdded; /** Stat for number of floating point instructions issued. */ Stats::Scalar<> iqFloatInstsIssued; -// Stats::Scalar<> iqBranchInstsAdded; /** Stat for number of branch instructions issued. */ Stats::Scalar<> iqBranchInstsIssued; -// Stats::Scalar<> iqMemInstsAdded; /** Stat for number of memory instructions issued. */ Stats::Scalar<> iqMemInstsIssued; -// Stats::Scalar<> iqMiscInstsAdded; /** Stat for number of miscellaneous instructions issued. */ Stats::Scalar<> iqMiscInstsIssued; /** Stat for number of squashed instructions that were ready to issue. 
*/ @@ -518,20 +456,20 @@ class InstructionQueue */ Stats::Scalar<> iqSquashedNonSpecRemoved; - Stats::VectorDistribution<> queue_res_dist; - Stats::Distribution<> n_issued_dist; - Stats::VectorDistribution<> issue_delay_dist; + Stats::VectorDistribution<> queueResDist; + Stats::Distribution<> numIssuedDist; + Stats::VectorDistribution<> issueDelayDist; - Stats::Vector<> stat_fu_busy; + Stats::Vector<> statFuBusy; // Stats::Vector<> dist_unissued; - Stats::Vector2d<> stat_issued_inst_type; + Stats::Vector2d<> statIssuedInstType; - Stats::Formula issue_rate; + Stats::Formula issueRate; // Stats::Formula issue_stores; // Stats::Formula issue_op_rate; - Stats::Vector<> fu_busy; //cumulative fu busy + Stats::Vector<> fuBusy; //cumulative fu busy - Stats::Formula fu_busy_rate; + Stats::Formula fuBusyRate; }; #endif //__CPU_O3_INST_QUEUE_HH__ diff --git a/cpu/o3/inst_queue_impl.hh b/cpu/o3/inst_queue_impl.hh index 0d9cc09f3..ed57ac257 100644 --- a/cpu/o3/inst_queue_impl.hh +++ b/cpu/o3/inst_queue_impl.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -26,14 +26,6 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -// Todo: -// Current ordering allows for 0 cycle added-to-scheduled. Could maybe fake -// it; either do in reverse order, or have added instructions put into a -// different ready queue that, in scheduleRreadyInsts(), gets put onto the -// normal ready queue. This would however give only a one cycle delay, -// but probably is more flexible to actually add in a delay parameter than -// just running it backwards. 
- #include #include @@ -49,7 +41,7 @@ InstructionQueue::FUCompletion::FUCompletion(DynInstPtr &_inst, int fu_idx, InstructionQueue *iq_ptr) : Event(&mainEventQueue, Stat_Event_Pri), - inst(_inst), fuIdx(fu_idx), iqPtr(iq_ptr) + inst(_inst), fuIdx(fu_idx), iqPtr(iq_ptr), freeFU(false) { this->setFlags(Event::AutoDelete); } @@ -58,7 +50,7 @@ template void InstructionQueue::FUCompletion::process() { - iqPtr->processFUCompletion(inst, fuIdx); + iqPtr->processFUCompletion(inst, freeFU ? fuIdx : -1); inst = NULL; } @@ -93,14 +85,7 @@ InstructionQueue::InstructionQueue(Params *params) //Create an entry for each physical register within the //dependency graph. - dependGraph = new DependencyEntry[numPhysRegs]; - - // Initialize all the head pointers to point to NULL, and all the - // entries as unready. - for (int i = 0; i < numPhysRegs; ++i) { - dependGraph[i].next = NULL; - dependGraph[i].inst = NULL; - } + dependGraph.resize(numPhysRegs); // Resize the register scoreboard. regScoreboard.resize(numPhysRegs); @@ -165,10 +150,9 @@ InstructionQueue::InstructionQueue(Params *params) template InstructionQueue::~InstructionQueue() { - resetDependencyGraph(); - assert(DependencyEntry::mem_alloc_counter == 0); - - delete [] dependGraph; + dependGraph.reset(); + cprintf("Nodes traversed: %i, removed: %i\n", + dependGraph.nodesTraversed, dependGraph.nodesRemoved); } template @@ -193,8 +177,6 @@ InstructionQueue::regStats() .desc("Number of non-speculative instructions added to the IQ") .prereq(iqNonSpecInstsAdded); -// iqIntInstsAdded; - iqInstsIssued .name(name() + ".iqInstsIssued") .desc("Number of instructions issued") @@ -205,29 +187,21 @@ InstructionQueue::regStats() .desc("Number of integer instructions issued") .prereq(iqIntInstsIssued); -// iqFloatInstsAdded; - iqFloatInstsIssued .name(name() + ".iqFloatInstsIssued") .desc("Number of float instructions issued") .prereq(iqFloatInstsIssued); -// iqBranchInstsAdded; - iqBranchInstsIssued .name(name() + ".iqBranchInstsIssued") 
.desc("Number of branch instructions issued") .prereq(iqBranchInstsIssued); -// iqMemInstsAdded; - iqMemInstsIssued .name(name() + ".iqMemInstsIssued") .desc("Number of memory instructions issued") .prereq(iqMemInstsIssued); -// iqMiscInstsAdded; - iqMiscInstsIssued .name(name() + ".iqMiscInstsIssued") .desc("Number of miscellaneous instructions issued") @@ -255,16 +229,16 @@ InstructionQueue::regStats() .desc("Number of squashed non-spec instructions that were removed") .prereq(iqSquashedNonSpecRemoved); - queue_res_dist + queueResDist .init(Num_OpClasses, 0, 99, 2) .name(name() + ".IQ:residence:") .desc("cycles from dispatch to issue") .flags(total | pdf | cdf ) ; for (int i = 0; i < Num_OpClasses; ++i) { - queue_res_dist.subname(i, opClassStrings[i]); + queueResDist.subname(i, opClassStrings[i]); } - n_issued_dist + numIssuedDist .init(0,totalWidth,1) .name(name() + ".ISSUE:issued_per_cycle") .desc("Number of insts issued each cycle") @@ -281,19 +255,19 @@ InstructionQueue::regStats() dist_unissued.subname(i, unissued_names[i]); } */ - stat_issued_inst_type + statIssuedInstType .init(numThreads,Num_OpClasses) .name(name() + ".ISSUE:FU_type") .desc("Type of FU issued") .flags(total | pdf | dist) ; - stat_issued_inst_type.ysubnames(opClassStrings); + statIssuedInstType.ysubnames(opClassStrings); // // How long did instructions for a particular FU type wait prior to issue // - issue_delay_dist + issueDelayDist .init(Num_OpClasses,0,99,2) .name(name() + ".ISSUE:") .desc("cycles from operands ready to issue") @@ -303,15 +277,15 @@ InstructionQueue::regStats() for (int i=0; inumCycles; + issueRate = iqInstsIssued / cpu->numCycles; /* issue_stores .name(name() + ".ISSUE:stores") @@ -328,29 +302,29 @@ InstructionQueue::regStats() ; issue_op_rate = issued_ops / numCycles; */ - stat_fu_busy + statFuBusy .init(Num_OpClasses) .name(name() + ".ISSUE:fu_full") .desc("attempts to use FU when none available") .flags(pdf | dist) ; for (int i=0; i < Num_OpClasses; ++i) { - 
stat_fu_busy.subname(i, opClassStrings[i]); + statFuBusy.subname(i, opClassStrings[i]); } - fu_busy + fuBusy .init(numThreads) .name(name() + ".ISSUE:fu_busy_cnt") .desc("FU busy when requested") .flags(total) ; - fu_busy_rate + fuBusyRate .name(name() + ".ISSUE:fu_busy_rate") .desc("FU busy rate (busy events/executed inst)") .flags(total) ; - fu_busy_rate = fu_busy / iqInstsIssued; + fuBusyRate = fuBusy / iqInstsIssued; for ( int i=0; i < numThreads; i++) { // Tell mem dependence unit to reg stats as well. @@ -394,35 +368,6 @@ InstructionQueue::resetState() listOrder.clear(); } -template -void -InstructionQueue::resetDependencyGraph() -{ - // Clear the dependency graph - DependencyEntry *curr; - DependencyEntry *prev; - - for (int i = 0; i < numPhysRegs; ++i) { - curr = dependGraph[i].next; - - while (curr) { - DependencyEntry::mem_alloc_counter--; - - prev = curr; - curr = prev->next; - prev->inst = NULL; - - delete prev; - } - - if (dependGraph[i].inst) { - dependGraph[i].inst = NULL; - } - - dependGraph[i].next = NULL; - } -} - template void InstructionQueue::setActiveThreads(list *at_ptr) @@ -454,7 +399,7 @@ void InstructionQueue::switchOut() { resetState(); - resetDependencyGraph(); + dependGraph.reset(); switchedOut = true; for (int i = 0; i < numThreads; ++i) { memDepUnit[i].switchOut(); @@ -562,20 +507,15 @@ InstructionQueue::insert(DynInstPtr &new_inst) // Make sure the instruction is valid assert(new_inst); - DPRINTF(IQ, "Adding instruction PC %#x to the IQ.\n", - new_inst->readPC()); + DPRINTF(IQ, "Adding instruction [sn:%lli] PC %#x to the IQ.\n", + new_inst->seqNum, new_inst->readPC()); - // Check if there are any free entries. Panic if there are none. - // Might want to have this return a fault in the future instead of - // panicing. assert(freeEntries != 0); instList[new_inst->threadNumber].push_back(new_inst); - // Decrease the number of free entries. 
--freeEntries; - //Mark Instruction as in IQ new_inst->setInIQ(); // Look through its source registers (physical regs), and mark any @@ -584,21 +524,16 @@ InstructionQueue::insert(DynInstPtr &new_inst) // Have this instruction set itself as the producer of its destination // register(s). - createDependency(new_inst); + addToProducers(new_inst); - // If it's a memory instruction, add it to the memory dependency - // unit. if (new_inst->isMemRef()) { memDepUnit[new_inst->threadNumber].insert(new_inst); } else { - // If the instruction is ready then add it to the ready list. addIfReady(new_inst); } ++iqInstsAdded; - - //Update Thread IQ Count count[new_inst->threadNumber]++; assert(freeEntries == (numEntries - countInsts())); @@ -611,30 +546,25 @@ InstructionQueue::insertNonSpec(DynInstPtr &new_inst) // @todo: Clean up this code; can do it by setting inst as unable // to issue, then calling normal insert on the inst. - // Make sure the instruction is valid assert(new_inst); nonSpecInsts[new_inst->seqNum] = new_inst; - DPRINTF(IQ, "Adding instruction PC %#x to the IQ.\n", - new_inst->readPC()); + DPRINTF(IQ, "Adding non-speculative instruction [sn:%lli] PC %#x " + "to the IQ.\n", + new_inst->seqNum, new_inst->readPC()); - // Check if there are any free entries. Panic if there are none. - // Might want to have this return a fault in the future instead of - // panicing. assert(freeEntries != 0); instList[new_inst->threadNumber].push_back(new_inst); - // Decrease the number of free entries. --freeEntries; - //Mark Instruction as in IQ new_inst->setInIQ(); // Have this instruction set itself as the producer of its destination // register(s). - createDependency(new_inst); + addToProducers(new_inst); // If it's a memory instruction, add it to the memory dependency // unit. 
@@ -644,7 +574,6 @@ InstructionQueue::insertNonSpec(DynInstPtr &new_inst) ++iqNonSpecInstsAdded; - //Update Thread IQ Count count[new_inst->threadNumber]++; assert(freeEntries == (numEntries - countInsts())); @@ -659,15 +588,6 @@ InstructionQueue::insertBarrier(DynInstPtr &barr_inst) insertNonSpec(barr_inst); } -template -void -InstructionQueue::advanceTail(DynInstPtr &inst) -{ - // Have this instruction set itself as the producer of its destination - // register(s). - createDependency(inst); -} - template void InstructionQueue::addToOrderList(OpClass op_class) @@ -733,8 +653,15 @@ InstructionQueue::processFUCompletion(DynInstPtr &inst, int fu_idx) iewStage->wakeCPU(); - fuPool->freeUnit(fu_idx); + if (fu_idx > -1) + fuPool->freeUnitNextCycle(fu_idx); + // @todo: Ensure that these FU Completions happen at the beginning + // of a cycle, otherwise they could add too many instructions to + // the queue. + // @todo: This could break if there's multiple multi-cycle ops + // finishing on this cycle. Maybe implement something like + // instToCommit in iew_impl.hh. int &size = issueToExecuteQueue->access(0)->size; issueToExecuteQueue->access(0)->insts[size++] = inst; @@ -752,20 +679,6 @@ InstructionQueue::scheduleReadyInsts() IssueStruct *i2e_info = issueToExecuteQueue->access(0); - // Will need to reorder the list if either a queue is not on the list, - // or it has an older instruction than last time. - for (int i = 0; i < Num_OpClasses; ++i) { - if (!readyInsts[i].empty()) { - if (!queueOnList[i]) { - addToOrderList(OpClass(i)); - } else if (readyInsts[i].top()->seqNum < - (*readyIt[i]).oldestInst) { - listOrder.erase(readyIt[i]); - addToOrderList(OpClass(i)); - } - } - } - // Have iterator to head of the list // While I haven't exceeded bandwidth or reached the end of the list, // Try to get a FU that can do what this op needs. 
@@ -779,7 +692,8 @@ InstructionQueue::scheduleReadyInsts() int total_issued = 0; int exec_queue_slot = i2e_info->size; - while (exec_queue_slot < totalWidth && order_it != order_end_it) { + while (exec_queue_slot < totalWidth && total_issued < totalWidth && + order_it != order_end_it) { OpClass op_class = (*order_it).queueType; assert(!readyInsts[op_class].empty()); @@ -805,70 +719,47 @@ InstructionQueue::scheduleReadyInsts() continue; } - int idx = fuPool->getUnit(op_class); - + int idx = -2; + int op_latency = 1; int tid = issuing_inst->threadNumber; - if (idx == -2) { - assert(op_class == No_OpClass); + if (op_class != No_OpClass) { + idx = fuPool->getUnit(op_class); - i2e_info->insts[exec_queue_slot++] = issuing_inst; - i2e_info->size++; - - DPRINTF(IQ, "Thread %i: Issuing instruction PC that needs no FU" - " %#x [sn:%lli]\n", - tid, issuing_inst->readPC(), - issuing_inst->seqNum); - - readyInsts[op_class].pop(); - - if (!readyInsts[op_class].empty()) { - moveToYoungerInst(order_it); - } else { - readyIt[op_class] = listOrder.end(); - queueOnList[op_class] = false; + if (idx > -1) { + op_latency = fuPool->getOpLatency(op_class); } + } - issuing_inst->setIssued(); - ++total_issued; - - if (!issuing_inst->isMemRef()) { - // Memory instructions can not be freed from the IQ until they - // complete. - ++freeEntries; - count[tid]--; - issuing_inst->removeInIQ(); - } else { - memDepUnit[tid].issue(issuing_inst); - } - - listOrder.erase(order_it++); - - stat_issued_inst_type[tid][op_class]++; - } else if (idx != -1) { - int op_latency = fuPool->getOpLatency(op_class); - + if (idx == -2 || idx != -1) { if (op_latency == 1) { i2e_info->insts[exec_queue_slot++] = issuing_inst; i2e_info->size++; - // Add the FU onto the list of FU's to be freed next cycle. - fuPool->freeUnit(idx); + // Add the FU onto the list of FU's to be freed next + // cycle if we used one. 
+ if (idx >= 0) + fuPool->freeUnitNextCycle(idx); } else { int issue_latency = fuPool->getIssueLatency(op_class); + // Generate completion event for the FU + FUCompletion *execution = new FUCompletion(issuing_inst, + idx, this); + execution->schedule(curTick + cpu->cycles(issue_latency - 1)); + + // @todo: Enforce that issue_latency == 1 or op_latency if (issue_latency > 1) { - // Generate completion event for the FU - FUCompletion *execution = new FUCompletion(issuing_inst, - idx, this); - - execution->schedule(curTick + cpu->cycles(issue_latency - 1)); + execution->setFreeFU(); } else { - i2e_info->insts[exec_queue_slot++] = issuing_inst; - i2e_info->size++; + // @todo: Not sure I'm accounting for the + // multi-cycle op in a pipelined FU properly, or + // the number of instructions issued in one cycle. +// i2e_info->insts[exec_queue_slot++] = issuing_inst; +// i2e_info->size++; // Add the FU onto the list of FU's to be freed next cycle. - fuPool->freeUnit(idx); + fuPool->freeUnitNextCycle(idx); } } @@ -900,15 +791,16 @@ InstructionQueue::scheduleReadyInsts() } listOrder.erase(order_it++); - stat_issued_inst_type[tid][op_class]++; + statIssuedInstType[tid][op_class]++; } else { - stat_fu_busy[op_class]++; - fu_busy[tid]++; + statFuBusy[op_class]++; + fuBusy[tid]++; ++order_it; } } - n_issued_dist.sample(total_issued); + numIssuedDist.sample(total_issued); + iqInstsIssued+= total_issued; if (total_issued) { cpu->activityThisCycle(); @@ -930,10 +822,8 @@ InstructionQueue::scheduleNonSpec(const InstSeqNum &inst) unsigned tid = (*inst_it).second->threadNumber; - // Mark this instruction as ready to issue. (*inst_it).second->setCanIssue(); - // Now schedule the instruction. 
if (!(*inst_it).second->isMemRef()) { addIfReady((*inst_it).second); } else { @@ -949,7 +839,6 @@ template void InstructionQueue::commit(const InstSeqNum &inst, unsigned tid) { - /*Need to go through each thread??*/ DPRINTF(IQ, "[tid:%i]: Committing instructions older than [sn:%i]\n", tid,inst); @@ -973,18 +862,13 @@ InstructionQueue::wakeDependents(DynInstPtr &completed_inst) DPRINTF(IQ, "Waking dependents of completed instruction.\n"); assert(!completed_inst->isSquashed()); - // Look at the physical destination register of the DynInst - // and look it up on the dependency graph. Then mark as ready - // any instructions within the instruction queue. - DependencyEntry *curr; - DependencyEntry *prev; // Tell the memory dependence unit to wake any dependents on this // instruction if it is a memory instruction. Also complete the memory - // instruction at this point since we know it executed fine. - // @todo: Might want to rename "completeMemInst" to - // something that indicates that it won't need to be replayed, and call - // this earlier. Might not be a big deal. + // instruction at this point since we know it executed without issues. + // @todo: Might want to rename "completeMemInst" to something that + // indicates that it won't need to be replayed, and call this + // earlier. Might not be a big deal. if (completed_inst->isMemRef()) { memDepUnit[completed_inst->threadNumber].wakeDependents(completed_inst); completeMemInst(completed_inst); @@ -1010,39 +894,31 @@ InstructionQueue::wakeDependents(DynInstPtr &completed_inst) DPRINTF(IQ, "Waking any dependents on register %i.\n", (int) dest_reg); - //Maybe abstract this part into a function. - //Go through the dependency chain, marking the registers as ready - //within the waiting instructions. + //Go through the dependency chain, marking the registers as + //ready within the waiting instructions. 
+ DynInstPtr dep_inst = dependGraph.pop(dest_reg); - curr = dependGraph[dest_reg].next; - - while (curr) { + while (dep_inst) { DPRINTF(IQ, "Waking up a dependent instruction, PC%#x.\n", - curr->inst->readPC()); + dep_inst->readPC()); // Might want to give more information to the instruction - // so that it knows which of its source registers is ready. - // However that would mean that the dependency graph entries - // would need to hold the src_reg_idx. - curr->inst->markSrcRegReady(); + // so that it knows which of its source registers is + // ready. However that would mean that the dependency + // graph entries would need to hold the src_reg_idx. + dep_inst->markSrcRegReady(); - addIfReady(curr->inst); + addIfReady(dep_inst); - DependencyEntry::mem_alloc_counter--; - - prev = curr; - curr = prev->next; - prev->inst = NULL; + dep_inst = dependGraph.pop(dest_reg); ++dependents; - - delete prev; } - // Reset the head node now that all of its dependents have been woken - // up. - dependGraph[dest_reg].next = NULL; - dependGraph[dest_reg].inst = NULL; + // Reset the head node now that all of its dependents have + // been woken up. + assert(dependGraph.empty(dest_reg)); + dependGraph.clearInst(dest_reg); // Mark the scoreboard as having that register ready. regScoreboard[dest_reg] = true; @@ -1058,6 +934,16 @@ InstructionQueue::addReadyMemInst(DynInstPtr &ready_inst) readyInsts[op_class].push(ready_inst); + // Will need to reorder the list if either a queue is not on the list, + // or it has an older instruction than last time. 
+ if (!queueOnList[op_class]) { + addToOrderList(op_class); + } else if (readyInsts[op_class].top()->seqNum < + (*readyIt[op_class]).oldestInst) { + listOrder.erase(readyIt[op_class]); + addToOrderList(op_class); + } + DPRINTF(IQ, "Instruction is ready to issue, putting it onto " "the ready list, PC %#x opclass:%i [sn:%lli].\n", ready_inst->readPC(), op_class, ready_inst->seqNum); @@ -1114,10 +1000,6 @@ InstructionQueue::squash(unsigned tid) // time buffer. squashedSeqNum[tid] = fromCommit->commitInfo[tid].doneSeqNum; - // Setup the squash iterator to point to the tail. - squashIt[tid] = instList[tid].end(); - --squashIt[tid]; - // Call doSquash if there are insts in the IQ if (count[tid] > 0) { doSquash(tid); @@ -1131,24 +1013,25 @@ template void InstructionQueue::doSquash(unsigned tid) { - // Make sure the squashed sequence number is valid. -// assert(squashedSeqNum[tid] != 0); + // Start at the tail. + ListIt squash_it = instList[tid].end(); + --squash_it; DPRINTF(IQ, "[tid:%i]: Squashing until sequence number %i!\n", tid, squashedSeqNum[tid]); // Squash any instructions younger than the squashed sequence number // given. - while (squashIt[tid] != instList[tid].end() && - (*squashIt[tid])->seqNum > squashedSeqNum[tid]) { + while (squash_it != instList[tid].end() && + (*squash_it)->seqNum > squashedSeqNum[tid]) { - DynInstPtr squashed_inst = (*squashIt[tid]); + DynInstPtr squashed_inst = (*squash_it); // Only handle the instruction if it actually is in the IQ and // hasn't already been squashed in the IQ. if (squashed_inst->threadNumber != tid || squashed_inst->isSquashedInIQ()) { - --squashIt[tid]; + --squash_it; continue; } @@ -1168,27 +1051,23 @@ InstructionQueue::doSquash(unsigned tid) PhysRegIndex src_reg = squashed_inst->renamedSrcRegIdx(src_reg_idx); - // Only remove it from the dependency graph if it was - // placed there in the first place. 
- // HACK: This assumes that instructions woken up from the - // dependency chain aren't informed that a specific src - // register has become ready. This may not always be true - // in the future. - // Instead of doing a linked list traversal, we can just - // remove these squashed instructions either at issue time, - // or when the register is overwritten. The only downside - // to this is it leaves more room for error. + // Only remove it from the dependency graph if it + // was placed there in the first place. + + // Instead of doing a linked list traversal, we + // can just remove these squashed instructions + // either at issue time, or when the register is + // overwritten. The only downside to this is it + // leaves more room for error. if (!squashed_inst->isReadySrcRegIdx(src_reg_idx) && src_reg < numPhysRegs) { - dependGraph[src_reg].remove(squashed_inst); + dependGraph.remove(src_reg, squashed_inst); } ++iqSquashedOperandsExamined; } - - // Might want to remove producers as well. } else { NonSpecMapIt ns_inst_it = nonSpecInsts.find(squashed_inst->seqNum); @@ -1217,74 +1096,16 @@ InstructionQueue::doSquash(unsigned tid) ++freeEntries; - if (numThreads > 1) { - DPRINTF(IQ, "[tid:%i]: Instruction [sn:%lli] PC %#x " - "squashed.\n", - tid, squashed_inst->seqNum, squashed_inst->readPC()); - } else { - DPRINTF(IQ, "Instruction [sn:%lli] PC %#x squashed.\n", - squashed_inst->seqNum, squashed_inst->readPC()); - } + DPRINTF(IQ, "[tid:%i]: Instruction [sn:%lli] PC %#x " + "squashed.\n", + tid, squashed_inst->seqNum, squashed_inst->readPC()); } - instList[tid].erase(squashIt[tid]--); + instList[tid].erase(squash_it--); ++iqSquashedInstsExamined; } } -template -void -InstructionQueue::DependencyEntry::insert(DynInstPtr &new_inst) -{ - //Add this new, dependent instruction at the head of the dependency - //chain. - - // First create the entry that will be added to the head of the - // dependency chain. 
- DependencyEntry *new_entry = new DependencyEntry; - new_entry->next = this->next; - new_entry->inst = new_inst; - - // Then actually add it to the chain. - this->next = new_entry; - - ++mem_alloc_counter; -} - -template -void -InstructionQueue::DependencyEntry::remove(DynInstPtr &inst_to_remove) -{ - DependencyEntry *prev = this; - DependencyEntry *curr = this->next; - - // Make sure curr isn't NULL. Because this instruction is being - // removed from a dependency list, it must have been placed there at - // an earlier time. The dependency chain should not be empty, - // unless the instruction dependent upon it is already ready. - if (curr == NULL) { - return; - } - - // Find the instruction to remove within the dependency linked list. - while (curr->inst != inst_to_remove) { - prev = curr; - curr = curr->next; - - assert(curr != NULL); - } - - // Now remove this instruction from the list. - prev->next = curr->next; - - --mem_alloc_counter; - - // Could push this off to the destructor of DependencyEntry - curr->inst = NULL; - - delete curr; -} - template bool InstructionQueue::addToDependents(DynInstPtr &new_inst) @@ -1313,7 +1134,7 @@ InstructionQueue::addToDependents(DynInstPtr &new_inst) "is being added to the dependency chain.\n", new_inst->readPC(), src_reg); - dependGraph[src_reg].insert(new_inst); + dependGraph.insert(src_reg, new_inst); // Change the return value to indicate that something // was added to the dependency graph. @@ -1323,7 +1144,7 @@ InstructionQueue::addToDependents(DynInstPtr &new_inst) "became ready before it reached the IQ.\n", new_inst->readPC(), src_reg); // Mark a register ready within the instruction. 
- new_inst->markSrcRegReady(); + new_inst->markSrcRegReady(src_reg_idx); } } } @@ -1333,12 +1154,12 @@ InstructionQueue::addToDependents(DynInstPtr &new_inst) template void -InstructionQueue::createDependency(DynInstPtr &new_inst) +InstructionQueue::addToProducers(DynInstPtr &new_inst) { - //Actually nothing really needs to be marked when an - //instruction becomes the producer of a register's value, - //but for convenience a ptr to the producing instruction will - //be placed in the head node of the dependency links. + // Nothing really needs to be marked when an instruction becomes + // the producer of a register's value, but for convenience a ptr + // to the producing instruction will be placed in the head node of + // the dependency links. int8_t total_dest_regs = new_inst->numDestRegs(); for (int dest_reg_idx = 0; @@ -1355,12 +1176,12 @@ InstructionQueue::createDependency(DynInstPtr &new_inst) continue; } - if (dependGraph[dest_reg].next) { - dumpDependGraph(); + if (!dependGraph.empty(dest_reg)) { + dependGraph.dump(); panic("Dependency graph %i not empty!", dest_reg); } - dependGraph[dest_reg].inst = new_inst; + dependGraph.setInst(dest_reg, new_inst); // Mark the scoreboard to say it's not yet ready. regScoreboard[dest_reg] = false; @@ -1371,7 +1192,7 @@ template void InstructionQueue::addIfReady(DynInstPtr &inst) { - //If the instruction now has all of its source registers + // If the instruction now has all of its source registers // available, then add it to the list of ready instructions. if (inst->readyToIssue()) { @@ -1382,7 +1203,6 @@ InstructionQueue::addIfReady(DynInstPtr &inst) // Message to the mem dependence unit that this instruction has // its registers ready. 
- memDepUnit[inst->threadNumber].regsReady(inst); return; @@ -1395,6 +1215,16 @@ InstructionQueue::addIfReady(DynInstPtr &inst) inst->readPC(), op_class, inst->seqNum); readyInsts[op_class].push(inst); + + // Will need to reorder the list if either a queue is not on the list, + // or it has an older instruction than last time. + if (!queueOnList[op_class]) { + addToOrderList(op_class); + } else if (readyInsts[op_class].top()->seqNum < + (*readyIt[op_class]).oldestInst) { + listOrder.erase(readyIt[op_class]); + addToOrderList(op_class); + } } } @@ -1434,34 +1264,6 @@ InstructionQueue::countInsts() #endif } -template -void -InstructionQueue::dumpDependGraph() -{ - DependencyEntry *curr; - - for (int i = 0; i < numPhysRegs; ++i) - { - curr = &dependGraph[i]; - - if (curr->inst) { - cprintf("dependGraph[%i]: producer: %#x [sn:%lli] consumer: ", - i, curr->inst->readPC(), curr->inst->seqNum); - } else { - cprintf("dependGraph[%i]: No producer. consumer: ", i); - } - - while (curr->next != NULL) { - curr = curr->next; - - cprintf("%#x [sn:%lli] ", - curr->inst->readPC(), curr->inst->seqNum); - } - - cprintf("\n"); - } -} - template void InstructionQueue::dumpLists() @@ -1524,8 +1326,8 @@ InstructionQueue::dumpInsts() cprintf("Count:%i\n", valid_num); } else if ((*inst_list_it)->isMemRef() && !(*inst_list_it)->memOpDone) { - // Loads that have not been marked as executed still count - // towards the total instructions. + // Loads that have not been marked as executed + // still count towards the total instructions. ++valid_num; cprintf("Count:%i\n", valid_num); } From fda6ddbffdfb2dfecf233750c080191141450276 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Fri, 19 May 2006 15:45:06 -0400 Subject: [PATCH 37/50] Rename function to be more expressive. 
--HG-- extra : convert_revision : 0c01b6d5309e2d09f03631740c9b0c8619ea26c4 --- cpu/o3/fu_pool.cc | 2 +- cpu/o3/fu_pool.hh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpu/o3/fu_pool.cc b/cpu/o3/fu_pool.cc index cb7a15061..fb2b5c00d 100644 --- a/cpu/o3/fu_pool.cc +++ b/cpu/o3/fu_pool.cc @@ -189,7 +189,7 @@ FUPool::getUnit(OpClass capability) } void -FUPool::freeUnit(int fu_idx) +FUPool::freeUnitNextCycle(int fu_idx) { assert(unitBusy[fu_idx]); unitsToBeFreed.push_back(fu_idx); diff --git a/cpu/o3/fu_pool.hh b/cpu/o3/fu_pool.hh index 7df5ad5f3..da6fdc802 100644 --- a/cpu/o3/fu_pool.hh +++ b/cpu/o3/fu_pool.hh @@ -134,7 +134,7 @@ class FUPool : public SimObject int getUnit(OpClass capability); /** Frees a FU at the end of this cycle. */ - void freeUnit(int fu_idx); + void freeUnitNextCycle(int fu_idx); /** Frees all FUs on the list. */ void processFreeUnits(); From 1a6f21b8d23494752cdc9d3a8d1c1a2adfd85ccf Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Fri, 19 May 2006 15:47:55 -0400 Subject: [PATCH 38/50] Remove sat_counter.cc and put its code into sat_counter.hh. cpu/SConscript: Remove sat_counter.cc and push its functions into the .hh file (all functions were 3 or less lines). cpu/o3/sat_counter.hh: Incorporate .cc code into this file. 
--HG-- extra : convert_revision : d75b1319292b00b00af1ce377cc0215fd06e6916 --- cpu/SConscript | 1 - cpu/o3/sat_counter.cc | 55 ------------------------------------------- cpu/o3/sat_counter.hh | 22 +++++++++++++---- 3 files changed, 17 insertions(+), 61 deletions(-) delete mode 100644 cpu/o3/sat_counter.cc diff --git a/cpu/SConscript b/cpu/SConscript index 5d727bd25..3840b9d41 100644 --- a/cpu/SConscript +++ b/cpu/SConscript @@ -125,7 +125,6 @@ if 'AlphaFullCPU' in env['CPU_MODELS']: o3/rename.cc o3/rename_map.cc o3/rob.cc - o3/sat_counter.cc o3/scoreboard.cc o3/store_set.cc o3/tournament_pred.cc diff --git a/cpu/o3/sat_counter.cc b/cpu/o3/sat_counter.cc deleted file mode 100644 index b481b4ad2..000000000 --- a/cpu/o3/sat_counter.cc +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2005 The Regents of The University of Michigan - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "base/misc.hh" -#include "cpu/o3/sat_counter.hh" - -SatCounter::SatCounter() - : initialVal(0), counter(0) -{ -} - -SatCounter::SatCounter(unsigned bits) - : initialVal(0), maxVal((1 << bits) - 1), counter(0) -{ -} - -SatCounter::SatCounter(unsigned bits, uint8_t initial_val) - : initialVal(initialVal), maxVal((1 << bits) - 1), counter(initial_val) -{ - // Check to make sure initial value doesn't exceed the max counter value. - if (initial_val > maxVal) { - fatal("BP: Initial counter value exceeds max size."); - } -} - -void -SatCounter::setBits(unsigned bits) -{ - maxVal = (1 << bits) - 1; -} diff --git a/cpu/o3/sat_counter.hh b/cpu/o3/sat_counter.hh index 1d20a8a8f..d01fd93ce 100644 --- a/cpu/o3/sat_counter.hh +++ b/cpu/o3/sat_counter.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005 The Regents of The University of Michigan + * Copyright (c) 2005-2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -44,25 +44,37 @@ class SatCounter /** * Constructor for the counter. */ - SatCounter(); + SatCounter() + : initialVal(0), counter(0) + { } /** * Constructor for the counter. * @param bits How many bits the counter will have. */ - SatCounter(unsigned bits); + SatCounter(unsigned bits) + : initialVal(0), maxVal((1 << bits) - 1), counter(0) + { } /** * Constructor for the counter. * @param bits How many bits the counter will have. 
* @param initial_val Starting value for each counter. */ - SatCounter(unsigned bits, uint8_t initial_val); + SatCounter(unsigned bits, uint8_t initial_val) + : initialVal(initialVal), maxVal((1 << bits) - 1), counter(initial_val) + { + // Check to make sure initial value doesn't exceed the max + // counter value. + if (initial_val > maxVal) { + fatal("BP: Initial counter value exceeds max size."); + } + } /** * Sets the number of bits. */ - void setBits(unsigned bits); + void setBits(unsigned bits) { maxVal = (1 << bits) - 1; } void reset() { counter = initialVal; } From e3d5588ca70c88318c1e41e438102034c92c561e Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Fri, 19 May 2006 15:53:17 -0400 Subject: [PATCH 39/50] O3 code update/cleanup. cpu/o3/commit_impl.hh: O3 code update/cleanup. Fetch fault code no longer needed (see previous checkin). --HG-- extra : convert_revision : f602e7f978e19b8900dce482f38f9c7a195e94da --- cpu/o3/2bit_local_pred.cc | 2 +- cpu/o3/2bit_local_pred.hh | 2 +- cpu/o3/alpha_cpu.hh | 18 +-- cpu/o3/bpred_unit.cc | 2 +- cpu/o3/bpred_unit.hh | 7 +- cpu/o3/bpred_unit_impl.hh | 6 +- cpu/o3/comm.hh | 6 +- cpu/o3/commit.hh | 5 +- cpu/o3/commit_impl.hh | 59 +------ cpu/o3/decode.hh | 12 +- cpu/o3/decode_impl.hh | 14 +- cpu/o3/fetch.hh | 31 ++-- cpu/o3/fetch_impl.hh | 29 +--- cpu/o3/lsq.hh | 65 +++++--- cpu/o3/lsq_impl.hh | 138 +--------------- cpu/o3/lsq_unit.hh | 220 +++++++------------------ cpu/o3/lsq_unit_impl.hh | 315 ++++++++++++------------------------ cpu/o3/mem_dep_unit.hh | 9 +- cpu/o3/mem_dep_unit_impl.hh | 20 +-- cpu/o3/rename.hh | 32 ++-- cpu/o3/rename_impl.hh | 35 ++-- cpu/o3/rename_map.cc | 81 ++-------- cpu/o3/rename_map.hh | 5 +- cpu/o3/rob.hh | 34 ++-- cpu/o3/rob_impl.hh | 38 ++--- cpu/o3/scoreboard.cc | 1 + cpu/o3/store_set.cc | 7 +- cpu/o3/thread_state.hh | 95 +++++------ 28 files changed, 381 insertions(+), 907 deletions(-) diff --git a/cpu/o3/2bit_local_pred.cc b/cpu/o3/2bit_local_pred.cc index eab98531d..c3fb2fdb8 100644 --- 
a/cpu/o3/2bit_local_pred.cc +++ b/cpu/o3/2bit_local_pred.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/cpu/o3/2bit_local_pred.hh b/cpu/o3/2bit_local_pred.hh index 0dfe53819..cd65978ca 100644 --- a/cpu/o3/2bit_local_pred.hh +++ b/cpu/o3/2bit_local_pred.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/cpu/o3/alpha_cpu.hh b/cpu/o3/alpha_cpu.hh index f70793aaa..78ad5f7d8 100644 --- a/cpu/o3/alpha_cpu.hh +++ b/cpu/o3/alpha_cpu.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -87,7 +87,8 @@ class AlphaFullCPU : public FullO3CPU virtual Status status() const { return thread->status(); } - virtual void setStatus(Status new_status) { thread->setStatus(new_status); } + virtual void setStatus(Status new_status) + { thread->setStatus(new_status); } /// Set the status to Active. Optional delay indicates number of /// cycles to wait before beginning execution. @@ -168,12 +169,15 @@ class AlphaFullCPU : public FullO3CPU virtual Fault setMiscRegWithEffect(int misc_reg, const MiscReg &val); // @todo: Figure out where these store cond failures should go. 
- virtual unsigned readStCondFailures() { return thread->storeCondFailures; } + virtual unsigned readStCondFailures() + { return thread->storeCondFailures; } - virtual void setStCondFailures(unsigned sc_failures) { thread->storeCondFailures = sc_failures; } + virtual void setStCondFailures(unsigned sc_failures) + { thread->storeCondFailures = sc_failures; } #if FULL_SYSTEM - virtual bool inPalMode() { return TheISA::PcPAL(cpu->readPC(thread->tid)); } + virtual bool inPalMode() + { return TheISA::PcPAL(cpu->readPC(thread->tid)); } #endif // Only really makes sense for old CPU model. Lots of code @@ -194,10 +198,6 @@ class AlphaFullCPU : public FullO3CPU #endif }; -// friend class AlphaXC; - -// std::vector xcProxies; - #if FULL_SYSTEM /** ITB pointer. */ AlphaITB *itb; diff --git a/cpu/o3/bpred_unit.cc b/cpu/o3/bpred_unit.cc index a78dcf463..92344111f 100644 --- a/cpu/o3/bpred_unit.cc +++ b/cpu/o3/bpred_unit.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/cpu/o3/bpred_unit.hh b/cpu/o3/bpred_unit.hh index ee7ffc183..b7814b2e9 100644 --- a/cpu/o3/bpred_unit.hh +++ b/cpu/o3/bpred_unit.hh @@ -43,12 +43,7 @@ /** * Basically a wrapper class to hold both the branch predictor - * and the BTB. Right now I'm unsure of the implementation; it would - * be nicer to have something closer to the CPUPolicy or the Impl where - * this is just typedefs, but it forces the upper level stages to be - * aware of the constructors of the BP and the BTB. The nicer thing - * to do is have this templated on the Impl, accept the usual Params - * object, and be able to call the constructors on the BP and BTB. + * and the BTB. 
*/ template class TwobitBPredUnit diff --git a/cpu/o3/bpred_unit_impl.hh b/cpu/o3/bpred_unit_impl.hh index d20b31e55..c37df606b 100644 --- a/cpu/o3/bpred_unit_impl.hh +++ b/cpu/o3/bpred_unit_impl.hh @@ -26,13 +26,13 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include +#include + #include "base/trace.hh" #include "base/traceflags.hh" #include "cpu/o3/bpred_unit.hh" -#include -#include - using namespace std; template diff --git a/cpu/o3/comm.hh b/cpu/o3/comm.hh index 1a8f394ca..c36c58d3d 100644 --- a/cpu/o3/comm.hh +++ b/cpu/o3/comm.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -169,10 +169,6 @@ struct TimeBufStruct { bool commitInsts; InstSeqNum squashSeqNum; - // Extra bit of information so that the LDSTQ only updates when it - // needs to. - bool commitIsLoad; - // Communication specifically to the IQ to tell the IQ that it can // schedule a non-speculative instruction. InstSeqNum nonSpecSeqNum; diff --git a/cpu/o3/commit.hh b/cpu/o3/commit.hh index 73eccd2b0..66abf8dc6 100644 --- a/cpu/o3/commit.hh +++ b/cpu/o3/commit.hh @@ -30,10 +30,10 @@ #define __CPU_O3_COMMIT_HH__ #include "arch/faults.hh" -#include "cpu/inst_seq.hh" #include "base/statistics.hh" #include "base/timebuf.hh" #include "cpu/exetrace.hh" +#include "cpu/inst_seq.hh" #include "mem/memory_interface.hh" template @@ -59,8 +59,7 @@ class O3ThreadState; * squashing instruction's sequence number, and only broadcasting a * redirect if it corresponds to an older instruction. Commit also * supports multiple cycle squashing, to model a ROB that can only - * remove a certain number of instructions per cycle. Eventually traps - * and interrupts will most likely be handled here as well. + * remove a certain number of instructions per cycle. 
*/ template class DefaultCommit diff --git a/cpu/o3/commit_impl.hh b/cpu/o3/commit_impl.hh index 170f5b01f..346a8bc1c 100644 --- a/cpu/o3/commit_impl.hh +++ b/cpu/o3/commit_impl.hh @@ -27,12 +27,7 @@ */ #include -#include -#include -#include -#include -#include -#include +#include #include "base/loader/symtab.hh" #include "base/timebuf.hh" @@ -835,58 +830,6 @@ DefaultCommit::commitInsts() unsigned num_committed = 0; DynInstPtr head_inst; -#if FULL_SYSTEM - // Not the best way to check if the front end is empty, but it should - // work. - // @todo: Try to avoid directly accessing fetch. - if (commitStatus[0] == FetchTrapPending && rob->isEmpty()) { - DPRINTF(Commit, "Fault from fetch is pending.\n"); - - fetchTrapWait++; - if (fetchTrapWait > 10000000) { - panic("Fetch trap has been pending for a long time!"); - } - if (fetchFaultTick > curTick) { - DPRINTF(Commit, "Not enough cycles since fault, fault will " - "happen on %lli\n", - fetchFaultTick); - cpu->activityThisCycle(); - return; - } else if (iewStage->hasStoresToWB()) { - DPRINTF(Commit, "IEW still has stores to WB. Waiting until " - "they are completed. fetchTrapWait:%i\n", - fetchTrapWait); - cpu->activityThisCycle(); - return; - } else if (cpu->inPalMode(readPC())) { - DPRINTF(Commit, "In pal mode right now. fetchTrapWait:%i\n", - fetchTrapWait); - return; - } else if (fetchStage->getYoungestSN() > youngestSeqNum[0]) { - DPRINTF(Commit, "Waiting for front end to drain. fetchTrapWait:%i\n", - fetchTrapWait); - return; - } - fetchTrapWait = 0; - DPRINTF(Commit, "ROB is empty, handling fetch trap.\n"); - - assert(!thread[0]->inSyscall); - - thread[0]->inSyscall = true; - - // Consider holding onto the trap and waiting until the trap event - // happens for this to be executed. - cpu->trap(fetchFault, 0); - - // Exit state update mode to avoid accidental updating. 
- thread[0]->inSyscall = false; - - commitStatus[0] = TrapPending; - // Set it up so that we squash next cycle - trapSquash[0] = true; - return; - } -#endif // Commit as many instructions as possible until the commit bandwidth // limit is reached, or it becomes impossible to commit any more. diff --git a/cpu/o3/decode.hh b/cpu/o3/decode.hh index 3f3f68247..3035b3387 100644 --- a/cpu/o3/decode.hh +++ b/cpu/o3/decode.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -35,11 +35,11 @@ #include "base/timebuf.hh" /** - * DefaultDecode class handles both single threaded and SMT decode. Its width is - * specified by the parameters; each cycles it tries to decode that many - * instructions. Because instructions are actually decoded when the StaticInst - * is created, this stage does not do much other than check any PC-relative - * branches. + * DefaultDecode class handles both single threaded and SMT + * decode. Its width is specified by the parameters; each cycles it + * tries to decode that many instructions. Because instructions are + * actually decoded when the StaticInst is created, this stage does + * not do much other than check any PC-relative branches. */ template class DefaultDecode diff --git a/cpu/o3/decode_impl.hh b/cpu/o3/decode_impl.hh index a419a8932..2ed7ec6fc 100644 --- a/cpu/o3/decode_impl.hh +++ b/cpu/o3/decode_impl.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -39,7 +39,6 @@ DefaultDecode::DefaultDecode(Params *params) decodeWidth(params->decodeWidth), numThreads(params->numberOfThreads) { - DPRINTF(Decode, "decodeWidth=%i.\n", decodeWidth); _status = Inactive; for (int i = 0; i < numThreads; ++i) { @@ -249,8 +248,6 @@ template bool DefaultDecode::unblock(unsigned tid) { - DPRINTF(Decode, "[tid:%u]: Trying to unblock.\n", tid); - // Decode is done unblocking only if the skid buffer is empty. if (skidBuffer[tid].empty()) { DPRINTF(Decode, "[tid:%u]: Done unblocking.\n", tid); @@ -261,6 +258,8 @@ DefaultDecode::unblock(unsigned tid) return true; } + DPRINTF(Decode, "[tid:%u]: Currently unblocking.\n", tid); + return false; } @@ -318,6 +317,7 @@ DefaultDecode::squash(unsigned tid) // In syscall emulation, we can have both a block and a squash due // to a syscall in the same cycle. This would cause both signals to // be high. This shouldn't happen in full system. + // @todo: Determine if this still happens. 
if (toFetch->decodeBlock[tid]) { toFetch->decodeBlock[tid] = 0; } else { @@ -372,7 +372,7 @@ DefaultDecode::skidInsert(unsigned tid) skidBuffer[tid].push(inst); } - // Eventually need to enforce this by not letting a thread + // @todo: Eventually need to enforce this by not letting a thread // fetch past its skidbuffer assert(skidBuffer[tid].size() <= skidBufferMax); } @@ -436,10 +436,10 @@ void DefaultDecode::sortInsts() { int insts_from_fetch = fromFetch->size; - +#ifdef DEBUG for (int i=0; i < numThreads; i++) assert(insts[i].empty()); - +#endif for (int i = 0; i < insts_from_fetch; ++i) { insts[fromFetch->insts[i]->threadNumber].push(fromFetch->insts[i]); } diff --git a/cpu/o3/fetch.hh b/cpu/o3/fetch.hh index b03d4afe3..3fcfdc3a1 100644 --- a/cpu/o3/fetch.hh +++ b/cpu/o3/fetch.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -38,12 +38,12 @@ class Sampler; /** - * DefaultFetch class handles both single threaded and SMT fetch. Its width is - * specified by the parameters; each cycle it tries to fetch that many - * instructions. It supports using a branch predictor to predict direction and - * targets. - * It supports the idling functionalitiy of the CPU by indicating to the CPU - * when it is active and inactive. + * DefaultFetch class handles both single threaded and SMT fetch. Its + * width is specified by the parameters; each cycle it tries to fetch + * that many instructions. It supports using a branch predictor to + * predict direction and targets. + * It supports the idling functionalitiy of the CPU by indicating to + * the CPU when it is active and inactive. */ template class DefaultFetch @@ -66,8 +66,8 @@ class DefaultFetch typedef TheISA::ExtMachInst ExtMachInst; public: - /** Overall fetch status. 
Used to determine if the CPU can deschedule itsef - * due to a lack of activity. + /** Overall fetch status. Used to determine if the CPU can + * deschedule itsef due to a lack of activity. */ enum FetchStatus { Active, @@ -174,13 +174,13 @@ class DefaultFetch void wakeFromQuiesce(); private: - /** Changes the status of this stage to active, and indicates this to the - * CPU. + /** Changes the status of this stage to active, and indicates this + * to the CPU. */ inline void switchToActive(); - /** Changes the status of this stage to inactive, and indicates this to the - * CPU. + /** Changes the status of this stage to inactive, and indicates + * this to the CPU. */ inline void switchToInactive(); @@ -373,11 +373,6 @@ class DefaultFetch bool switchedOut; - public: - InstSeqNum &getYoungestSN() { return youngestSN; } - private: - InstSeqNum youngestSN; - #if !FULL_SYSTEM /** Page table pointer. */ // PageTable *pTable; diff --git a/cpu/o3/fetch_impl.hh b/cpu/o3/fetch_impl.hh index 523719945..1c5e508f6 100644 --- a/cpu/o3/fetch_impl.hh +++ b/cpu/o3/fetch_impl.hh @@ -938,10 +938,6 @@ DefaultFetch::fetch(bool &status_change) DPRINTF(Fetch, "[tid:%i]: Adding instructions to queue to " "decode.\n",tid); - ////////////////////////// - // Fetch first instruction - ////////////////////////// - // Need to keep track of whether or not a predicted branch // ended this fetch block. bool predicted_branch = false; @@ -1004,7 +1000,8 @@ DefaultFetch::fetch(bool &status_change) fetch_PC = next_PC; if (instruction->isQuiesce()) { - warn("%lli: Quiesce instruction encountered, halting fetch!", curTick); + warn("%lli: Quiesce instruction encountered, halting fetch!", + curTick); fetchStatus[tid] = QuiescePending; ++numInst; status_change = true; @@ -1022,24 +1019,20 @@ DefaultFetch::fetch(bool &status_change) // Now that fetching is completed, update the PC to signify what the next // cycle will be. 
if (fault == NoFault) { - DPRINTF(Fetch, "[tid:%i]: Setting PC to %08p.\n",tid, next_PC); - PC[tid] = next_PC; nextPC[tid] = next_PC + instSize; } else { - // If the issue was an icache miss, then we can just return and - // wait until it is handled. + // We shouldn't be in an icache miss and also have a fault (an ITB + // miss) if (fetchStatus[tid] == IcacheMissStall) { panic("Fetch should have exited prior to this!"); } - // Handle the fault. - // This stage will not be able to continue until all the ROB - // slots are empty, at which point the fault can be handled. - // The only other way it can wake up is if a squash comes along - // and changes the PC. + // Send the fault to commit. This thread will not do anything + // until commit handles the fault. The only other way it can + // wake up is if a squash comes along and changes the PC. #if FULL_SYSTEM assert(numInst != fetchWidth); // Get a sequence number. @@ -1067,20 +1060,12 @@ DefaultFetch::fetch(bool &status_change) toDecode->insts[numInst] = instruction; toDecode->size++; - // Tell the commit stage the fault we had. -// toDecode->fetchFault = fault; -// toDecode->fetchFaultSN = cpu->globalSeqNum; - DPRINTF(Fetch, "[tid:%i]: Blocked, need to handle the trap.\n",tid); fetchStatus[tid] = TrapPending; status_change = true; warn("%lli fault (%d) detected @ PC %08p", curTick, fault, PC[tid]); -// cpu->trap(fault); - // Send a signal to the ROB indicating that there's a trap from the - // fetch stage that needs to be handled. Need to indicate that - // there's a fault, and the fault type. #else // !FULL_SYSTEM fatal("fault (%d) detected @ PC %08p", fault, PC[tid]); #endif // FULL_SYSTEM diff --git a/cpu/o3/lsq.hh b/cpu/o3/lsq.hh index d5f893e57..a1eeccbe7 100644 --- a/cpu/o3/lsq.hh +++ b/cpu/o3/lsq.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -32,10 +32,9 @@ #include #include -#include "base/hashmap.hh" #include "config/full_system.hh" #include "cpu/inst_seq.hh" -#include "cpu/o3/cpu_policy.hh" +//#include "cpu/o3/cpu_policy.hh" #include "cpu/o3/lsq_unit.hh" #include "mem/mem_interface.hh" //#include "mem/page_table.hh" @@ -85,7 +84,8 @@ class LSQ { /** Ticks the LSQ. */ void tick(); /** Ticks a specific LSQ Unit. */ - void tick(unsigned tid); + void tick(unsigned tid) + { thread[tid].tick(); } /** Inserts a load into the LSQ. */ void insertLoad(DynInstPtr &load_inst); @@ -95,18 +95,23 @@ class LSQ { /** Executes a load. */ Fault executeLoad(DynInstPtr &inst); - Fault executeLoad(int lq_idx, unsigned tid); + Fault executeLoad(int lq_idx, unsigned tid) + { return thread[tid].executeLoad(lq_idx); } + /** Executes a store. */ Fault executeStore(DynInstPtr &inst); /** * Commits loads up until the given sequence number for a specific thread. */ - void commitLoads(InstSeqNum &youngest_inst, unsigned tid); + void commitLoads(InstSeqNum &youngest_inst, unsigned tid) + { thread[tid].commitLoads(youngest_inst); } + /** * Commits stores up until the given sequence number for a specific thread. */ - void commitStores(InstSeqNum &youngest_inst, unsigned tid); + void commitStores(InstSeqNum &youngest_inst, unsigned tid) + { thread[tid].commitStores(youngest_inst); } /** * Attempts to write back stores until all cache ports are used or the @@ -119,7 +124,8 @@ class LSQ { /** * Squash instructions from a thread until the specified sequence number. */ - void squash(const InstSeqNum &squashed_num, unsigned tid); + void squash(const InstSeqNum &squashed_num, unsigned tid) + { thread[tid].squash(squashed_num); } /** Returns whether or not there was a memory ordering violation. */ bool violation(); @@ -127,12 +133,14 @@ class LSQ { * Returns whether or not there was a memory ordering violation for a * specific thread. 
*/ - bool violation(unsigned tid); + bool violation(unsigned tid) + { return thread[tid].violation(); } /** Returns if a load is blocked due to the memory system for a specific * thread. */ - bool loadBlocked(unsigned tid); + bool loadBlocked(unsigned tid) + { return thread[tid].loadBlocked(); } bool isLoadBlockedHandled(unsigned tid) { return thread[tid].isLoadBlockedHandled(); } @@ -141,10 +149,13 @@ class LSQ { { thread[tid].setLoadBlockedHandled(); } /** Gets the instruction that caused the memory ordering violation. */ - DynInstPtr getMemDepViolator(unsigned tid); + DynInstPtr getMemDepViolator(unsigned tid) + { return thread[tid].getMemDepViolator(); } /** Returns the head index of the load queue for a specific thread. */ - int getLoadHead(unsigned tid); + int getLoadHead(unsigned tid) + { return thread[tid].getLoadHead(); } + /** Returns the sequence number of the head of the load queue. */ InstSeqNum getLoadHeadSeqNum(unsigned tid) { @@ -152,7 +163,9 @@ class LSQ { } /** Returns the head index of the store queue. */ - int getStoreHead(unsigned tid); + int getStoreHead(unsigned tid) + { return thread[tid].getStoreHead(); } + /** Returns the sequence number of the head of the store queue. */ InstSeqNum getStoreHeadSeqNum(unsigned tid) { @@ -162,22 +175,26 @@ class LSQ { /** Returns the number of instructions in all of the queues. */ int getCount(); /** Returns the number of instructions in the queues of one thread. */ - int getCount(unsigned tid); + int getCount(unsigned tid) + { return thread[tid].getCount(); } /** Returns the total number of loads in the load queue. */ int numLoads(); /** Returns the total number of loads for a single thread. */ - int numLoads(unsigned tid); + int numLoads(unsigned tid) + { return thread[tid].numLoads(); } /** Returns the total number of stores in the store queue. */ int numStores(); /** Returns the total number of stores for a single thread. 
*/ - int numStores(unsigned tid); + int numStores(unsigned tid) + { return thread[tid].numStores(); } /** Returns the total number of loads that are ready. */ int numLoadsReady(); /** Returns the number of loads that are ready for a single thread. */ - int numLoadsReady(unsigned tid); + int numLoadsReady(unsigned tid) + { return thread[tid].numLoadsReady(); } /** Returns the number of free entries. */ unsigned numFreeEntries(); @@ -215,24 +232,30 @@ class LSQ { /** Returns whether or not there are any stores to write back to memory. */ bool hasStoresToWB(); + /** Returns whether or not a specific thread has any stores to write back * to memory. */ - bool hasStoresToWB(unsigned tid); + bool hasStoresToWB(unsigned tid) + { return thread[tid].hasStoresToWB(); } + /** Returns the number of stores a specific thread has to write back. */ - int numStoresToWB(unsigned tid); + int numStoresToWB(unsigned tid) + { return thread[tid].numStoresToWB(); } /** Returns if the LSQ will write back to memory this cycle. */ bool willWB(); /** Returns if the LSQ of a specific thread will write back to memory this * cycle. */ - bool willWB(unsigned tid); + bool willWB(unsigned tid) + { return thread[tid].willWB(); } /** Debugging function to print out all instructions. */ void dumpInsts(); /** Debugging function to print out instructions from a specific thread. */ - void dumpInsts(unsigned tid); + void dumpInsts(unsigned tid) + { thread[tid].dumpInsts(); } /** Executes a read operation, using the load specified at the load index. */ template diff --git a/cpu/o3/lsq_impl.hh b/cpu/o3/lsq_impl.hh index c43c19619..a6ad27522 100644 --- a/cpu/o3/lsq_impl.hh +++ b/cpu/o3/lsq_impl.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -26,6 +26,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include +#include + #include "cpu/o3/lsq.hh" using namespace std; @@ -89,7 +92,7 @@ LSQ::LSQ(Params *params) //Initialize LSQs for (int tid=0; tid < numThreads; tid++) { - thread[tid].init(params, maxLQEntries+1, maxSQEntries+1, tid); + thread[tid].init(params, maxLQEntries, maxSQEntries, tid); } } @@ -226,13 +229,6 @@ LSQ::tick() } } -template -void -LSQ::tick(unsigned tid) -{ - thread[tid].tick(); -} - template void LSQ::insertLoad(DynInstPtr &load_inst) @@ -260,13 +256,6 @@ LSQ::executeLoad(DynInstPtr &inst) return thread[tid].executeLoad(inst); } -template -Fault -LSQ::executeLoad(int lq_idx, unsigned tid) -{ - return thread[tid].executeLoad(lq_idx); -} - template Fault LSQ::executeStore(DynInstPtr &inst) @@ -276,20 +265,6 @@ LSQ::executeStore(DynInstPtr &inst) return thread[tid].executeStore(inst); } -template -void -LSQ::commitLoads(InstSeqNum &youngest_inst,unsigned tid) -{ - thread[tid].commitLoads(youngest_inst); -} - -template -void -LSQ::commitStores(InstSeqNum &youngest_inst,unsigned tid) -{ - thread[tid].commitStores(youngest_inst); -} - template void LSQ::writebackStores() @@ -300,28 +275,14 @@ LSQ::writebackStores() unsigned tid = *active_threads++; if (numStoresToWB(tid) > 0) { - DPRINTF(Writeback,"[tid:%i] Writing back stores. %i stores available" - " for Writeback.\n", tid, numStoresToWB(tid)); + DPRINTF(Writeback,"[tid:%i] Writing back stores. 
%i stores " + "available for Writeback.\n", tid, numStoresToWB(tid)); } thread[tid].writebackStores(); } } -template -int -LSQ::numStoresToWB(unsigned tid) -{ - return thread[tid].numStoresToWB(); -} - -template -void -LSQ::squash(const InstSeqNum &squashed_num, unsigned tid) -{ - thread[tid].squash(squashed_num); -} - template bool LSQ::violation() @@ -338,41 +299,6 @@ LSQ::violation() return false; } -template -bool -LSQ::violation(unsigned tid) -{ - return thread[tid].violation(); -} - -template -bool -LSQ::loadBlocked(unsigned tid) -{ - return thread[tid].loadBlocked(); -} - -template -typename Impl::DynInstPtr -LSQ::getMemDepViolator(unsigned tid) -{ - return thread[tid].getMemDepViolator(); -} - -template -int -LSQ::getLoadHead(unsigned tid) -{ - return thread[tid].getLoadHead(); -} - -template -int -LSQ::getStoreHead(unsigned tid) -{ - return thread[tid].getStoreHead(); -} - template int LSQ::getCount() @@ -389,13 +315,6 @@ LSQ::getCount() return total; } -template -int -LSQ::getCount(unsigned tid) -{ - return thread[tid].getCount(); -} - template int LSQ::numLoads() @@ -412,13 +331,6 @@ LSQ::numLoads() return total; } -template -int -LSQ::numLoads(unsigned tid) -{ - return thread[tid].numLoads(); -} - template int LSQ::numStores() @@ -435,13 +347,6 @@ LSQ::numStores() return total; } -template -int -LSQ::numStores(unsigned tid) -{ - return thread[tid].numStores(); -} - template int LSQ::numLoadsReady() @@ -458,13 +363,6 @@ LSQ::numLoadsReady() return total; } -template -int -LSQ::numLoadsReady(unsigned tid) -{ - return thread[tid].numLoadsReady(); -} - template unsigned LSQ::numFreeEntries() @@ -612,14 +510,6 @@ LSQ::hasStoresToWB() return true; } - -template -bool -LSQ::hasStoresToWB(unsigned tid) -{ - return thread[tid].hasStoresToWB(); -} - template bool LSQ::willWB() @@ -635,13 +525,6 @@ LSQ::willWB() return true; } -template -bool -LSQ::willWB(unsigned tid) -{ - return thread[tid].willWB(); -} - template void LSQ::dumpInsts() @@ -653,10 +536,3 @@ 
LSQ::dumpInsts() thread[tid].dumpInsts(); } } - -template -void -LSQ::dumpInsts(unsigned tid) -{ - thread[tid].dumpInsts(); -} diff --git a/cpu/o3/lsq_unit.hh b/cpu/o3/lsq_unit.hh index 623dbdb4b..942b4583d 100644 --- a/cpu/o3/lsq_unit.hh +++ b/cpu/o3/lsq_unit.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -29,29 +29,30 @@ #ifndef __CPU_O3_LSQ_UNIT_HH__ #define __CPU_O3_LSQ_UNIT_HH__ +#include #include #include -#include +#include "arch/faults.hh" #include "config/full_system.hh" #include "base/hashmap.hh" #include "cpu/inst_seq.hh" #include "mem/mem_interface.hh" //#include "mem/page_table.hh" -#include "sim/debug.hh" -#include "sim/sim_object.hh" -#include "arch/faults.hh" +//#include "sim/debug.hh" +//#include "sim/sim_object.hh" /** - * Class that implements the actual LQ and SQ for each specific thread. - * Both are circular queues; load entries are freed upon committing, while - * store entries are freed once they writeback. The LSQUnit tracks if there - * are memory ordering violations, and also detects partial load to store - * forwarding cases (a store only has part of a load's data) that requires - * the load to wait until the store writes back. In the former case it - * holds onto the instruction until the dependence unit looks at it, and - * in the latter it stalls the LSQ until the store writes back. At that - * point the load is replayed. + * Class that implements the actual LQ and SQ for each specific + * thread. Both are circular queues; load entries are freed upon + * committing, while store entries are freed once they writeback. 
The + * LSQUnit tracks if there are memory ordering violations, and also + * detects partial load to store forwarding cases (a store only has + * part of a load's data) that requires the load to wait until the + * store writes back. In the former case it holds onto the instruction + * until the dependence unit looks at it, and in the latter it stalls + * the LSQ until the store writes back. At that point the load is + * replayed. */ template class LSQUnit { @@ -76,21 +77,19 @@ class LSQUnit { /** Returns the description of this event. */ const char *description(); - private: - /** The store index of the store being written back. */ - int storeIdx; /** The writeback event for the store. Needed for store * conditionals. */ - public: Event *wbEvent; + + private: + /** The store index of the store being written back. */ + int storeIdx; private: /** The pointer to the LSQ unit that issued the store. */ LSQUnit *lsqPtr; }; - friend class StoreCompletionEvent; - public: /** Constructs an LSQ unit. init() must be called prior to use. */ LSQUnit(); @@ -136,14 +135,12 @@ class LSQUnit { /** Executes a load instruction. */ Fault executeLoad(DynInstPtr &inst); - Fault executeLoad(int lq_idx); + Fault executeLoad(int lq_idx) { panic("Not implemented"); return NoFault; } /** Executes a store instruction. */ Fault executeStore(DynInstPtr &inst); /** Commits the head load. */ void commitLoad(); - /** Commits a specific load, given by the sequence number. */ - void commitLoad(InstSeqNum &inst); /** Commits loads older than a specific sequence number. */ void commitLoads(InstSeqNum &youngest_inst); @@ -179,9 +176,7 @@ class LSQUnit { /** Returns the memory ordering violator. */ DynInstPtr getMemDepViolator(); - /** Returns if a load became blocked due to the memory system. It clears - * the bool's value upon this being called. - */ + /** Returns if a load became blocked due to the memory system. 
*/ bool loadBlocked() { return isLoadBlocked; } @@ -215,9 +210,6 @@ class LSQUnit { /** Returns if the SQ is full. */ bool sqFull() { return stores >= (SQEntries - 1); } - /** Debugging function to dump instructions in the LSQ. */ - void dumpInsts(); - /** Returns the number of instructions in the LSQ. */ unsigned getCount() { return loads + stores; } @@ -245,6 +237,10 @@ class LSQUnit { /** Decrements the given load index (circular queue). */ inline void decrLdIdx(int &load_idx); + public: + /** Debugging function to dump instructions in the LSQ. */ + void dumpInsts(); + private: /** Pointer to the CPU. */ FullCPU *cpu; @@ -287,38 +283,29 @@ class LSQUnit { /** Whether or not the store is completed. */ bool completed; }; -/* - enum Status { - Running, - Idle, - DcacheMissStall, - DcacheMissSwitch - }; -*/ + private: /** The LSQUnit thread id. */ unsigned lsqID; - /** The status of the LSQ unit. */ -// Status _status; - /** The store queue. */ std::vector storeQueue; /** The load queue. */ std::vector loadQueue; - // Consider making these 16 bits - /** The number of LQ entries. */ + /** The number of LQ entries, plus a sentinel entry (circular queue). + * @todo: Consider having var that records the true number of LQ entries. + */ unsigned LQEntries; - /** The number of SQ entries. */ + /** The number of SQ entries, plus a sentinel entry (circular queue). + * @todo: Consider having var that records the true number of SQ entries. + */ unsigned SQEntries; /** The number of load instructions in the LQ. */ int loads; - /** The number of store instructions in the SQ (excludes those waiting to - * writeback). - */ + /** The number of store instructions in the SQ. */ int stores; /** The number of store instructions in the SQ waiting to writeback. */ int storesToWB; @@ -330,8 +317,8 @@ class LSQUnit { /** The index of the head instruction in the SQ. 
*/ int storeHead; - /** The index of the first instruction that is ready to be written back, - * and has not yet been written back. + /** The index of the first instruction that may be ready to be + * written back, and has not yet been written back. */ int storeWBIdx; /** The index of the tail instruction in the SQ. */ @@ -348,13 +335,9 @@ class LSQUnit { //list mshrSeqNums; - //Stats::Scalar<> dcacheStallCycles; - Counter lastDcacheStall; - /** Wire to read information from the issue stage time queue. */ typename TimeBuffer::wire fromIssue; - // Make these per thread? /** Whether or not the LSQ is stalled. */ bool stalled; /** The store that causes the stall due to partial store to load @@ -364,20 +347,13 @@ class LSQUnit { /** The index of the above store. */ int stallingLoadIdx; - /** Whether or not a load is blocked due to the memory system. It is - * cleared when this value is checked via loadBlocked(). - */ + /** Whether or not a load is blocked due to the memory system. */ bool isLoadBlocked; bool loadBlockedHandled; InstSeqNum blockedLoadSeqNum; - /** The oldest faulting load instruction. */ - DynInstPtr loadFaultInst; - /** The oldest faulting store instruction. */ - DynInstPtr storeFaultInst; - /** The oldest load that caused a memory ordering violation. */ DynInstPtr memDepViolator; @@ -447,23 +423,14 @@ template Fault LSQUnit::read(MemReqPtr &req, T &data, int load_idx) { - //Depending on issue2execute delay a squashed load could - //execute if it is found to be squashed in the same - //cycle it is scheduled to execute assert(loadQueue[load_idx]); - if (loadQueue[load_idx]->isExecuted()) { - panic("Should not reach this point with split ops!"); - memcpy(&data,req->data,req->size); - - return NoFault; - } + assert(!loadQueue[load_idx]->isExecuted()); // Make sure this isn't an uncacheable access // A bit of a hackish way to get uncached accesses to work only if they're // at the head of the LSQ and are ready to commit (at the head of the ROB // too). 
- // @todo: Fix uncached accesses. if (req->flags & UNCACHEABLE && (load_idx != loadHead || !loadQueue[load_idx]->reachedCommit)) { iewStage->rescheduleMemInst(loadQueue[load_idx]); @@ -479,12 +446,16 @@ LSQUnit::read(MemReqPtr &req, T &data, int load_idx) "storeHead: %i addr: %#x\n", load_idx, store_idx, storeHead, req->paddr); -#ifdef FULL_SYSTEM +#if 0 if (req->flags & LOCKED) { cpu->lockAddr = req->paddr; cpu->lockFlag = true; } #endif + req->cmd = Read; + assert(!req->completionEvent); + req->completionEvent = NULL; + req->time = curTick; while (store_idx != -1) { // End once we've reached the top of the LSQ @@ -518,18 +489,14 @@ LSQUnit::read(MemReqPtr &req, T &data, int load_idx) // If the store's data has all of the data needed, we can forward. if (store_has_lower_limit && store_has_upper_limit) { - + // Get shift amount for offset into the store's data. int shift_amt = req->vaddr & (store_size - 1); - // Assumes byte addressing + // @todo: Magic number, assumes byte addressing shift_amt = shift_amt << 3; // Cast this to type T? data = storeQueue[store_idx].data >> shift_amt; - req->cmd = Read; - assert(!req->completionEvent); - req->completionEvent = NULL; - req->time = curTick; assert(!req->data); req->data = new uint8_t[64]; @@ -579,7 +546,6 @@ LSQUnit::read(MemReqPtr &req, T &data, int load_idx) // Do not generate a writeback event as this instruction is not // complete. - DPRINTF(LSQUnit, "Load-store forwarding mis-match. 
" "Store idx %i to load addr %#x\n", store_idx, req->vaddr); @@ -588,16 +554,13 @@ LSQUnit::read(MemReqPtr &req, T &data, int load_idx) } } - // If there's no forwarding case, then go access memory DynInstPtr inst = loadQueue[load_idx]; - DPRINTF(LSQUnit, "Doing functional access for inst PC %#x\n", - loadQueue[load_idx]->readPC()); + DPRINTF(LSQUnit, "Doing functional access for inst [sn:%lli] PC %#x\n", + loadQueue[load_idx]->seqNum, loadQueue[load_idx]->readPC()); + assert(!req->data); - req->cmd = Read; - req->completionEvent = NULL; - req->time = curTick; req->data = new uint8_t[64]; Fault fault = cpu->read(req, data); memcpy(req->data, &data, sizeof(T)); @@ -611,20 +574,19 @@ LSQUnit::read(MemReqPtr &req, T &data, int load_idx) if (isLoadBlocked && blockedLoadSeqNum < inst->seqNum) return NoFault; + // Record that the load was blocked due to memory. This + // load will squash all instructions after it, be + // refetched, and re-executed. isLoadBlocked = true; loadBlockedHandled = false; blockedLoadSeqNum = inst->seqNum; // No fault occurred, even though the interface is blocked. return NoFault; } + DPRINTF(LSQUnit, "Doing timing access for inst PC %#x\n", loadQueue[load_idx]->readPC()); -/* - Addr debug_addr = ULL(0xfffffc0000be81a8); - if (req->vaddr == debug_addr) { - debug_break(); - } -*/ + assert(!req->completionEvent); req->completionEvent = new typename IEW::LdWritebackEvent(loadQueue[load_idx], iewStage); @@ -632,75 +594,16 @@ LSQUnit::read(MemReqPtr &req, T &data, int load_idx) assert(dcacheInterface->doEvents()); - // Ugly hack to get an event scheduled *only* if the access is - // a miss. We really should add first-class support for this - // at some point. 
if (result != MA_HIT) { DPRINTF(LSQUnit, "LSQUnit: D-cache miss!\n"); DPRINTF(Activity, "Activity: ld accessing mem miss [sn:%lli]\n", inst->seqNum); - - lastDcacheStall = curTick; - -// _status = DcacheMissStall; - } else { + DPRINTF(LSQUnit, "LSQUnit: D-cache hit!\n"); DPRINTF(Activity, "Activity: ld accessing mem hit [sn:%lli]\n", inst->seqNum); - - DPRINTF(LSQUnit, "LSQUnit: D-cache hit!\n"); } } -#if 0 - // if we have a cache, do cache access too - if (dcacheInterface) { - if (dcacheInterface->isBlocked()) { - isLoadBlocked = true; - // No fault occurred, even though the interface is blocked. - return NoFault; - } - - DPRINTF(LSQUnit, "LSQUnit: D-cache: PC:%#x reading from paddr:%#x " - "vaddr:%#x flags:%i\n", - inst->readPC(), req->paddr, req->vaddr, req->flags); - - // Setup MemReq pointer - req->cmd = Read; - req->completionEvent = NULL; - req->time = curTick; - assert(!req->data); - req->data = new uint8_t[64]; - - assert(!req->completionEvent); - req->completionEvent = - new typename IEW::LdWritebackEvent(loadQueue[load_idx], iewStage); - - // Do Cache Access - MemAccessResult result = dcacheInterface->access(req); - - // Ugly hack to get an event scheduled *only* if the access is - // a miss. We really should add first-class support for this - // at some point. 
- // @todo: Probably should support having no events - if (result != MA_HIT) { - DPRINTF(LSQUnit, "LSQUnit: D-cache miss!\n"); - DPRINTF(Activity, "Activity: ld accessing mem miss [sn:%lli]\n", - inst->seqNum); - - lastDcacheStall = curTick; - - _status = DcacheMissStall; - - } else { - DPRINTF(Activity, "Activity: ld accessing mem hit [sn:%lli]\n", - inst->seqNum); - - DPRINTF(LSQUnit, "LSQUnit: D-cache hit!\n"); - } - } else { - fatal("Must use D-cache with new memory system"); - } -#endif return fault; } @@ -716,24 +619,11 @@ LSQUnit::write(MemReqPtr &req, T &data, int store_idx) " | storeHead:%i [sn:%i]\n", store_idx, req->paddr, data, storeHead, storeQueue[store_idx].inst->seqNum); -/* - if (req->flags & LOCKED) { - if (req->flags & UNCACHEABLE) { - req->result = 2; - } else { - req->result = 1; - } - } -*/ + storeQueue[store_idx].req = req; storeQueue[store_idx].size = sizeof(T); storeQueue[store_idx].data = data; -/* - Addr debug_addr = ULL(0xfffffc0000be81a8); - if (req->vaddr == debug_addr) { - debug_break(); - } -*/ + // This function only writes the data to the store queue, so no fault // can happen here. return NoFault; diff --git a/cpu/o3/lsq_unit_impl.hh b/cpu/o3/lsq_unit_impl.hh index dca808ac9..f0b4405ed 100644 --- a/cpu/o3/lsq_unit_impl.hh +++ b/cpu/o3/lsq_unit_impl.hh @@ -35,8 +35,8 @@ LSQUnit::StoreCompletionEvent::StoreCompletionEvent(int store_idx, Event *wb_event, LSQUnit *lsq_ptr) : Event(&mainEventQueue), - storeIdx(store_idx), wbEvent(wb_event), + storeIdx(store_idx), lsqPtr(lsq_ptr) { this->setFlags(Event::AutoDelete); @@ -86,15 +86,13 @@ LSQUnit::init(Params *params, unsigned maxLQEntries, lsqID = id; - LQEntries = maxLQEntries; - SQEntries = maxSQEntries; + // Add 1 for the sentinel entry (they are circular queues). 
+ LQEntries = maxLQEntries + 1; + SQEntries = maxSQEntries + 1; loadQueue.resize(LQEntries); storeQueue.resize(SQEntries); - - // May want to initialize these entries to NULL - loadHead = loadTail = 0; storeHead = storeWBIdx = storeTail = 0; @@ -104,7 +102,7 @@ LSQUnit::init(Params *params, unsigned maxLQEntries, dcacheInterface = params->dcacheInterface; - loadFaultInst = storeFaultInst = memDepViolator = NULL; + memDepViolator = NULL; blockedLoadSeqNum = 0; } @@ -152,6 +150,8 @@ LSQUnit::switchOut() for (int i = 0; i < loadQueue.size(); ++i) loadQueue[i] = NULL; + assert(storesToWB == 0); + while (storesToWB > 0 && storeWBIdx != storeTail && storeQueue[storeWBIdx].inst && @@ -218,7 +218,7 @@ LSQUnit::takeOverFrom() usedPorts = 0; - loadFaultInst = storeFaultInst = memDepViolator = NULL; + memDepViolator = NULL; blockedLoadSeqNum = 0; @@ -231,16 +231,17 @@ template void LSQUnit::resizeLQ(unsigned size) { - assert( size >= LQEntries); + unsigned size_plus_sentinel = size + 1; + assert(size_plus_sentinel >= LQEntries); - if (size > LQEntries) { - while (size > loadQueue.size()) { + if (size_plus_sentinel > LQEntries) { + while (size_plus_sentinel > loadQueue.size()) { DynInstPtr dummy; loadQueue.push_back(dummy); LQEntries++; } } else { - LQEntries = size; + LQEntries = size_plus_sentinel; } } @@ -249,14 +250,15 @@ template void LSQUnit::resizeSQ(unsigned size) { - if (size > SQEntries) { - while (size > storeQueue.size()) { + unsigned size_plus_sentinel = size + 1; + if (size_plus_sentinel > SQEntries) { + while (size_plus_sentinel > storeQueue.size()) { SQEntry dummy; storeQueue.push_back(dummy); SQEntries++; } } else { - SQEntries = size; + SQEntries = size_plus_sentinel; } } @@ -264,10 +266,8 @@ template void LSQUnit::insert(DynInstPtr &inst) { - // Make sure we really have a memory reference. assert(inst->isMemRef()); - // Make sure it's one of the two classes of memory references. 
assert(inst->isLoad() || inst->isStore()); if (inst->isLoad()) { @@ -283,7 +283,8 @@ template void LSQUnit::insertLoad(DynInstPtr &load_inst) { - assert((loadTail + 1) % LQEntries != loadHead && loads < LQEntries); + assert((loadTail + 1) % LQEntries != loadHead); + assert(loads < LQEntries); DPRINTF(LSQUnit, "Inserting load PC %#x, idx:%i [sn:%lli]\n", load_inst->readPC(), loadTail, load_inst->seqNum); @@ -322,7 +323,6 @@ LSQUnit::insertStore(DynInstPtr &store_inst) incrStIdx(storeTail); ++stores; - } template @@ -370,39 +370,6 @@ LSQUnit::numLoadsReady() return retval; } -#if 0 -template -Fault -LSQUnit::executeLoad() -{ - Fault load_fault = NoFault; - DynInstPtr load_inst; - - assert(readyLoads.size() != 0); - - // Execute a ready load. - LdMapIt ready_it = readyLoads.begin(); - - load_inst = (*ready_it).second; - - // Execute the instruction, which is held in the data portion of the - // iterator. - load_fault = load_inst->execute(); - - // If it executed successfully, then switch it over to the executed - // loads list. - if (load_fault == NoFault) { - executedLoads[load_inst->seqNum] = load_inst; - - readyLoads.erase(ready_it); - } else { - loadFaultInst = load_inst; - } - - return load_fault; -} -#endif - template Fault LSQUnit::executeLoad(DynInstPtr &inst) @@ -413,33 +380,14 @@ LSQUnit::executeLoad(DynInstPtr &inst) DPRINTF(LSQUnit, "Executing load PC %#x, [sn:%lli]\n", inst->readPC(),inst->seqNum); - // Make sure it's really in the list. - // Normally it should always be in the list. However, - /* due to a syscall it may not be the list. 
-#ifdef DEBUG - int i = loadHead; - while (1) { - if (i == loadTail && !find(inst)) { - assert(0 && "Load not in the queue!"); - } else if (loadQueue[i] == inst) { - break; - } - - i = i + 1; - if (i >= LQEntries) { - i = 0; - } - } -#endif // DEBUG*/ - // load_fault = inst->initiateAcc(); load_fault = inst->execute(); // If the instruction faulted, then we need to send it along to commit // without the instruction completing. if (load_fault != NoFault) { - // Maybe just set it as can commit here, although that might cause - // some other problems with sending traps to the ROB too quickly. + // Send this instruction to commit, also make sure iew stage + // realizes there is activity. iewStage->instToCommit(inst); iewStage->activityThisCycle(); } @@ -447,20 +395,6 @@ LSQUnit::executeLoad(DynInstPtr &inst) return load_fault; } -template -Fault -LSQUnit::executeLoad(int lq_idx) -{ - // Very hackish. Not sure the best way to check that this - // instruction is at the head of the ROB. I should have some sort - // of extra information here so that I'm not overloading the - // canCommit signal for 15 different things. - loadQueue[lq_idx]->setCanCommit(); - Fault ret_fault = executeLoad(loadQueue[lq_idx]); - loadQueue[lq_idx]->clearCanCommit(); - return ret_fault; -} - template Fault LSQUnit::executeStore(DynInstPtr &store_inst) @@ -481,11 +415,7 @@ LSQUnit::executeStore(DynInstPtr &store_inst) Fault store_fault = store_inst->initiateAcc(); // Fault store_fault = store_inst->execute(); - // Store size should now be available. Use it to get proper offset for - // addr comparisons. 
- int size = storeQueue[store_idx].size; - - if (size == 0) { + if (storeQueue[store_idx].size == 0) { DPRINTF(LSQUnit,"Fault on Store PC %#x, [sn:%lli],Size = 0\n", store_inst->readPC(),store_inst->seqNum); @@ -494,30 +424,25 @@ LSQUnit::executeStore(DynInstPtr &store_inst) assert(store_fault == NoFault); - if (!storeFaultInst) { - if (store_fault != NoFault) { - panic("Fault in a store instruction!"); - storeFaultInst = store_inst; - } else if (store_inst->isNonSpeculative()) { - // Nonspeculative accesses (namely store conditionals) - // need to set themselves as able to writeback if we - // haven't had a fault by here. - storeQueue[store_idx].canWB = true; + if (store_inst->isNonSpeculative()) { + // Nonspeculative accesses (namely store conditionals) + // need to set themselves as able to writeback if we + // haven't had a fault by here. + storeQueue[store_idx].canWB = true; - ++storesToWB; - } + ++storesToWB; } if (!memDepViolator) { while (load_idx != loadTail) { - // Actually should only check loads that have actually executed - // Might be safe because effAddr is set to InvalAddr when the - // dyn inst is created. + // Really only need to check loads that have actually executed + // It's safe to check all loads because effAddr is set to + // InvalAddr when the dyn inst is created. + + // @todo: For now this is extra conservative, detecting a + // violation if the addresses match assuming all accesses + // are quad word accesses. - // Must actually check all addrs in the proper size range - // Which is more correct than needs to be. What if for now we just - // assume all loads are quad-word loads, and do the addr based - // on that. 
// @todo: Fix this, magic number being used here if ((loadQueue[load_idx]->effAddr >> 8) == (store_inst->effAddr >> 8)) { @@ -555,32 +480,6 @@ LSQUnit::commitLoad() --loads; } -template -void -LSQUnit::commitLoad(InstSeqNum &inst) -{ - // Hopefully I don't use this function too much - panic("Don't use this function!"); - - int i = loadHead; - while (1) { - if (i == loadTail) { - assert(0 && "Load not in the queue!"); - } else if (loadQueue[i]->seqNum == inst) { - break; - } - - ++i; - if (i >= LQEntries) { - i = 0; - } - } - - loadQueue[i]->removeInLSQ(); - loadQueue[i] = NULL; - --loads; -} - template void LSQUnit::commitLoads(InstSeqNum &youngest_inst) @@ -602,6 +501,8 @@ LSQUnit::commitStores(InstSeqNum &youngest_inst) while (store_idx != storeTail) { assert(storeQueue[store_idx].inst); + // Mark any stores that are now committed and have not yet + // been marked as able to write back. if (!storeQueue[store_idx].canWB) { if (storeQueue[store_idx].inst->seqNum > youngest_inst) { break; @@ -613,7 +514,6 @@ LSQUnit::commitStores(InstSeqNum &youngest_inst) storeQueue[store_idx].canWB = true; -// --stores; ++storesToWB; } @@ -631,6 +531,8 @@ LSQUnit::writebackStores() storeQueue[storeWBIdx].canWB && usedPorts < cachePorts) { + // Store didn't write any data so no need to write it back to + // memory. if (storeQueue[storeWBIdx].size == 0) { completeStore(storeWBIdx); @@ -659,7 +561,6 @@ LSQUnit::writebackStores() MemReqPtr req = storeQueue[storeWBIdx].req; storeQueue[storeWBIdx].committed = true; -// Fault fault = cpu->translateDataWriteReq(req); req->cmd = Write; req->completionEvent = NULL; req->time = curTick; @@ -689,6 +590,12 @@ LSQUnit::writebackStores() default: panic("Unexpected store size!\n"); } + + // Stores other than store conditionals are completed at this + // time. Mark them as completed and, if we have a checker, + // tell it that the instruction is completed. + // @todo: Figure out what time I can say stores are complete in + // the timing memory. 
if (!(req->flags & LOCKED)) { storeQueue[storeWBIdx].inst->setCompleted(); if (cpu->checker) { @@ -714,57 +621,35 @@ LSQUnit::writebackStores() iewStage->replayMemInst(loadQueue[stallingLoadIdx]); } - if (result != MA_HIT && dcacheInterface->doEvents()) { - typename IEW::LdWritebackEvent *wb = NULL; - if (req->flags & LOCKED) { - // Stx_C should not generate a system port transaction, - // but that might be hard to accomplish. - wb = new typename - IEW::LdWritebackEvent(storeQueue[storeWBIdx].inst, + typename IEW::LdWritebackEvent *wb = NULL; + if (req->flags & LOCKED) { + // Stx_C should not generate a system port transaction + // if it misses in the cache, but that might be hard + // to accomplish without explicit cache support. + wb = new typename + IEW::LdWritebackEvent(storeQueue[storeWBIdx].inst, iewStage); - store_event->wbEvent = wb; - } + store_event->wbEvent = wb; + } - DPRINTF(LSQUnit,"D-Cache Write Miss!\n"); + if (result != MA_HIT && dcacheInterface->doEvents()) { + DPRINTF(LSQUnit,"D-Cache Write Miss on idx:%i!\n", + storeWBIdx); DPRINTF(Activity, "Active st accessing mem miss [sn:%lli]\n", storeQueue[storeWBIdx].inst->seqNum); - lastDcacheStall = curTick; - -// _status = DcacheMissStall; - //mshrSeqNums.push_back(storeQueue[storeWBIdx].inst->seqNum); //DPRINTF(LSQUnit, "Added MSHR. count = %i\n",mshrSeqNums.size()); - // Increment stat here or something + // @todo: Increment stat here. } else { DPRINTF(LSQUnit,"D-Cache: Write Hit on idx:%i !\n", storeWBIdx); DPRINTF(Activity, "Active st accessing mem hit [sn:%lli]\n", storeQueue[storeWBIdx].inst->seqNum); - - - if (req->flags & LOCKED) { - // Stx_C does not generate a system port transaction. 
-/* - if (req->flags & UNCACHEABLE) { - req->result = 2; - } else { - if (cpu->lockFlag && cpu->lockAddr == req->paddr) { - req->result=1; - } else { - req->result = 0; - } - } -*/ - typename IEW::LdWritebackEvent *wb = - new typename IEW::LdWritebackEvent(storeQueue[storeWBIdx].inst, - iewStage); - store_event->wbEvent = wb; - } } incrStIdx(storeWBIdx); @@ -798,14 +683,12 @@ void LSQUnit::squash(const InstSeqNum &squashed_num) { DPRINTF(LSQUnit, "Squashing until [sn:%lli]!" - "(Loads:%i Stores:%i)\n",squashed_num,loads,stores); + "(Loads:%i Stores:%i)\n", squashed_num, loads, stores); int load_idx = loadTail; decrLdIdx(load_idx); while (loads != 0 && loadQueue[load_idx]->seqNum > squashed_num) { - - // Clear the smart pointer to make sure it is decremented. DPRINTF(LSQUnit,"Load Instruction PC %#x squashed, " "[sn:%lli]\n", loadQueue[load_idx]->readPC(), @@ -817,6 +700,7 @@ LSQUnit::squash(const InstSeqNum &squashed_num) stallingLoadIdx = 0; } + // Clear the smart pointer to make sure it is decremented. loadQueue[load_idx]->squashed = true; loadQueue[load_idx] = NULL; --loads; @@ -840,19 +724,18 @@ LSQUnit::squash(const InstSeqNum &squashed_num) while (stores != 0 && storeQueue[store_idx].inst->seqNum > squashed_num) { - + // Instructions marked as can WB are already committed. if (storeQueue[store_idx].canWB) { break; } - // Clear the smart pointer to make sure it is decremented. DPRINTF(LSQUnit,"Store Instruction PC %#x squashed, " "idx:%i [sn:%lli]\n", storeQueue[store_idx].inst->readPC(), store_idx, storeQueue[store_idx].inst->seqNum); - // I don't think this can happen. It should have been cleared by the - // stalling load. + // I don't think this can happen. It should have been cleared + // by the stalling load. 
if (isStalled() && storeQueue[store_idx].inst->seqNum == stallingStoreIsn) { panic("Is stalled should have been cleared by stalling load!\n"); @@ -860,13 +743,17 @@ LSQUnit::squash(const InstSeqNum &squashed_num) stallingStoreIsn = 0; } + // Clear the smart pointer to make sure it is decremented. storeQueue[store_idx].inst->squashed = true; storeQueue[store_idx].inst = NULL; storeQueue[store_idx].canWB = 0; if (storeQueue[store_idx].req) { + // There should not be a completion event if the store has + // not yet committed. assert(!storeQueue[store_idx].req->completionEvent); } + storeQueue[store_idx].req = NULL; --stores; @@ -877,36 +764,6 @@ LSQUnit::squash(const InstSeqNum &squashed_num) } } -template -void -LSQUnit::dumpInsts() -{ - cprintf("Load store queue: Dumping instructions.\n"); - cprintf("Load queue size: %i\n", loads); - cprintf("Load queue: "); - - int load_idx = loadHead; - - while (load_idx != loadTail && loadQueue[load_idx]) { - cprintf("%#x ", loadQueue[load_idx]->readPC()); - - incrLdIdx(load_idx); - } - - cprintf("Store queue size: %i\n", stores); - cprintf("Store queue: "); - - int store_idx = storeHead; - - while (store_idx != storeTail && storeQueue[store_idx].inst) { - cprintf("%#x ", storeQueue[store_idx].inst->readPC()); - - incrStIdx(store_idx); - } - - cprintf("\n"); -} - template void LSQUnit::completeStore(int store_idx) @@ -930,7 +787,9 @@ LSQUnit::completeStore(int store_idx) iewStage->updateLSQNextCycle = true; } - DPRINTF(LSQUnit, "Store head idx:%i\n", storeHead); + DPRINTF(LSQUnit, "Completing store [sn:%lli], idx:%i, store head " + "idx:%i\n", + storeQueue[store_idx].inst->seqNum, store_idx, storeHead); if (isStalled() && storeQueue[store_idx].inst->seqNum == stallingStoreIsn) { @@ -943,6 +802,10 @@ LSQUnit::completeStore(int store_idx) } storeQueue[store_idx].inst->setCompleted(); + + // Tell the checker we've completed this instruction. 
Some stores + // may get reported twice to the checker, but the checker can + // handle that case. if (cpu->checker) { cpu->checker->tick(storeQueue[store_idx].inst); } @@ -979,3 +842,33 @@ LSQUnit::decrLdIdx(int &load_idx) if (--load_idx < 0) load_idx += LQEntries; } + +template +void +LSQUnit::dumpInsts() +{ + cprintf("Load store queue: Dumping instructions.\n"); + cprintf("Load queue size: %i\n", loads); + cprintf("Load queue: "); + + int load_idx = loadHead; + + while (load_idx != loadTail && loadQueue[load_idx]) { + cprintf("%#x ", loadQueue[load_idx]->readPC()); + + incrLdIdx(load_idx); + } + + cprintf("Store queue size: %i\n", stores); + cprintf("Store queue: "); + + int store_idx = storeHead; + + while (store_idx != storeTail && storeQueue[store_idx].inst) { + cprintf("%#x ", storeQueue[store_idx].inst->readPC()); + + incrStIdx(store_idx); + } + + cprintf("\n"); +} diff --git a/cpu/o3/mem_dep_unit.hh b/cpu/o3/mem_dep_unit.hh index 141e0fdc4..acbe08ec2 100644 --- a/cpu/o3/mem_dep_unit.hh +++ b/cpu/o3/mem_dep_unit.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -201,13 +201,6 @@ class MemDepUnit { static int memdep_erase; }; - struct ltMemDepEntry { - bool operator() (const MemDepEntryPtr &lhs, const MemDepEntryPtr &rhs) - { - return lhs->inst->seqNum < rhs->inst->seqNum; - } - }; - /** Finds the memory dependence entry in the hash map. */ inline MemDepEntryPtr &findInHash(const DynInstPtr &inst); diff --git a/cpu/o3/mem_dep_unit_impl.hh b/cpu/o3/mem_dep_unit_impl.hh index 05a33685d..8b195baab 100644 --- a/cpu/o3/mem_dep_unit_impl.hh +++ b/cpu/o3/mem_dep_unit_impl.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -141,12 +141,12 @@ MemDepUnit::insert(DynInstPtr &inst) std::pair(inst->seqNum, inst_entry)); MemDepEntry::memdep_insert++; - // Add the instruction to the instruction list. instList[tid].push_back(inst); inst_entry->listIt = --(instList[tid].end()); - // Check the dependence predictor for any producing stores. + // Check any barriers and the dependence predictor for any + // producing stores. InstSeqNum producing_store; if (inst->isLoad() && loadBarrier) { producing_store = loadBarrierSN; @@ -181,7 +181,7 @@ MemDepUnit::insert(DynInstPtr &inst) moveToReady(inst_entry); } } else { - // Otherwise make the instruction dependent on the store. + // Otherwise make the instruction dependent on the store/barrier. DPRINTF(MemDepUnit, "Adding to dependency list; " "inst PC %#x is dependent on [sn:%lli].\n", inst->readPC(), producing_store); @@ -193,8 +193,6 @@ MemDepUnit::insert(DynInstPtr &inst) // Add this instruction to the list of dependents. 
store_entry->dependInsts.push_back(inst_entry); -// inst_entry->producingStore = store_entry; - if (inst->isLoad()) { ++conflictingLoads; } else { @@ -370,8 +368,6 @@ MemDepUnit::completed(DynInstPtr &inst) instList[tid].erase((*hash_it).second->listIt); -// (*hash_it).second->inst = NULL; - (*hash_it).second = NULL; memDepHash.erase(hash_it); @@ -416,7 +412,6 @@ MemDepUnit::wakeDependents(DynInstPtr &inst) if (!woken_inst->inst) { // Potentially removed mem dep entries could be on this list -// inst_entry->dependInsts[i] = NULL; continue; } @@ -429,7 +424,6 @@ MemDepUnit::wakeDependents(DynInstPtr &inst) } else { woken_inst->memDepReady = true; } -// inst_entry->dependInsts[i] = NULL; } inst_entry->dependInsts.clear(); @@ -468,13 +462,7 @@ MemDepUnit::squash(const InstSeqNum &squashed_num, assert(hash_it != memDepHash.end()); (*hash_it).second->squashed = true; -/* - for (int i = 0; i < (*hash_it).second->dependInsts.size(); ++i) { - (*hash_it).second->dependInsts[i] = NULL; - } - (*hash_it).second->inst = NULL; -*/ (*hash_it).second = NULL; memDepHash.erase(hash_it); diff --git a/cpu/o3/rename.hh b/cpu/o3/rename.hh index dd2cb0c18..3f1a27bb5 100644 --- a/cpu/o3/rename.hh +++ b/cpu/o3/rename.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -35,15 +35,16 @@ #include "base/timebuf.hh" /** - * DefaultRename handles both single threaded and SMT rename. Its width is - * specified by the parameters; each cycle it tries to rename that many - * instructions. It holds onto the rename history of all instructions with - * destination registers, storing the arch. register, the new physical - * register, and the old physical register, to allow for undoing of mappings - * if squashing happens, or freeing up registers upon commit. 
Rename handles - * blocking if the ROB, IQ, or LSQ is going to be full. Rename also handles - * barriers, and does so by stalling on the instruction until the ROB is - * empty and there are no instructions in flight to the ROB. + * DefaultRename handles both single threaded and SMT rename. Its + * width is specified by the parameters; each cycle it tries to rename + * that many instructions. It holds onto the rename history of all + * instructions with destination registers, storing the + * arch. register, the new physical register, and the old physical + * register, to allow for undoing of mappings if squashing happens, or + * freeing up registers upon commit. Rename handles blocking if the + * ROB, IQ, or LSQ is going to be full. Rename also handles barriers, + * and does so by stalling on the instruction until the ROB is empty + * and there are no instructions in flight to the ROB. */ template class DefaultRename @@ -68,14 +69,15 @@ class DefaultRename // Typedefs from the ISA. typedef TheISA::RegIndex RegIndex; - // A deque is used to queue the instructions. Barrier insts must be - // added to the front of the deque, which is the only reason for using - // a deque instead of a queue. (Most other stages use a queue) + // A list is used to queue the instructions. Barrier insts must + // be added to the front of the list, which is the only reason for + // using a list instead of a queue. (Most other stages use a + // queue) typedef std::list InstQueue; public: - /** Overall rename status. Used to determine if the CPU can deschedule - * itself due to a lack of activity. + /** Overall rename status. Used to determine if the CPU can + * deschedule itself due to a lack of activity. 
*/ enum RenameStatus { Active, diff --git a/cpu/o3/rename_impl.hh b/cpu/o3/rename_impl.hh index db4bb2ffe..081581c92 100644 --- a/cpu/o3/rename_impl.hh +++ b/cpu/o3/rename_impl.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -209,17 +209,13 @@ template void DefaultRename::initStage() { + // Grab the number of free entries directly from the stages. for (int tid=0; tid < numThreads; tid++) { freeEntries[tid].iqEntries = iew_ptr->instQueue.numFreeEntries(tid); freeEntries[tid].lsqEntries = iew_ptr->ldstQueue.numFreeEntries(tid); freeEntries[tid].robEntries = commit_ptr->numROBFreeEntries(tid); emptyROB[tid] = true; } - - // Clear these pointers so they are not accidentally used in - // non-initialization code. -// iew_ptr = NULL; -// commit_ptr = NULL; } template @@ -299,6 +295,7 @@ DefaultRename::takeOverFrom() _status = Inactive; initStage(); + // Reset all state prior to taking over from the other CPU. for (int i=0; i< numThreads; i++) { renameStatus[i] = Idle; @@ -326,7 +323,7 @@ DefaultRename::squash(unsigned tid) if (renameStatus[tid] == Blocked || renameStatus[tid] == Unblocking || renameStatus[tid] == SerializeStall) { -#if !FULL_SYSTEM +#if 0 // In syscall emulation, we can have both a block and a squash due // to a syscall in the same cycle. This would cause both signals to // be high. This shouldn't happen in full system. @@ -344,7 +341,7 @@ DefaultRename::squash(unsigned tid) // Set the status to Squashing. renameStatus[tid] = Squashing; - // Clear the skid buffer in case it has any data in it. + // Squash any instructions from decode. 
unsigned squashCount = 0; for (int i=0; isize; i++) { @@ -367,9 +364,6 @@ template void DefaultRename::tick() { - // Rename will need to try to rename as many instructions as it - // has bandwidth, unless it is blocked. - wroteToTimeBuffer = false; blockThisCycle = false; @@ -454,8 +448,6 @@ DefaultRename::rename(bool &status_change, unsigned tid) } else if (renameStatus[tid] == Unblocking) { renameInsts(tid); -// ++renameUnblockCycles; - if (validInsts()) { // Add the current inputs to the skid buffer so they can be // reprocessed when this stage unblocks. @@ -575,7 +567,6 @@ DefaultRename::renameInsts(unsigned tid) insts_to_rename.pop_front(); - //Use skidBuffer with oldest instructions if (renameStatus[tid] == Unblocking) { DPRINTF(Rename,"[tid:%u]: Removing [sn:%lli] PC:%#x from rename " "skidBuffer\n", @@ -711,10 +702,10 @@ void DefaultRename::sortInsts() { int insts_from_decode = fromDecode->size; - +#ifdef DEBUG for (int i=0; i < numThreads; i++) assert(insts[i].empty()); - +#endif for (int i = 0; i < insts_from_decode; ++i) { DynInstPtr inst = fromDecode->insts[i]; insts[inst->threadNumber].push_back(inst); @@ -794,8 +785,8 @@ DefaultRename::block(unsigned tid) wroteToTimeBuffer = true; } - // Rename can not go from SerializeStall to Blocked, otherwise it would - // not know to complete the serialize stall. + // Rename can not go from SerializeStall to Blocked, otherwise + // it would not know to complete the serialize stall. if (renameStatus[tid] != SerializeStall) { // Set status to Blocked. renameStatus[tid] = Blocked; @@ -835,15 +826,11 @@ DefaultRename::doSquash(unsigned tid) InstSeqNum squashed_seq_num = fromCommit->commitInfo[tid].doneSeqNum; -//#if FULL_SYSTEM -// assert(!historyBuffer[tid].empty()); -//#else // After a syscall squashes everything, the history buffer may be empty // but the ROB may still be squashing instructions. 
if (historyBuffer[tid].empty()) { return; } -//#endif // FULL_SYSTEM // Go through the most recent instructions, undoing the mappings // they did and freeing up the registers. @@ -896,8 +883,8 @@ DefaultRename::removeFromHistory(InstSeqNum inst_seq_num, unsigned tid) hb_it != historyBuffer[tid].end() && (*hb_it).instSeqNum <= inst_seq_num) { - DPRINTF(Rename, "[tid:%u]: Freeing up older rename of reg %i, sequence" - " number %i.\n", + DPRINTF(Rename, "[tid:%u]: Freeing up older rename of reg %i, " + "[sn:%lli].\n", tid, (*hb_it).prevPhysReg, (*hb_it).instSeqNum); freeList->addReg((*hb_it).prevPhysReg); diff --git a/cpu/o3/rename_map.cc b/cpu/o3/rename_map.cc index 8ba632e65..fc59058a1 100644 --- a/cpu/o3/rename_map.cc +++ b/cpu/o3/rename_map.cc @@ -32,18 +32,12 @@ using namespace std; -// Todo: Consider making functions inline. Avoid having things that are -// using the zero register or misc registers from adding on the registers -// to the free list. Possibly remove the direct communication between -// this and the freelist. Considering making inline bool functions that -// determine if the register is a logical int, logical fp, physical int, -// physical fp, etc. +// @todo: Consider making inline bool functions that determine if the +// register is a logical int, logical fp, physical int, physical fp, +// etc. SimpleRenameMap::~SimpleRenameMap() { - // Delete the rename maps as they were allocated with new. - //delete [] intRenameMap; - //delete [] floatRenameMap; } void @@ -105,7 +99,8 @@ SimpleRenameMap::init(unsigned _numLogicalIntRegs, // Although the index refers purely to architected registers, because // the floating reg indices come after the integer reg indices, they // may exceed the size of a normal RegIndex (short). 
- for (PhysRegIndex index = numLogicalIntRegs; index < numLogicalRegs; ++index) + for (PhysRegIndex index = numLogicalIntRegs; + index < numLogicalRegs; ++index) { floatRenameMap[index].physical_reg = freg_idx++; } @@ -132,14 +127,10 @@ SimpleRenameMap::init(unsigned _numLogicalIntRegs, void SimpleRenameMap::setFreeList(SimpleFreeList *fl_ptr) { - //Setup the interface to the freelist. freeList = fl_ptr; } -// Don't allow this stage to fault; force that check to the rename stage. -// Simply ask to rename a logical register and get back a new physical -// register index. SimpleRenameMap::RenameInfo SimpleRenameMap::rename(RegIndex arch_reg) { @@ -152,13 +143,11 @@ SimpleRenameMap::rename(RegIndex arch_reg) // requested architected register. prev_reg = intRenameMap[arch_reg].physical_reg; - // If it's not referencing the zero register, then mark the register - // as not ready. + // If it's not referencing the zero register, then rename the + // register. if (arch_reg != intZeroReg) { - // Get a free physical register to rename to. renamed_reg = freeList->getIntReg(); - // Update the integer rename map. intRenameMap[arch_reg].physical_reg = renamed_reg; assert(renamed_reg >= 0 && renamed_reg < numPhysicalIntRegs); @@ -168,20 +157,15 @@ SimpleRenameMap::rename(RegIndex arch_reg) renamed_reg = intZeroReg; } } else if (arch_reg < numLogicalRegs) { - // Subtract off the base offset for floating point registers. -// arch_reg = arch_reg - numLogicalIntRegs; - // Record the current physical register that is renamed to the // requested architected register. prev_reg = floatRenameMap[arch_reg].physical_reg; - // If it's not referencing the zero register, then mark the register - // as not ready. + // If it's not referencing the zero register, then rename the + // register. if (arch_reg != floatZeroReg) { - // Get a free floating point register to rename to. renamed_reg = freeList->getFloatReg(); - // Update the floating point rename map. 
floatRenameMap[arch_reg].physical_reg = renamed_reg; assert(renamed_reg < numPhysicalRegs && @@ -194,10 +178,10 @@ SimpleRenameMap::rename(RegIndex arch_reg) // Subtract off the base offset for miscellaneous registers. arch_reg = arch_reg - numLogicalRegs; - // No renaming happens to the misc. registers. They are simply the - // registers that come after all the physical registers; thus - // take the base architected register and add the physical registers - // to it. + // No renaming happens to the misc. registers. They are + // simply the registers that come after all the physical + // registers; thus take the base architected register and add + // the physical registers to it. renamed_reg = arch_reg + numPhysicalRegs; // Set the previous register to the same register; mainly it must be @@ -211,17 +195,12 @@ SimpleRenameMap::rename(RegIndex arch_reg) return RenameInfo(renamed_reg, prev_reg); } -//Perhaps give this a pair as a return value, of the physical register -//and whether or not it's ready. PhysRegIndex SimpleRenameMap::lookup(RegIndex arch_reg) { if (arch_reg < numLogicalIntRegs) { return intRenameMap[arch_reg].physical_reg; } else if (arch_reg < numLogicalRegs) { - // Subtract off the base FP offset. -// arch_reg = arch_reg - numLogicalIntRegs; - return floatRenameMap[arch_reg].physical_reg; } else { // Subtract off the misc registers offset. @@ -233,51 +212,23 @@ SimpleRenameMap::lookup(RegIndex arch_reg) } } -// In this implementation the miscellaneous registers do not actually rename, -// so this function does not allow you to try to change their mappings. void SimpleRenameMap::setEntry(RegIndex arch_reg, PhysRegIndex renamed_reg) { + // In this implementation the miscellaneous registers do not + // actually rename, so this function does not allow you to try to + // change their mappings. 
if (arch_reg < numLogicalIntRegs) { DPRINTF(Rename, "Rename Map: Integer register %i being set to %i.\n", (int)arch_reg, renamed_reg); intRenameMap[arch_reg].physical_reg = renamed_reg; } else if (arch_reg < numLogicalIntRegs + numLogicalFloatRegs) { - - DPRINTF(Rename, "Rename Map: Float register %i being set to %i.\n", (int)arch_reg - numLogicalIntRegs, renamed_reg); floatRenameMap[arch_reg].physical_reg = renamed_reg; } - - //assert(arch_reg < (numLogicalIntRegs + numLogicalFloatRegs)); -} - -void -SimpleRenameMap::squash(vector freed_regs, - vector unmaps) -{ - panic("Not sure this function should be called."); - - // Not sure the rename map should be able to access the free list - // like this. - while (!freed_regs.empty()) { - RegIndex free_register = freed_regs.back(); - - if (free_register < numPhysicalIntRegs) { - freeList->addIntReg(free_register); - } else { - // Subtract off the base FP dependence tag. - free_register = free_register - numPhysicalIntRegs; - freeList->addFloatReg(free_register); - } - - freed_regs.pop_back(); - } - - // Take unmap info and roll back the rename map. } int diff --git a/cpu/o3/rename_map.hh b/cpu/o3/rename_map.hh index 3ecbe45c3..d7e49ae83 100644 --- a/cpu/o3/rename_map.hh +++ b/cpu/o3/rename_map.hh @@ -101,9 +101,6 @@ class SimpleRenameMap */ void setEntry(RegIndex arch_reg, PhysRegIndex renamed_reg); - void squash(std::vector freed_regs, - std::vector unmaps); - int numFreeEntries(); private: @@ -153,7 +150,7 @@ class SimpleRenameMap }; //Change this to private - public: + private: /** Integer rename map. */ std::vector intRenameMap; diff --git a/cpu/o3/rob.hh b/cpu/o3/rob.hh index 0748850ea..e05eebe5a 100644 --- a/cpu/o3/rob.hh +++ b/cpu/o3/rob.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -53,9 +53,7 @@ class ROB enum Status { Running, Idle, - ROBSquashing, - DcacheMissStall, - DcacheMissComplete + ROBSquashing }; /** SMT ROB Sharing Policy */ @@ -112,7 +110,7 @@ class ROB * no guarantee as to the return value if the ROB is empty. * @retval Pointer to the DynInst that is at the head of the ROB. */ - DynInstPtr readHeadInst(); +// DynInstPtr readHeadInst(); /** Returns a pointer to the head instruction of a specific thread within * the ROB. @@ -124,7 +122,7 @@ class ROB * no guarantee as to the return value if the ROB is empty. * @retval Pointer to the DynInst that is at the tail of the ROB. */ - DynInstPtr readTailInst(); +// DynInstPtr readTailInst(); /** Returns a pointer to the tail instruction of a specific thread within * the ROB. @@ -133,7 +131,7 @@ class ROB DynInstPtr readTailInst(unsigned tid); /** Retires the head instruction, removing it from the ROB. */ - void retireHead(); +// void retireHead(); /** Retires the head instruction of a specific thread, removing it from the * ROB. @@ -141,7 +139,7 @@ class ROB void retireHead(unsigned tid); /** Is the oldest instruction across all threads ready. */ - bool isHeadReady(); +// bool isHeadReady(); /** Is the oldest instruction across a particular thread ready. */ bool isHeadReady(unsigned tid); @@ -200,35 +198,35 @@ class ROB void updateTail(); /** Reads the PC of the oldest head instruction. */ - uint64_t readHeadPC(); +// uint64_t readHeadPC(); /** Reads the PC of the head instruction of a specific thread. */ - uint64_t readHeadPC(unsigned tid); +// uint64_t readHeadPC(unsigned tid); /** Reads the next PC of the oldest head instruction. */ - uint64_t readHeadNextPC(); +// uint64_t readHeadNextPC(); /** Reads the next PC of the head instruction of a specific thread. */ - uint64_t readHeadNextPC(unsigned tid); +// uint64_t readHeadNextPC(unsigned tid); /** Reads the sequence number of the oldest head instruction. 
*/ - InstSeqNum readHeadSeqNum(); +// InstSeqNum readHeadSeqNum(); /** Reads the sequence number of the head instruction of a specific thread. */ - InstSeqNum readHeadSeqNum(unsigned tid); +// InstSeqNum readHeadSeqNum(unsigned tid); /** Reads the PC of the youngest tail instruction. */ - uint64_t readTailPC(); +// uint64_t readTailPC(); /** Reads the PC of the tail instruction of a specific thread. */ - uint64_t readTailPC(unsigned tid); +// uint64_t readTailPC(unsigned tid); /** Reads the sequence number of the youngest tail instruction. */ - InstSeqNum readTailSeqNum(); +// InstSeqNum readTailSeqNum(); /** Reads the sequence number of tail instruction of a specific thread. */ - InstSeqNum readTailSeqNum(unsigned tid); +// InstSeqNum readTailSeqNum(unsigned tid); /** Checks if the ROB is still in the process of squashing instructions. * @retval Whether or not the ROB is done squashing. diff --git a/cpu/o3/rob_impl.hh b/cpu/o3/rob_impl.hh index 02a4bfbee..25e0c80fd 100644 --- a/cpu/o3/rob_impl.hh +++ b/cpu/o3/rob_impl.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -201,20 +201,15 @@ template void ROB::insertInst(DynInstPtr &inst) { - // Make sure we have the right number of instructions. //assert(numInstsInROB == countInsts()); - - // Make sure the instruction is valid. assert(inst); DPRINTF(ROB, "Adding inst PC %#x to the ROB.\n", inst->readPC()); - // If the ROB is full then exit. 
assert(numInstsInROB != numEntries); int tid = inst->threadNumber; - // Place into ROB instList[tid].push_back(inst); //Set Up head iterator if this is the 1st instruction in the ROB @@ -228,10 +223,8 @@ ROB::insertInst(DynInstPtr &inst) tail = instList[tid].end(); tail--; - // Mark as set in ROB inst->setInROB(); - // Increment ROB count ++numInstsInROB; ++threadEntries[tid]; @@ -242,6 +235,7 @@ ROB::insertInst(DynInstPtr &inst) // Whatever calls this function needs to ensure that it properly frees up // registers prior to this function. +/* template void ROB::retireHead() @@ -249,7 +243,6 @@ ROB::retireHead() //assert(numInstsInROB == countInsts()); assert(numInstsInROB > 0); - // Get the head ROB instruction's TID. int tid = (*head)->threadNumber; retireHead(tid); @@ -258,6 +251,7 @@ ROB::retireHead() tail = instList[tid].end(); } } +*/ template void @@ -271,18 +265,15 @@ ROB::retireHead(unsigned tid) DynInstPtr head_inst = (*head_it); - // Make certain this can retire. assert(head_inst->readyToCommit()); DPRINTF(ROB, "[tid:%u]: Retiring head instruction, " "instruction PC %#x,[sn:%lli]\n", tid, head_inst->readPC(), head_inst->seqNum); - // Keep track of how many instructions are in the ROB. --numInstsInROB; --threadEntries[tid]; - //Mark DynInstFlags head_inst->removeInROB(); head_inst->setCommitted(); @@ -291,12 +282,12 @@ ROB::retireHead(unsigned tid) //Update "Global" Head of ROB updateHead(); - // A special case is needed if the instruction being retired is the - // only instruction in the ROB; otherwise the tail iterator will become - // invalidated. + // @todo: A special case is needed if the instruction being + // retired is the only instruction in the ROB; otherwise the tail + // iterator will become invalidated. 
cpu->removeFrontInst(head_inst); } - +/* template bool ROB::isHeadReady() @@ -307,7 +298,7 @@ ROB::isHeadReady() return false; } - +*/ template bool ROB::isHeadReady(unsigned tid) @@ -537,7 +528,7 @@ ROB::squash(InstSeqNum squash_num,unsigned tid) doSquash(tid); } } - +/* template typename Impl::DynInstPtr ROB::readHeadInst() @@ -549,7 +540,7 @@ ROB::readHeadInst() return dummyInst; } } - +*/ template typename Impl::DynInstPtr ROB::readHeadInst(unsigned tid) @@ -564,7 +555,7 @@ ROB::readHeadInst(unsigned tid) return dummyInst; } } - +/* template uint64_t ROB::readHeadPC() @@ -608,7 +599,6 @@ ROB::readHeadNextPC(unsigned tid) return (*head_thread)->readNextPC(); } - template InstSeqNum ROB::readHeadSeqNum() @@ -637,7 +627,7 @@ ROB::readTailInst() return (*tail); } - +*/ template typename Impl::DynInstPtr ROB::readTailInst(unsigned tid) @@ -650,7 +640,7 @@ ROB::readTailInst(unsigned tid) return *tail_thread; } - +/* template uint64_t ROB::readTailPC() @@ -698,4 +688,4 @@ ROB::readTailSeqNum(unsigned tid) return (*tail_thread)->seqNum; } - +*/ diff --git a/cpu/o3/scoreboard.cc b/cpu/o3/scoreboard.cc index 87b0aee94..b0e433620 100644 --- a/cpu/o3/scoreboard.cc +++ b/cpu/o3/scoreboard.cc @@ -99,6 +99,7 @@ Scoreboard::unsetReg(PhysRegIndex ready_reg) if (ready_reg == zeroRegIdx || ready_reg == (zeroRegIdx + numPhysicalIntRegs)) { // Don't do anything if int or fp zero reg. + return; } regScoreBoard[ready_reg] = 0; diff --git a/cpu/o3/store_set.cc b/cpu/o3/store_set.cc index a685646f3..0c957c8c7 100644 --- a/cpu/o3/store_set.cc +++ b/cpu/o3/store_set.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -278,11 +278,6 @@ StoreSet::issued(Addr issued_PC, InstSeqNum issued_seq_num, bool is_store) void StoreSet::squash(InstSeqNum squashed_num, unsigned tid) { - // Not really sure how to do this well. - // Generally this is small enough that it should be okay; short circuit - // evaluation should take care of invalid entries. - // Maybe keep a list of valid LFST's? Really ugly either way... - DPRINTF(StoreSet, "StoreSet: Squashing until inum %i\n", squashed_num); diff --git a/cpu/o3/thread_state.hh b/cpu/o3/thread_state.hh index 17719bdeb..2c9788e4b 100644 --- a/cpu/o3/thread_state.hh +++ b/cpu/o3/thread_state.hh @@ -1,3 +1,30 @@ +/* + * Copyright (c) 2006 The Regents of The University of Michigan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ #ifndef __CPU_O3_THREAD_STATE_HH__ #define __CPU_O3_THREAD_STATE_HH__ @@ -15,27 +42,17 @@ class EndQuiesceEvent; class FunctionProfile; class ProfileNode; #else -class Process; class FunctionalMemory; +class Process; #endif -// In the new CPU case this may be quite small...It depends on what I define -// ThreadState to be. Currently it's only the state that exists within -// ExecContext basically. Leaves the interface and manipulation up to the -// CPU. Not sure this is useful/flexible...probably can be if I can avoid -// including state here that parts of the pipeline can't modify directly, -// or at least don't let them. The only problem is for state that's needed -// per thread, per structure. I.e. rename table, memreqs. -// On the other hand, it might be nice to not have to pay the extra pointer -// lookup to get frequently used state such as a memreq (that isn't used much -// elsewhere)... - -// Maybe this ozone thread state should only really have committed state? -// I need to think about why I'm using this and what it's useful for. Clearly -// has benefits for SMT; basically serves same use as CPUExecContext. -// Makes the ExecContext proxy easier. Gives organization/central access point -// to state of a thread that can be accessed normally (i.e. not in-flight -// stuff within a OoO processor). Does this need an XC proxy within it? 
+/** + * Class that has various thread state, such as the status, the + * current instruction being processed, whether or not the thread has + * a trap pending or is being externally updated, the ExecContext + * proxy pointer, etc. It also handles anything related to a specific + * thread's process, such as syscalls and checking valid addresses. + */ template struct O3ThreadState : public ThreadState { typedef ExecContext::Status Status; @@ -43,7 +60,7 @@ struct O3ThreadState : public ThreadState { Status _status; - // Current instruction? + // Current instruction TheISA::MachInst inst; private: FullCPU *cpu; @@ -80,51 +97,11 @@ struct O3ThreadState : public ThreadState { void setStatus(Status new_status) { _status = new_status; } #if !FULL_SYSTEM - - Fault dummyTranslation(MemReqPtr &req) - { -#if 0 - assert((req->vaddr >> 48 & 0xffff) == 0); -#endif - - // put the asid in the upper 16 bits of the paddr - req->paddr = req->vaddr & ~((Addr)0xffff << sizeof(Addr) * 8 - 16); - req->paddr = req->paddr | (Addr)req->asid << sizeof(Addr) * 8 - 16; - return NoFault; - } - Fault translateInstReq(MemReqPtr &req) - { - return dummyTranslation(req); - } - Fault translateDataReadReq(MemReqPtr &req) - { - return dummyTranslation(req); - } - Fault translateDataWriteReq(MemReqPtr &req) - { - return dummyTranslation(req); - } - bool validInstAddr(Addr addr) { return process->validInstAddr(addr); } bool validDataAddr(Addr addr) { return process->validDataAddr(addr); } -#else - Fault translateInstReq(MemReqPtr &req) - { - return cpu->itb->translate(req); - } - - Fault translateDataReadReq(MemReqPtr &req) - { - return cpu->dtb->translate(req, false); - } - - Fault translateDataWriteReq(MemReqPtr &req) - { - return cpu->dtb->translate(req, true); - } #endif bool misspeculating() { return false; } From 5a7db55e9a88d204bb783b7b363faacaa1350ff3 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Sun, 21 May 2006 01:55:58 -0400 Subject: [PATCH 40/50] Threads start off in suspended status now 
(Korey's changes for SMT). --HG-- extra : convert_revision : ad726f9f258e1983d2af5057ff6e5f9d2a5dd072 --- kern/tru64/tru64.hh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kern/tru64/tru64.hh b/kern/tru64/tru64.hh index 112f00f31..b8adab8a8 100644 --- a/kern/tru64/tru64.hh +++ b/kern/tru64/tru64.hh @@ -1020,7 +1020,7 @@ class Tru64 { for (int i = 0; i < process->numCpus(); ++i) { ExecContext *xc = process->execContexts[i]; - if (xc->status() == ExecContext::Unallocated) { + if (xc->status() == ExecContext::Suspended) { // inactive context... grab it init_exec_context(xc, attrp, uniq_val); From 7df1412ccda9edba9a3274b6182a28853ced2a72 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Mon, 22 May 2006 16:01:25 -0400 Subject: [PATCH 41/50] Undo changes to instruction flags that has caused statistics to change in regressions. This temporarily will break the O3 and Ozone CPU models. Updates to fix them will be coming soon. arch/alpha/isa/decoder.isa: Undo changes to instruction flags that has caused statistics to change in regressions. --HG-- extra : convert_revision : c0fa9d55a22cae7c4f02d388870565b205d6fba3 --- arch/alpha/isa/decoder.isa | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/arch/alpha/isa/decoder.isa b/arch/alpha/isa/decoder.isa index b79286162..48ced0eff 100644 --- a/arch/alpha/isa/decoder.isa +++ b/arch/alpha/isa/decoder.isa @@ -73,9 +73,7 @@ decode OPCODE default Unknown::unknown() { uint64_t tmp = write_result; // see stq_c Ra = (tmp == 0 || tmp == 1) ? tmp : Ra; - }}, mem_flags = LOCKED, inst_flags = [IsNonSpeculative, - IsSerializing, - IsSerializeAfter]); + }}, mem_flags = LOCKED); 0x2f: stq_c({{ Mem.uq = Ra; }}, {{ uint64_t tmp = write_result; @@ -87,9 +85,7 @@ decode OPCODE default Unknown::unknown() { // mailbox access, and we don't update the // result register at all. Ra = (tmp == 0 || tmp == 1) ? 
tmp : Ra; - }}, mem_flags = LOCKED, inst_flags = [IsNonSpeculative, - IsSerializing, - IsSerializeAfter]); + }}, mem_flags = LOCKED); } format IntegerOperate { @@ -595,8 +591,8 @@ decode OPCODE default Unknown::unknown() { 0x02e: fcmovle({{ Fc = (Fa <= 0) ? Fb : Fc; }}); 0x02f: fcmovgt({{ Fc = (Fa > 0) ? Fb : Fc; }}); - 0x024: mt_fpcr({{ FPCR = Fa.uq; }}, IsSerializing, IsSerializeBefore); - 0x025: mf_fpcr({{ Fa.uq = FPCR; }}, IsSerializing, IsSerializeBefore); + 0x024: mt_fpcr({{ FPCR = Fa.uq; }}); + 0x025: mf_fpcr({{ Fa.uq = FPCR; }}); } } @@ -698,11 +694,11 @@ decode OPCODE default Unknown::unknown() { }}, IsNonSpeculative); 0x83: callsys({{ xc->syscall(); - }}, IsNonSpeculative, IsSerializeAfter); + }}, IsNonSpeculative); // Read uniq reg into ABI return value register (r0) - 0x9e: rduniq({{ R0 = Runiq; }}, IsSerializing, IsSerializeBefore); + 0x9e: rduniq({{ R0 = Runiq; }}); // Write uniq reg with value from ABI arg register (r16) - 0x9f: wruniq({{ Runiq = R16; }}, IsSerializing, IsSerializeBefore); + 0x9f: wruniq({{ Runiq = R16; }}); } } #endif @@ -739,7 +735,7 @@ decode OPCODE default Unknown::unknown() { format HwMoveIPR { 1: hw_mfpr({{ Ra = xc->readMiscRegWithEffect(ipr_index, fault); - }}, IsSerializing, IsSerializeBefore); + }}); } } @@ -749,7 +745,7 @@ decode OPCODE default Unknown::unknown() { 1: hw_mtpr({{ xc->setMiscRegWithEffect(ipr_index, Ra); if (traceData) { traceData->setData(Ra); } - }}, IsSerializing, IsSerializeBefore); + }}); } } From eeeee7c58f26fac9fe9b8606e26ef8e99a28e399 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Tue, 23 May 2006 14:38:16 -0400 Subject: [PATCH 42/50] Add extra flags to help new CPU handle various instructions. IsIprAccess flag may go away in the future (op class can be used to tell this), and the CPU still needs a specific way to identify/deal with syscalls. 
arch/alpha/isa/decoder.isa: Added a few extra flags to help the new CPU identify various classes of instructions without having to force certain behaviors for all CPUs. cpu/base_dyn_inst.hh: cpu/static_inst.hh: Added extra flags. cpu/o3/iew_impl.hh: cpu/o3/inst_queue_impl.hh: Handle store conditionals specially. cpu/o3/lsq_unit_impl.hh: Extra flags tells if the instruction is a store conditional. cpu/o3/rename_impl.hh: Handle IPR accesses and store conditionals specially. --HG-- extra : convert_revision : 39debec4fa5341ae8a8ab5650bd12730aeb6c04f --- arch/alpha/isa/decoder.isa | 16 ++++++++-------- cpu/base_dyn_inst.hh | 3 +++ cpu/o3/iew_impl.hh | 8 ++++---- cpu/o3/inst_queue_impl.hh | 1 + cpu/o3/lsq_unit_impl.hh | 7 +++---- cpu/o3/rename_impl.hh | 12 ++++++++++-- cpu/static_inst.hh | 8 ++++++-- 7 files changed, 35 insertions(+), 20 deletions(-) diff --git a/arch/alpha/isa/decoder.isa b/arch/alpha/isa/decoder.isa index 48ced0eff..b3744a43d 100644 --- a/arch/alpha/isa/decoder.isa +++ b/arch/alpha/isa/decoder.isa @@ -73,7 +73,7 @@ decode OPCODE default Unknown::unknown() { uint64_t tmp = write_result; // see stq_c Ra = (tmp == 0 || tmp == 1) ? tmp : Ra; - }}, mem_flags = LOCKED); + }}, mem_flags = LOCKED, inst_flags = IsStoreConditional); 0x2f: stq_c({{ Mem.uq = Ra; }}, {{ uint64_t tmp = write_result; @@ -85,7 +85,7 @@ decode OPCODE default Unknown::unknown() { // mailbox access, and we don't update the // result register at all. Ra = (tmp == 0 || tmp == 1) ? tmp : Ra; - }}, mem_flags = LOCKED); + }}, mem_flags = LOCKED, inst_flags = IsStoreConditional); } format IntegerOperate { @@ -591,8 +591,8 @@ decode OPCODE default Unknown::unknown() { 0x02e: fcmovle({{ Fc = (Fa <= 0) ? Fb : Fc; }}); 0x02f: fcmovgt({{ Fc = (Fa > 0) ? 
Fb : Fc; }}); - 0x024: mt_fpcr({{ FPCR = Fa.uq; }}); - 0x025: mf_fpcr({{ Fa.uq = FPCR; }}); + 0x024: mt_fpcr({{ FPCR = Fa.uq; }}, IsIprAccess); + 0x025: mf_fpcr({{ Fa.uq = FPCR; }}, IsIprAccess); } } @@ -696,9 +696,9 @@ decode OPCODE default Unknown::unknown() { xc->syscall(); }}, IsNonSpeculative); // Read uniq reg into ABI return value register (r0) - 0x9e: rduniq({{ R0 = Runiq; }}); + 0x9e: rduniq({{ R0 = Runiq; }}, IsIprAccess); // Write uniq reg with value from ABI arg register (r16) - 0x9f: wruniq({{ Runiq = R16; }}); + 0x9f: wruniq({{ Runiq = R16; }}, IsIprAccess); } } #endif @@ -735,7 +735,7 @@ decode OPCODE default Unknown::unknown() { format HwMoveIPR { 1: hw_mfpr({{ Ra = xc->readMiscRegWithEffect(ipr_index, fault); - }}); + }}, IsIprAccess); } } @@ -745,7 +745,7 @@ decode OPCODE default Unknown::unknown() { 1: hw_mtpr({{ xc->setMiscRegWithEffect(ipr_index, Ra); if (traceData) { traceData->setData(Ra); } - }}); + }}, IsIprAccess); } } diff --git a/cpu/base_dyn_inst.hh b/cpu/base_dyn_inst.hh index cd754dc3c..9403faec3 100644 --- a/cpu/base_dyn_inst.hh +++ b/cpu/base_dyn_inst.hh @@ -334,6 +334,8 @@ class BaseDynInst : public FastAlloc, public RefCounted bool isMemRef() const { return staticInst->isMemRef(); } bool isLoad() const { return staticInst->isLoad(); } bool isStore() const { return staticInst->isStore(); } + bool isStoreConditional() const + { return staticInst->isStoreConditional(); } bool isInstPrefetch() const { return staticInst->isInstPrefetch(); } bool isDataPrefetch() const { return staticInst->isDataPrefetch(); } bool isCopy() const { return staticInst->isCopy(); } @@ -356,6 +358,7 @@ class BaseDynInst : public FastAlloc, public RefCounted bool isWriteBarrier() const { return staticInst->isWriteBarrier(); } bool isNonSpeculative() const { return staticInst->isNonSpeculative(); } bool isQuiesce() const { return staticInst->isQuiesce(); } + bool isIprAccess() const { return staticInst->isIprAccess(); } bool isUnverifiable() const { return 
staticInst->isUnverifiable(); } /** Temporarily sets this instruction as a serialize before instruction. */ diff --git a/cpu/o3/iew_impl.hh b/cpu/o3/iew_impl.hh index 59f4055a6..cf28f2efc 100644 --- a/cpu/o3/iew_impl.hh +++ b/cpu/o3/iew_impl.hh @@ -1100,10 +1100,10 @@ DefaultIEW::dispatchInsts(unsigned tid) ++iewDispStoreInsts; - if (inst->isNonSpeculative()) { - // Non-speculative stores (namely store conditionals) - // need to be set as "canCommit()" so that commit can - // process them when they reach the head of commit. + if (inst->isStoreConditional()) { + // Store conditionals need to be set as "canCommit()" + // so that commit can process them when they reach the + // head of commit. inst->setCanCommit(); instQueue.insertNonSpec(inst); add_to_iq = false; diff --git a/cpu/o3/inst_queue_impl.hh b/cpu/o3/inst_queue_impl.hh index ed57ac257..71541b4f8 100644 --- a/cpu/o3/inst_queue_impl.hh +++ b/cpu/o3/inst_queue_impl.hh @@ -1041,6 +1041,7 @@ InstructionQueue::doSquash(unsigned tid) // Remove the instruction from the dependency list. if (!squashed_inst->isNonSpeculative() && + !squashed_inst->isStoreConditional() && !squashed_inst->isMemBarrier() && !squashed_inst->isWriteBarrier()) { diff --git a/cpu/o3/lsq_unit_impl.hh b/cpu/o3/lsq_unit_impl.hh index f0b4405ed..7974ddaad 100644 --- a/cpu/o3/lsq_unit_impl.hh +++ b/cpu/o3/lsq_unit_impl.hh @@ -424,10 +424,9 @@ LSQUnit::executeStore(DynInstPtr &store_inst) assert(store_fault == NoFault); - if (store_inst->isNonSpeculative()) { - // Nonspeculative accesses (namely store conditionals) - // need to set themselves as able to writeback if we - // haven't had a fault by here. + if (store_inst->isStoreConditional()) { + // Store conditionals need to set themselves as able to + // writeback if we haven't had a fault by here. 
storeQueue[store_idx].canWB = true; ++storesToWB; diff --git a/cpu/o3/rename_impl.hh b/cpu/o3/rename_impl.hh index 081581c92..b4f1077d1 100644 --- a/cpu/o3/rename_impl.hh +++ b/cpu/o3/rename_impl.hh @@ -594,7 +594,14 @@ DefaultRename::renameInsts(unsigned tid) // serializeAfter marks the next instruction as serializeBefore. // serializeBefore makes the instruction wait in rename until the ROB // is empty. - if (inst->isSerializeBefore() && !inst->isSerializeHandled()) { + + // In this model, IPR accesses are serialize before + // instructions, and store conditionals are serialize after + // instructions. This is mainly due to lack of support for + // out-of-order operations of either of those classes of + // instructions. + if ((inst->isIprAccess() || inst->isSerializeBefore()) && + !inst->isSerializeHandled()) { DPRINTF(Rename, "Serialize before instruction encountered.\n"); if (!inst->isTempSerializeBefore()) { @@ -613,7 +620,8 @@ DefaultRename::renameInsts(unsigned tid) blockThisCycle = true; break; - } else if (inst->isSerializeAfter() && !inst->isSerializeHandled()) { + } else if ((inst->isStoreConditional() || inst->isSerializeAfter()) && + !inst->isSerializeHandled()) { DPRINTF(Rename, "Serialize after instruction encountered.\n"); renamedSerializing++; diff --git a/cpu/static_inst.hh b/cpu/static_inst.hh index 0b8fe2f18..b9d782b7b 100644 --- a/cpu/static_inst.hh +++ b/cpu/static_inst.hh @@ -103,6 +103,7 @@ class StaticInstBase : public RefCounted IsMemRef, ///< References memory (load, store, or prefetch). IsLoad, ///< Reads from memory (load or prefetch). IsStore, ///< Writes to memory. + IsStoreConditional, ///< Store conditional instruction. IsInstPrefetch, ///< Instruction-cache prefetch. IsDataPrefetch, ///< Data-cache prefetch. 
IsCopy, ///< Fast Cache block copy @@ -127,9 +128,10 @@ class StaticInstBase : public RefCounted IsWriteBarrier, ///< Is a write barrier IsNonSpeculative, ///< Should not be executed speculatively - IsQuiesce, + IsQuiesce, ///< Is a quiesce instruction - IsUnverifiable, + IsIprAccess, ///< Accesses IPRs + IsUnverifiable, ///< Can't be verified by a checker NumFlags }; @@ -193,6 +195,7 @@ class StaticInstBase : public RefCounted bool isMemRef() const { return flags[IsMemRef]; } bool isLoad() const { return flags[IsLoad]; } bool isStore() const { return flags[IsStore]; } + bool isStoreConditional() const { return flags[IsStoreConditional]; } bool isInstPrefetch() const { return flags[IsInstPrefetch]; } bool isDataPrefetch() const { return flags[IsDataPrefetch]; } bool isCopy() const { return flags[IsCopy];} @@ -218,6 +221,7 @@ class StaticInstBase : public RefCounted bool isWriteBarrier() const { return flags[IsWriteBarrier]; } bool isNonSpeculative() const { return flags[IsNonSpeculative]; } bool isQuiesce() const { return flags[IsQuiesce]; } + bool isIprAccess() const { return flags[IsIprAccess]; } bool isUnverifiable() const { return flags[IsUnverifiable]; } //@} From ff3d16ca1f7d83ce7932868d2bf1cb3e526562ea Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Tue, 23 May 2006 16:51:16 -0400 Subject: [PATCH 43/50] Move kernel stats out of CPU and into XC. arch/alpha/ev5.cc: Move kernel stats out of CPU and into XC. Also be sure to check if the kernel stats exist prior to using them. 
--HG-- extra : convert_revision : 565cd7026410fd7d8586f953d9b328c2e67a9473 --- arch/alpha/ev5.cc | 22 ++-- cpu/base.cc | 23 ---- cpu/base.hh | 10 +- cpu/checker/exec_context.hh | 44 +++++++- cpu/cpu_exec_context.cc | 9 +- cpu/cpu_exec_context.hh | 7 ++ cpu/exec_context.hh | 7 ++ cpu/o3/alpha_cpu.hh | 11 +- cpu/o3/alpha_cpu_impl.hh | 62 +++++++---- cpu/ozone/cpu.hh | 28 ++--- cpu/ozone/cpu_impl.hh | 207 ++++++------------------------------ cpu/simple/cpu.cc | 2 +- cpu/thread_state.hh | 39 ++++++- kern/system_events.cc | 11 +- sim/pseudo_inst.cc | 15 ++- 15 files changed, 221 insertions(+), 276 deletions(-) diff --git a/arch/alpha/ev5.cc b/arch/alpha/ev5.cc index 019e83dd4..ad3a9ec4c 100644 --- a/arch/alpha/ev5.cc +++ b/arch/alpha/ev5.cc @@ -146,7 +146,7 @@ CPUExecContext::hwrei() setNextPC(readMiscReg(AlphaISA::IPR_EXC_ADDR)); if (!misspeculating()) { - cpu->kernelStats->hwrei(); + kernelStats->hwrei(); cpu->checkInterrupts = true; } @@ -336,7 +336,8 @@ AlphaISA::MiscRegFile::setIpr(int idx, uint64_t val, ExecContext *xc) // write entire quad w/ no side-effect old = ipr[idx]; ipr[idx] = val; - xc->getCpuPtr()->kernelStats->context(old, val, xc); + if (xc->getKernelStats()) + xc->getKernelStats()->context(old, val, xc); break; case AlphaISA::IPR_DTB_PTE: @@ -363,14 +364,19 @@ AlphaISA::MiscRegFile::setIpr(int idx, uint64_t val, ExecContext *xc) // only write least significant five bits - interrupt level ipr[idx] = val & 0x1f; - xc->getCpuPtr()->kernelStats->swpipl(ipr[idx]); + if (xc->getKernelStats()) + xc->getKernelStats()->swpipl(ipr[idx]); break; case AlphaISA::IPR_DTB_CM: - if (val & 0x18) - xc->getCpuPtr()->kernelStats->mode(Kernel::user, xc); - else - xc->getCpuPtr()->kernelStats->mode(Kernel::kernel, xc); + if (val & 0x18) { + if (xc->getKernelStats()) + xc->getKernelStats()->mode(Kernel::user, xc); + } else { + if (xc->getKernelStats()) + xc->getKernelStats()->mode(Kernel::kernel, xc); + } + case AlphaISA::IPR_ICM: // only write two mode bits - processor 
mode @@ -556,7 +562,7 @@ AlphaISA::MiscRegFile::copyIprs(ExecContext *xc) bool CPUExecContext::simPalCheck(int palFunc) { - cpu->kernelStats->callpal(palFunc, proxy); + kernelStats->callpal(palFunc, proxy); switch (palFunc) { case PAL::halt: diff --git a/cpu/base.cc b/cpu/base.cc index 74b679d5d..de03b9eab 100644 --- a/cpu/base.cc +++ b/cpu/base.cc @@ -45,10 +45,6 @@ #include "base/trace.hh" -#if FULL_SYSTEM -#include "kern/kernel_stats.hh" -#endif - using namespace std; vector BaseCPU::cpuList; @@ -153,8 +149,6 @@ BaseCPU::BaseCPU(Params *p) profileEvent = NULL; if (params->profile) profileEvent = new ProfileEvent(this, params->profile); - - kernelStats = new Kernel::Statistics(system); #endif } @@ -175,10 +169,6 @@ BaseCPU::enableFunctionTrace() BaseCPU::~BaseCPU() { -#if FULL_SYSTEM - if (kernelStats) - delete kernelStats; -#endif } void @@ -219,8 +209,6 @@ BaseCPU::regStats() execContexts[0]->regStats(name()); #if FULL_SYSTEM - if (kernelStats) - kernelStats->regStats(name() + ".kern"); #endif } @@ -348,12 +336,6 @@ BaseCPU::serialize(std::ostream &os) { SERIALIZE_ARRAY(interrupts, TheISA::NumInterruptLevels); SERIALIZE_SCALAR(intstatus); - -#if FULL_SYSTEM - if (kernelStats) - kernelStats->serialize(os); -#endif - } void @@ -361,11 +343,6 @@ BaseCPU::unserialize(Checkpoint *cp, const std::string §ion) { UNSERIALIZE_ARRAY(interrupts, TheISA::NumInterruptLevels); UNSERIALIZE_SCALAR(intstatus); - -#if FULL_SYSTEM - if (kernelStats) - kernelStats->unserialize(cp, section); -#endif } #endif // FULL_SYSTEM diff --git a/cpu/base.hh b/cpu/base.hh index 20166d7ee..dd776859d 100644 --- a/cpu/base.hh +++ b/cpu/base.hh @@ -38,14 +38,10 @@ #include "sim/sim_object.hh" #include "arch/isa_traits.hh" -#if FULL_SYSTEM -class System; -namespace Kernel { class Statistics; } -#endif - class BranchPred; class CheckerCPU; class ExecContext; +class System; class BaseCPU : public SimObject { @@ -237,10 +233,6 @@ class BaseCPU : public SimObject public: // Number of CPU cycles 
simulated Stats::Scalar<> numCycles; - -#if FULL_SYSTEM - Kernel::Statistics *kernelStats; -#endif }; #endif // __CPU_BASE_HH__ diff --git a/cpu/checker/exec_context.hh b/cpu/checker/exec_context.hh index 4843d1cf0..38784867d 100644 --- a/cpu/checker/exec_context.hh +++ b/cpu/checker/exec_context.hh @@ -1,3 +1,31 @@ +/* + * Copyright (c) 2006 The Regents of The University of Michigan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + #ifndef __CPU_CHECKER_EXEC_CONTEXT_HH__ #define __CPU_CHECKER_EXEC_CONTEXT_HH__ @@ -6,6 +34,9 @@ #include "cpu/exec_context.hh" class EndQuiesceEvent; +namespace Kernel { + class Statistics; +}; template class CheckerExecContext : public ExecContext @@ -13,7 +44,8 @@ class CheckerExecContext : public ExecContext public: CheckerExecContext(XC *actual_xc, CheckerCPU *checker_cpu) - : actualXC(actual_xc), checkerXC(checker_cpu->cpuXC), checkerCPU(checker_cpu) + : actualXC(actual_xc), checkerXC(checker_cpu->cpuXC), + checkerCPU(checker_cpu) { } private: @@ -43,6 +75,8 @@ class CheckerExecContext : public ExecContext AlphaITB *getITBPtr() { return actualXC->getITBPtr(); } AlphaDTB *getDTBPtr() { return actualXC->getDTBPtr(); } + + Kernel::Statistics *getKernelStats() { return actualXC->getKernelStats(); } #else Process *getProcessPtr() { return actualXC->getProcessPtr(); } #endif @@ -50,8 +84,10 @@ class CheckerExecContext : public ExecContext Status status() const { return actualXC->status(); } void setStatus(Status new_status) - { actualXC->setStatus(new_status); - checkerXC->setStatus(new_status); } + { + actualXC->setStatus(new_status); + checkerXC->setStatus(new_status); + } /// Set the status to Active. Optional delay indicates number of /// cycles to wait before beginning execution. 
@@ -216,8 +252,6 @@ class CheckerExecContext : public ExecContext actualXC->setSyscallReturn(return_value); } -// void syscall() { actualXC->syscall(); } - Counter readFuncExeInst() { return actualXC->readFuncExeInst(); } #endif }; diff --git a/cpu/cpu_exec_context.cc b/cpu/cpu_exec_context.cc index 24de6d450..78ce058e8 100644 --- a/cpu/cpu_exec_context.cc +++ b/cpu/cpu_exec_context.cc @@ -188,7 +188,8 @@ CPUExecContext::serialize(ostream &os) if (quiesceEvent->scheduled()) quiesceEndTick = quiesceEvent->when(); SERIALIZE_SCALAR(quiesceEndTick); - + if (kernelStats) + kernelStats->serialize(os); #endif } @@ -207,6 +208,8 @@ CPUExecContext::unserialize(Checkpoint *cp, const std::string §ion) UNSERIALIZE_SCALAR(quiesceEndTick); if (quiesceEndTick) quiesceEvent->schedule(quiesceEndTick); + if (kernelStats) + kernelStats->unserialize(cp, section); #endif } @@ -275,6 +278,10 @@ CPUExecContext::halt() void CPUExecContext::regStats(const string &name) { +#if FULL_SYSTEM + kernelStats = new Kernel::Statistics(system); + kernelStats->regStats(name + ".kern"); +#endif } void diff --git a/cpu/cpu_exec_context.hh b/cpu/cpu_exec_context.hh index cac006925..3d1428933 100644 --- a/cpu/cpu_exec_context.hh +++ b/cpu/cpu_exec_context.hh @@ -53,6 +53,10 @@ class FunctionProfile; class ProfileNode; class MemoryController; +namespace Kernel { + class Statistics; +}; + #else // !FULL_SYSTEM #include "sim/process.hh" @@ -147,6 +151,9 @@ class CPUExecContext void profileSample(); + Kernel::Statistics *getKernelStats() { return kernelStats; } + + Kernel::Statistics *kernelStats; #else Process *process; diff --git a/cpu/exec_context.hh b/cpu/exec_context.hh index 7bd7d5682..e1f1016e5 100644 --- a/cpu/exec_context.hh +++ b/cpu/exec_context.hh @@ -48,6 +48,9 @@ class FunctionalMemory; class PhysicalMemory; class Process; class System; +namespace Kernel { + class Statistics; +}; class ExecContext { @@ -98,6 +101,8 @@ class ExecContext virtual AlphaITB *getITBPtr() = 0; virtual AlphaDTB * 
getDTBPtr() = 0; + + virtual Kernel::Statistics *getKernelStats() = 0; #else virtual Process *getProcessPtr() = 0; #endif @@ -243,6 +248,8 @@ class ProxyExecContext : public ExecContext AlphaITB *getITBPtr() { return actualXC->getITBPtr(); } AlphaDTB *getDTBPtr() { return actualXC->getDTBPtr(); } + + Kernel::Statistics *getKernelStats() { return actualXC->getKernelStats(); } #else Process *getProcessPtr() { return actualXC->getProcessPtr(); } #endif diff --git a/cpu/o3/alpha_cpu.hh b/cpu/o3/alpha_cpu.hh index 78ad5f7d8..5c89e3462 100644 --- a/cpu/o3/alpha_cpu.hh +++ b/cpu/o3/alpha_cpu.hh @@ -35,6 +35,9 @@ #include "sim/byteswap.hh" class EndQuiesceEvent; +namespace Kernel { + class Statistics; +}; template class AlphaFullCPU : public FullO3CPU @@ -60,11 +63,6 @@ class AlphaFullCPU : public FullO3CPU O3ThreadState *thread; - Tick lastActivate; - Tick lastSuspend; - - EndQuiesceEvent *quiesceEvent; - virtual BaseCPU *getCpuPtr() { return cpu; } virtual void setCpuId(int id) { cpu->cpu_id = id; } @@ -81,6 +79,9 @@ class AlphaFullCPU : public FullO3CPU virtual AlphaITB *getITBPtr() { return cpu->itb; } virtual AlphaDTB * getDTBPtr() { return cpu->dtb; } + + virtual Kernel::Statistics *getKernelStats() + { return thread->kernelStats; } #else virtual Process *getProcessPtr() { return thread->process; } #endif diff --git a/cpu/o3/alpha_cpu_impl.hh b/cpu/o3/alpha_cpu_impl.hh index 58b2b3548..91cd3d9e6 100644 --- a/cpu/o3/alpha_cpu_impl.hh +++ b/cpu/o3/alpha_cpu_impl.hh @@ -31,7 +31,6 @@ #include "base/statistics.hh" #include "base/timebuf.hh" #include "cpu/checker/exec_context.hh" -#include "cpu/quiesce_event.hh" #include "mem/mem_interface.hh" #include "sim/sim_events.hh" #include "sim/stats.hh" @@ -44,6 +43,8 @@ #if FULL_SYSTEM #include "arch/alpha/osfpal.hh" #include "arch/isa_traits.hh" +#include "cpu/quiesce_event.hh" +#include "kern/kernel_stats.hh" #endif using namespace TheISA; @@ -101,11 +102,12 @@ AlphaFullCPU::AlphaFullCPU(Params *params) alpha_xc_proxy->cpu = 
this; alpha_xc_proxy->thread = this->thread[i]; - alpha_xc_proxy->quiesceEvent = +#if FULL_SYSTEM + this->thread[i]->quiesceEvent = new EndQuiesceEvent(xc_proxy); - alpha_xc_proxy->lastActivate = 0; - alpha_xc_proxy->lastSuspend = 0; - + this->thread[i]->lastActivate = 0; + this->thread[i]->lastSuspend = 0; +#endif this->thread[i]->xcProxy = xc_proxy; this->execContexts.push_back(xc_proxy); @@ -181,6 +183,9 @@ AlphaFullCPU::AlphaXC::takeOverFrom(ExecContext *old_context) if (thread->quiesceEvent) { thread->quiesceEvent->xc = this; } + + // Transfer kernel stats from one CPU to the other. + thread->kernelStats = old_context->getKernelStats(); // storeCondFailures = 0; cpu->lockFlag = false; #endif @@ -200,7 +205,9 @@ AlphaFullCPU::AlphaXC::activate(int delay) if (thread->status() == ExecContext::Active) return; - lastActivate = curTick; +#if FULL_SYSTEM + thread->lastActivate = curTick; +#endif if (thread->status() == ExecContext::Unallocated) { cpu->activateWhenReady(thread->tid); @@ -222,8 +229,10 @@ AlphaFullCPU::AlphaXC::suspend() if (thread->status() == ExecContext::Suspended) return; - lastActivate = curTick; - lastSuspend = curTick; +#if FULL_SYSTEM + thread->lastActivate = curTick; + thread->lastSuspend = curTick; +#endif /* #if FULL_SYSTEM // Don't change the status from active if there are pending interrupts @@ -266,38 +275,55 @@ AlphaFullCPU::AlphaXC::halt() template void AlphaFullCPU::AlphaXC::regStats(const std::string &name) -{} +{ +#if FULL_SYSTEM + thread->kernelStats = new Kernel::Statistics(cpu->system); + thread->kernelStats->regStats(name + ".kern"); +#endif +} template void AlphaFullCPU::AlphaXC::serialize(std::ostream &os) -{} +{ +#if FULL_SYSTEM + if (thread->kernelStats) + thread->kernelStats->serialize(os); +#endif + +} template void AlphaFullCPU::AlphaXC::unserialize(Checkpoint *cp, const std::string §ion) -{} +{ +#if FULL_SYSTEM + if (thread->kernelStats) + thread->kernelStats->unserialize(cp, section); +#endif + +} #if FULL_SYSTEM 
template EndQuiesceEvent * AlphaFullCPU::AlphaXC::getQuiesceEvent() { - return quiesceEvent; + return thread->quiesceEvent; } template Tick AlphaFullCPU::AlphaXC::readLastActivate() { - return lastActivate; + return thread->lastActivate; } template Tick AlphaFullCPU::AlphaXC::readLastSuspend() { - return lastSuspend; + return thread->lastSuspend; } template @@ -595,7 +621,7 @@ AlphaFullCPU::hwrei(unsigned tid) // Need to clear the lock flag upon returning from an interrupt. this->lockFlag = false; - this->kernelStats->hwrei(); + this->thread[tid]->kernelStats->hwrei(); this->checkInterrupts = true; @@ -607,9 +633,9 @@ template bool AlphaFullCPU::simPalCheck(int palFunc, unsigned tid) { - if (this->kernelStats) - this->kernelStats->callpal(palFunc, - this->execContexts[tid]); + if (this->thread[tid]->kernelStats) + this->thread[tid]->kernelStats->callpal(palFunc, + this->execContexts[tid]); switch (palFunc) { case PAL::halt: diff --git a/cpu/ozone/cpu.hh b/cpu/ozone/cpu.hh index 7e12e75e5..5af2b02b2 100644 --- a/cpu/ozone/cpu.hh +++ b/cpu/ozone/cpu.hh @@ -57,6 +57,10 @@ class Sampler; class RemoteGDB; class GDBListener; +namespace Kernel { + class Statistics; +}; + #else class Process; @@ -116,6 +120,8 @@ class OzoneCPU : public BaseCPU AlphaITB *getITBPtr() { return cpu->itb; } AlphaDTB * getDTBPtr() { return cpu->dtb; } + + Kernel::Statistics *getKernelStats() { return thread->kernelStats; } #else Process *getProcessPtr() { return thread->process; } #endif @@ -238,14 +244,7 @@ class OzoneCPU : public BaseCPU private: OzoneThreadState thread; -/* - // Squash event for when the XC needs to squash all inflight instructions. 
- struct XCSquashEvent : public Event - { - void process(); - const char *description(); - }; -*/ + public: // main simulation loop (one cycle) void tick(); @@ -288,7 +287,6 @@ class OzoneCPU : public BaseCPU void trace_data(T data); public: - // enum Status { Running, Idle, @@ -325,8 +323,6 @@ class OzoneCPU : public BaseCPU int readCpuId() { return cpuId; } -// FunctionalMemory *getMemPtr() { return mem; } - int cpuId; void switchOut(Sampler *sampler); @@ -369,8 +365,6 @@ class OzoneCPU : public BaseCPU Status status() const { return _status; } void setStatus(Status new_status) { _status = new_status; } - // Not sure what an activate() call on the CPU's proxy XC would mean... - virtual void activateContext(int thread_num, int delay); virtual void suspendContext(int thread_num); virtual void deallocateContext(int thread_num); @@ -384,7 +378,6 @@ class OzoneCPU : public BaseCPU public: Counter numInst; Counter startNumInst; -// Stats::Scalar<> numInsts; virtual Counter totalInstructions() const { @@ -392,9 +385,6 @@ class OzoneCPU : public BaseCPU } private: - // number of simulated memory references -// Stats::Scalar<> numMemRefs; - // number of simulated loads Counter numLoad; Counter startNumLoad; @@ -472,7 +462,6 @@ class OzoneCPU : public BaseCPU template Fault read(MemReqPtr &req, T &data) { -// panic("CPU READ NOT IMPLEMENTED W/NEW MEMORY\n"); #if 0 #if FULL_SYSTEM && defined(TARGET_ALPHA) if (req->flags & LOCKED) { @@ -483,7 +472,6 @@ class OzoneCPU : public BaseCPU #endif Fault error; if (req->flags & LOCKED) { -// lockAddr = req->paddr; lockAddrList.insert(req->paddr); lockFlag = true; } @@ -558,7 +546,7 @@ class OzoneCPU : public BaseCPU if (req->flags & UNCACHEABLE) { req->result = 2; } else { - if (this->lockFlag/* && this->lockAddr == req->paddr*/) { + if (this->lockFlag) { if (lockAddrList.find(req->paddr) != lockAddrList.end()) { req->result = 1; diff --git a/cpu/ozone/cpu_impl.hh b/cpu/ozone/cpu_impl.hh index 031b4b145..5675da3a8 100644 --- 
a/cpu/ozone/cpu_impl.hh +++ b/cpu/ozone/cpu_impl.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005 The Regents of The University of Michigan + * Copyright (c) 2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include -#include +//#include +//#include #include "arch/isa_traits.hh" // For MachInst #include "base/trace.hh" @@ -39,7 +39,7 @@ #include "cpu/ozone/cpu.hh" #include "cpu/quiesce_event.hh" #include "cpu/static_inst.hh" -#include "mem/base_mem.hh" +//#include "mem/base_mem.hh" #include "mem/mem_interface.hh" #include "sim/sim_object.hh" #include "sim/stats.hh" @@ -50,7 +50,7 @@ #include "arch/alpha/tlb.hh" #include "arch/vtophys.hh" #include "base/callback.hh" -#include "base/remote_gdb.hh" +//#include "base/remote_gdb.hh" #include "cpu/profile.hh" #include "kern/kernel_stats.hh" #include "mem/functional/memory_control.hh" @@ -94,80 +94,26 @@ OzoneCPU::TickEvent::description() { return "OzoneCPU tick event"; } -/* -template -OzoneCPU::ICacheCompletionEvent::ICacheCompletionEvent(OzoneCPU *_cpu) - : Event(&mainEventQueue), - cpu(_cpu) -{ -} -template -void -OzoneCPU::ICacheCompletionEvent::process() -{ - cpu->processICacheCompletion(); -} - -template -const char * -OzoneCPU::ICacheCompletionEvent::description() -{ - return "OzoneCPU I-cache completion event"; -} - -template -OzoneCPU::DCacheCompletionEvent:: -DCacheCompletionEvent(OzoneCPU *_cpu, - DynInstPtr &_inst, - DCacheCompEventIt &_dcceIt) - : Event(&mainEventQueue), - cpu(_cpu), - inst(_inst), - dcceIt(_dcceIt) -{ - this->setFlags(Event::AutoDelete); -} - -template -void -OzoneCPU::DCacheCompletionEvent::process() -{ - inst->setCompleted(); - - // Maybe remove the EA from the list of addrs? 
- cpu->eaList.clearAddr(inst->seqNum, inst->getEA()); - cpu->dCacheCompList.erase(this->dcceIt); -} - -template -const char * -OzoneCPU::DCacheCompletionEvent::description() -{ - return "OzoneCPU D-cache completion event"; -} -*/ template OzoneCPU::OzoneCPU(Params *p) #if FULL_SYSTEM - : BaseCPU(p), thread(this, 0, p->mem), tickEvent(this, p->width), mem(p->mem), + : BaseCPU(p), thread(this, 0, p->mem), tickEvent(this, p->width), + mem(p->mem), #else : BaseCPU(p), thread(this, 0, p->workload[0], 0), tickEvent(this, p->width), mem(p->workload[0]->getMemory()), #endif comm(5, 5) { - if (p->checker) { - BaseCPU *temp_checker = p->checker; - checker = dynamic_cast *>(temp_checker); - } else { - checker = NULL; - } frontEnd = new FrontEnd(p); backEnd = new BackEnd(p); _status = Idle; - if (checker) { + + if (p->checker) { + BaseCPU *temp_checker = p->checker; + checker = dynamic_cast *>(temp_checker); checker->setMemory(mem); #if FULL_SYSTEM checker->setSystem(p->system); @@ -176,19 +122,18 @@ OzoneCPU::OzoneCPU(Params *p) thread.xcProxy = checkerXC; xcProxy = checkerXC; } else { + checker = NULL; thread.xcProxy = &ozoneXC; xcProxy = &ozoneXC; } - thread.inSyscall = false; - ozoneXC.cpu = this; ozoneXC.thread = &thread; + thread.inSyscall = false; + thread.setStatus(ExecContext::Suspended); #if FULL_SYSTEM -// xc = new ExecContext(this, 0, p->system, p->itb, p->dtb, p->mem); - /***** All thread state stuff *****/ thread.cpu = this; thread.tid = 0; @@ -217,31 +162,15 @@ OzoneCPU::OzoneCPU(Params *p) thread.profileNode = &dummyNode; thread.profilePC = 3; #else -// xc = new ExecContext(this, /* thread_num */ 0, p->workload[0], /* asid */ 0); thread.cpu = this; thread.tid = 0; thread.process = p->workload[0]; -// thread.mem = thread.process->getMemory(); thread.asid = 0; #endif // !FULL_SYSTEM -/* - icacheInterface = p->icache_interface; - dcacheInterface = p->dcache_interface; - cacheMemReq = new MemReq(); - cacheMemReq->xc = xc; - cacheMemReq->asid = 0; - 
cacheMemReq->data = new uint8_t[64]; -*/ numInst = 0; startNumInst = 0; -/* numLoad = 0; - startNumLoad = 0; - lastIcacheStall = 0; - lastDcacheStall = 0; - issueWidth = p->issueWidth; -*/ execContexts.push_back(xcProxy); frontEnd->setCPU(this); @@ -286,47 +215,7 @@ template OzoneCPU::~OzoneCPU() { } -/* -template -void -OzoneCPU::copyFromXC() -{ - for (int i = 0; i < TheISA::TotalNumRegs; ++i) { - if (i < TheISA::NumIntRegs) { - renameTable[i]->setIntResult(xc->readIntReg(i)); - } else if (i < TheISA::NumFloatRegs) { - renameTable[i]->setDoubleResult(xc->readFloatRegDouble(i)); - } - } - DPRINTF(OzoneCPU, "Func Exe inst is: %i\n", xc->func_exe_inst); - backEnd->funcExeInst = xc->func_exe_inst; -// PC = xc->readPC(); -// nextPC = xc->regs.npc; -} - -template -void -OzoneCPU::copyToXC() -{ - for (int i = 0; i < TheISA::TotalNumRegs; ++i) { - if (i < TheISA::NumIntRegs) { - xc->setIntReg(i, renameTable[i]->readIntResult()); - } else if (i < TheISA::NumFloatRegs) { - xc->setFloatRegDouble(i, renameTable[i]->readDoubleResult()); - } - } - - this->xc->regs.miscRegs.fpcr = this->regFile.miscRegs[tid].fpcr; - this->xc->regs.miscRegs.uniq = this->regFile.miscRegs[tid].uniq; - this->xc->regs.miscRegs.lock_flag = this->regFile.miscRegs[tid].lock_flag; - this->xc->regs.miscRegs.lock_addr = this->regFile.miscRegs[tid].lock_addr; - - xc->func_exe_inst = backEnd->funcExeInst; - xc->regs.pc = PC; - xc->regs.npc = nextPC; -} -*/ template void OzoneCPU::switchOut(Sampler *_sampler) @@ -394,7 +283,6 @@ OzoneCPU::activateContext(int thread_num, int delay) { // Eventually change this in SMT. assert(thread_num == 0); -// assert(xcProxy); assert(_status == Idle); notIdleFraction++; @@ -410,8 +298,8 @@ OzoneCPU::suspendContext(int thread_num) { // Eventually change this in SMT. assert(thread_num == 0); -// assert(xcProxy); - // @todo: Figure out how to initially set the status properly so this is running. 
+ // @todo: Figure out how to initially set the status properly so + // this is running. // assert(_status == Running); notIdleFraction--; unscheduleTickEvent(); @@ -486,14 +374,7 @@ void OzoneCPU::init() { BaseCPU::init(); -/* - copyFromXC(); - // ALso copy over PC/nextPC. This isn't normally copied in "copyFromXC()" - // so that the XC doesn't mess up the PC when returning from a syscall. - PC = xc->readPC(); - nextPC = xc->regs.npc; -*/ // Mark this as in syscall so it won't need to squash thread.inSyscall = true; #if FULL_SYSTEM @@ -514,8 +395,6 @@ template void OzoneCPU::serialize(std::ostream &os) { - // At this point, all DCacheCompEvents should be processed. - BaseCPU::serialize(os); SERIALIZE_ENUM(_status); nameOut(os, csprintf("%s.xc", name())); @@ -631,31 +510,7 @@ OzoneCPU::dbg_vtophys(Addr addr) return vtophys(xcProxy, addr); } #endif // FULL_SYSTEM -/* -template -void -OzoneCPU::processICacheCompletion() -{ - switch (status()) { - case IcacheMiss: - DPRINTF(OzoneCPU, "OzoneCPU: Finished Icache miss.\n"); - icacheStallCycles += curTick - lastIcacheStall; - _status = IcacheMissComplete; - cacheBlkValid = true; -// scheduleTickEvent(1); - break; - case SwitchedOut: - // If this CPU has been switched out due to sampling/warm-up, - // ignore any further status changes (e.g., due to cache - // misses outstanding at the time of the switch). - return; - default: - panic("OzoneCPU::processICacheCompletion: bad state"); - break; - } -} -*/ #if FULL_SYSTEM template void @@ -663,7 +518,6 @@ OzoneCPU::post_interrupt(int int_num, int index) { BaseCPU::post_interrupt(int_num, index); -// if (thread._status == ExecContext::Suspended) { if (_status == Idle) { DPRINTF(IPI,"Suspended Processor awoke\n"); // thread.activate(); @@ -690,9 +544,6 @@ OzoneCPU::tick() frontEnd->tick(); backEnd->tick(); - // Do this here? For now the front end will control the PC. 
-// PC = nextPC; - // check for instruction-count-based events comInstEventQueue[0]->serviceEvents(numInst); @@ -742,11 +593,13 @@ OzoneCPU::setSyscallReturn(SyscallReturn return_value, int tid) if (return_value.successful()) { // no error thread.renameTable[SyscallSuccessReg]->setIntResult(0); - thread.renameTable[ReturnValueReg]->setIntResult(return_value.value()); + thread.renameTable[ReturnValueReg]->setIntResult( + return_value.value()); } else { // got an error, return details thread.renameTable[SyscallSuccessReg]->setIntResult((IntReg) -1); - thread.renameTable[ReturnValueReg]->setIntResult(-return_value.value()); + thread.renameTable[ReturnValueReg]->setIntResult( + -return_value.value()); } } #else @@ -756,15 +609,10 @@ OzoneCPU::hwrei() { // Need to move this to ISA code // May also need to make this per thread -/* - if (!inPalMode()) - return new UnimplementedOpcodeFault; - thread.setNextPC(thread.readMiscReg(AlphaISA::IPR_EXC_ADDR)); -*/ lockFlag = false; lockAddrList.clear(); - kernelStats->hwrei(); + thread.kernelStats->hwrei(); checkInterrupts = true; @@ -835,7 +683,7 @@ OzoneCPU::simPalCheck(int palFunc) { // Need to move this to ISA code // May also need to make this per thread - this->kernelStats->callpal(palFunc, xcProxy); + thread.kernelStats->callpal(palFunc, xcProxy); switch (palFunc) { case PAL::halt: @@ -874,7 +722,6 @@ template void OzoneCPU::OzoneXC::setStatus(Status new_status) { -// cpu->_status = new_status; thread->_status = new_status; } @@ -932,6 +779,7 @@ OzoneCPU::OzoneXC::takeOverFrom(ExecContext *old_context) setStatus(old_context->status()); copyArchRegs(old_context); setCpuId(old_context->readCpuId()); + #if !FULL_SYSTEM setFuncExeInst(old_context->readFuncExeInst()); #else @@ -944,6 +792,8 @@ OzoneCPU::OzoneXC::takeOverFrom(ExecContext *old_context) if (thread->quiesceEvent) { thread->quiesceEvent->xc = this; } + + thread->kernelStats = old_context->getKernelStats(); // storeCondFailures = 0; cpu->lockFlag = false; #endif @@ 
-954,7 +804,12 @@ OzoneCPU::OzoneXC::takeOverFrom(ExecContext *old_context) template void OzoneCPU::OzoneXC::regStats(const std::string &name) -{ } +{ +#if FULL_SYSTEM + thread->kernelStats = new Kernel::Statistics(cpu->system); + thread->kernelStats->regStats(name + ".kern"); +#endif +} template void diff --git a/cpu/simple/cpu.cc b/cpu/simple/cpu.cc index 07f9d0dad..c03945ffa 100644 --- a/cpu/simple/cpu.cc +++ b/cpu/simple/cpu.cc @@ -782,7 +782,7 @@ SimpleCPU::tick() #if FULL_SYSTEM if (system->kernelBinning->fnbin) { - assert(kernelStats); + assert(cpuXC->getKernelStats()); system->kernelBinning->execute(xcProxy, inst); } diff --git a/cpu/thread_state.hh b/cpu/thread_state.hh index e8381b9d3..e09cb12fd 100644 --- a/cpu/thread_state.hh +++ b/cpu/thread_state.hh @@ -1,3 +1,30 @@ +/* + * Copyright (c) 2006 The Regents of The University of Michigan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ #ifndef __CPU_THREAD_STATE_HH__ #define __CPU_THREAD_STATE_HH__ @@ -8,11 +35,20 @@ class EndQuiesceEvent; class FunctionProfile; class ProfileNode; +namespace Kernel { + class Statistics; +}; #else -class Process; class FunctionalMemory; +class Process; #endif +/** + * Struct for holding general thread state that is needed across CPU + * models. This includes things such as pointers to the process, + * memory, quiesce events, and certain stats. This can be expanded + * to hold more thread-specific stats within it. 
+ */ struct ThreadState { #if FULL_SYSTEM ThreadState(int _cpuId, int _tid, FunctionalMemory *_mem) @@ -55,6 +91,7 @@ struct ThreadState { EndQuiesceEvent *quiesceEvent; + Kernel::Statistics *kernelStats; #else Process *process; diff --git a/kern/system_events.cc b/kern/system_events.cc index 9b9861497..221eb228d 100644 --- a/kern/system_events.cc +++ b/kern/system_events.cc @@ -67,15 +67,17 @@ FnEvent::process(ExecContext *xc) void IdleStartEvent::process(ExecContext *xc) { - xc->getCpuPtr()->kernelStats->setIdleProcess( - xc->readMiscReg(AlphaISA::IPR_PALtemp23), xc); + if (xc->getKernelStats()) + xc->getKernelStats()->setIdleProcess( + xc->readMiscReg(AlphaISA::IPR_PALtemp23), xc); remove(); } void InterruptStartEvent::process(ExecContext *xc) { - xc->getCpuPtr()->kernelStats->mode(Kernel::interrupt, xc); + if (xc->getKernelStats()) + xc->getKernelStats()->mode(Kernel::interrupt, xc); } void @@ -83,5 +85,6 @@ InterruptEndEvent::process(ExecContext *xc) { // We go back to kernel, if we are user, inside the rti // pal code we will get switched to user because of the ICM write - xc->getCpuPtr()->kernelStats->mode(Kernel::kernel, xc); + if (xc->getKernelStats()) + xc->getKernelStats()->mode(Kernel::kernel, xc); } diff --git a/sim/pseudo_inst.cc b/sim/pseudo_inst.cc index 4d9541b58..0c20a6a53 100644 --- a/sim/pseudo_inst.cc +++ b/sim/pseudo_inst.cc @@ -65,7 +65,8 @@ namespace AlphaPseudo void arm(ExecContext *xc) { - xc->getCpuPtr()->kernelStats->arm(); + if (xc->getKernelStats()) + xc->getKernelStats()->arm(); } void @@ -75,7 +76,8 @@ namespace AlphaPseudo return; xc->suspend(); - xc->getCpuPtr()->kernelStats->quiesce(); + if (xc->getKernelStats()) + xc->getKernelStats()->arm(); } void @@ -92,7 +94,8 @@ namespace AlphaPseudo quiesceEvent->schedule(curTick + Clock::Int::ns * ns); xc->suspend(); - xc->getCpuPtr()->kernelStats->quiesce(); + if (xc->getKernelStats()) + xc->getKernelStats()->quiesce(); } void @@ -111,7 +114,8 @@ namespace AlphaPseudo 
xc->getCpuPtr()->cycles(cycles)); xc->suspend(); - xc->getCpuPtr()->kernelStats->quiesce(); + if (xc->getKernelStats()) + xc->getKernelStats()->quiesce(); } uint64_t @@ -123,7 +127,8 @@ namespace AlphaPseudo void ivlb(ExecContext *xc) { - xc->getCpuPtr()->kernelStats->ivlb(); + if (xc->getKernelStats()) + xc->getKernelStats()->ivlb(); } void From 6c386396faef6f48f2d01911e59d09b192bf3c45 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Tue, 23 May 2006 16:57:14 -0400 Subject: [PATCH 44/50] Code cleanup. cpu/base_dyn_inst.hh: Code cleanup --HG-- extra : convert_revision : 501c03f8e4346ffbcb545ddeee30c1f8ded9baa7 --- cpu/base_dyn_inst.hh | 5 +- cpu/ozone/dyn_inst.hh | 52 +---------------- cpu/ozone/dyn_inst_impl.hh | 7 +-- cpu/ozone/front_end.hh | 34 ++++++++--- cpu/ozone/front_end_impl.hh | 66 +++++++++------------- cpu/ozone/lw_back_end.hh | 103 +++++++++------------------------- cpu/ozone/lw_back_end_impl.hh | 67 +++++++++++++--------- cpu/ozone/lw_lsq.hh | 27 +-------- cpu/ozone/lw_lsq_impl.hh | 41 +++----------- cpu/ozone/rename_table.hh | 28 +++++++++ cpu/ozone/thread_state.hh | 38 ++++++++++--- 11 files changed, 195 insertions(+), 273 deletions(-) diff --git a/cpu/base_dyn_inst.hh b/cpu/base_dyn_inst.hh index cd754dc3c..05ffa6fa4 100644 --- a/cpu/base_dyn_inst.hh +++ b/cpu/base_dyn_inst.hh @@ -447,13 +447,10 @@ class BaseDynInst : public FastAlloc, public RefCounted instResult.integer = val; } - //Push to .cc file. /** Records that one of the source registers is ready. */ void markSrcRegReady(); - /** Marks a specific register as ready. - * @todo: Move this to .cc file. - */ + /** Marks a specific register as ready. */ void markSrcRegReady(RegIndex src_idx); /** Returns if a source register is ready. 
*/ diff --git a/cpu/ozone/dyn_inst.hh b/cpu/ozone/dyn_inst.hh index f251c28ea..5d48bb361 100644 --- a/cpu/ozone/dyn_inst.hh +++ b/cpu/ozone/dyn_inst.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005 The Regents of The University of Michigan + * Copyright (c) 2005-2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -52,8 +52,6 @@ class OzoneDynInst : public BaseDynInst // Typedef for DynInstPtr. This is really just a RefCountingPtr. typedef typename Impl::DynInstPtr DynInstPtr; -// typedef typename Impl::BranchPred::BPredInfo BPredInfo; - typedef TheISA::ExtMachInst ExtMachInst; typedef TheISA::MachInst MachInst; typedef TheISA::MiscReg MiscReg; @@ -107,12 +105,6 @@ class OzoneDynInst : public BaseDynInst // up. In the future, you only really need a counter. bool memDepReady() { return srcMemInsts.empty(); } -// void setBPredInfo(const BPredInfo &bp_info) { bpInfo = bp_info; } - -// BPredInfo &getBPredInfo() { return bpInfo; } - -// OzoneXC *thread; - private: void initInstPtrs(); @@ -133,20 +125,12 @@ class OzoneDynInst : public BaseDynInst */ DynInstPtr prevDestInst[MaxInstSrcRegs]; -// BPredInfo bpInfo; - public: Fault initiateAcc(); Fault completeAcc(); -/* - template - Fault read(Addr addr, T &data, unsigned flags); - template - Fault write(T data, Addr addr, unsigned flags, uint64_t *res); -*/ // The register accessor methods provide the index of the // instruction's operand (e.g., 0 or 1), not the architectural // register index, to simplify the implementation of register @@ -244,38 +228,4 @@ class OzoneDynInst : public BaseDynInst bool iqItValid; }; -/* -template -template -inline Fault -OzoneDynInst::read(Addr addr, T &data, unsigned flags) -{ - Fault fault = this->cpu->read(addr, data, flags, this); - - if (this->traceData) { - this->traceData->setAddr(addr); - this->traceData->setData(data); - } - - return fault; -} - -template -template -inline Fault 
-OzoneDynInst::write(T data, Addr addr, unsigned flags, uint64_t *res) -{ - Fault fault = this->cpu->write(data, addr, flags, res, this); - - this->storeSize = sizeof(T); - this->storeData = data; - - if (this->traceData) { - this->traceData->setAddr(addr); - this->traceData->setData(data); - } - - return fault; -} -*/ #endif // __CPU_OZONE_DYN_INST_HH__ diff --git a/cpu/ozone/dyn_inst_impl.hh b/cpu/ozone/dyn_inst_impl.hh index a7e4460a1..f891ec515 100644 --- a/cpu/ozone/dyn_inst_impl.hh +++ b/cpu/ozone/dyn_inst_impl.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005 The Regents of The University of Michigan + * Copyright (c) 2005-2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -266,12 +266,7 @@ OzoneDynInst::hwrei() this->setNextPC(this->thread->readMiscReg(AlphaISA::IPR_EXC_ADDR)); this->cpu->hwrei(); -/* - this->cpu->kernelStats->hwrei(); - this->cpu->checkInterrupts = true; - this->cpu->lockFlag = false; -*/ // FIXME: XXX check for interrupts? XXX return NoFault; } diff --git a/cpu/ozone/front_end.hh b/cpu/ozone/front_end.hh index 326f7d2c9..dd382491f 100644 --- a/cpu/ozone/front_end.hh +++ b/cpu/ozone/front_end.hh @@ -1,14 +1,39 @@ +/* + * Copyright (c) 2006 The Regents of The University of Michigan + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ #ifndef __CPU_OZONE_FRONT_END_HH__ #define __CPU_OZONE_FRONT_END_HH__ #include -//#include "cpu/ozone/cpu.hh" #include "cpu/inst_seq.hh" #include "cpu/o3/bpred_unit.hh" #include "cpu/ozone/rename_table.hh" -//#include "cpu/ozone/thread_state.hh" #include "mem/mem_req.hh" #include "sim/eventq.hh" #include "sim/stats.hh" @@ -132,11 +157,6 @@ class FrontEnd typedef typename Impl::BranchPred BranchPred; - // Typedef for semi-opaque type that holds any information the branch - // predictor needs to update itself. Only two fields are used outside of - // branch predictor, nextPC and isTaken. -// typedef typename BranchPred::BPredInfo BPredInfo; - BranchPred branchPred; class ICacheCompletionEvent : public Event diff --git a/cpu/ozone/front_end_impl.hh b/cpu/ozone/front_end_impl.hh index cd57aeef4..15adae9b4 100644 --- a/cpu/ozone/front_end_impl.hh +++ b/cpu/ozone/front_end_impl.hh @@ -1,3 +1,30 @@ +/* + * Copyright (c) 2006 The Regents of The University of Michigan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ #include "arch/faults.hh" #include "arch/isa_traits.hh" @@ -26,14 +53,6 @@ FrontEnd::FrontEnd(Params *params) status = Idle; - // Setup branch predictor. - - // Setup Memory Request -/* - memReq = new MemReq(); - memReq->asid = 0; - memReq->data = new uint8_t[64]; -*/ memReq = NULL; // Size of cache block. cacheBlkSize = icacheInterface ? icacheInterface->getBlockSize() : 64; @@ -77,7 +96,6 @@ void FrontEnd::setXC(ExecContext *xc_ptr) { xc = xc_ptr; -// memReq->xc = xc; } template @@ -321,7 +339,6 @@ FrontEnd::tick() break; } - // if (generalizeFetch) { processInst(inst); if (status == SerializeBlocked) { @@ -333,11 +350,6 @@ FrontEnd::tick() instBuffer.push_back(inst); ++instBufferSize; ++num_inst; - // } else { - // fetch(num_inst); - // decode(num_inst); - // rename(num_inst); - // } #if FULL_SYSTEM if (inst->isQuiesce()) { @@ -402,10 +414,6 @@ FrontEnd::fetchCacheLine() // Translate the instruction request. fault = cpu->translateInstReq(memReq); - // In the case of faults, the fetch stage may need to stall and wait - // on what caused the fetch (ITB or Icache miss). -// assert(fault == NoFault); - // Now do the timing access to see whether or not the instruction // exists within the cache. 
if (icacheInterface && fault == NoFault) { @@ -466,7 +474,6 @@ FrontEnd::processInst(DynInstPtr &inst) Addr inst_PC = inst->readPC(); -// BPredInfo bp_info = branchPred.lookup(inst_PC); if (!inst->isControl()) { inst->setPredTarg(inst->readNextPC()); } else { @@ -482,7 +489,6 @@ FrontEnd::processInst(DynInstPtr &inst) "%#x\n", inst->seqNum, inst_PC, next_PC); // inst->setNextPC(next_PC); -// inst->setBPredInfo(bp_info); // Not sure where I should set this PC = next_PC; @@ -535,7 +541,7 @@ void FrontEnd::handleFault(Fault &fault) { DPRINTF(FE, "Fault at fetch, telling commit\n"); -// backEnd->fetchFault(fault); + // We're blocked on the back end until it handles this fault. status = TrapPending; @@ -586,9 +592,6 @@ FrontEnd::squash(const InstSeqNum &squash_num, const Addr &next_PC, instBuffer.pop_back(); --instBufferSize; - // Fix up branch predictor if necessary. -// branchPred.undo(inst->getBPredInfo()); - freeRegs+= inst->numDestRegs(); } @@ -607,7 +610,6 @@ FrontEnd::squash(const InstSeqNum &squash_num, const Addr &next_PC, // Clear the icache miss if it's outstanding. if (status == IcacheMissStall && icacheInterface) { DPRINTF(FE, "Squashing outstanding Icache miss.\n"); -// icacheInterface->squash(0); memReq = NULL; } @@ -693,17 +695,9 @@ template bool FrontEnd::updateStatus() { -// bool rename_block = freeRegs <= 0; bool serialize_block = !backEnd->robEmpty() || instBufferSize; bool be_block = cpu->decoupledFrontEnd ? 
false : backEnd->isBlocked(); bool ret_val = false; -/* - // Should already be handled through addFreeRegs function - if (status == RenameBlocked && !rename_block) { - status = Running; - ret_val = true; - } -*/ if (status == SerializeBlocked && !serialize_block) { status = SerializeComplete; @@ -753,10 +747,6 @@ FrontEnd::getInstFromCacheline() // PC of inst is not in this cache block if (PC >= (cacheBlkPC + cacheBlkSize) || PC < cacheBlkPC || !cacheBlkValid) { -// DPRINTF(OoOCPU, "OoOCPU: PC is not in this cache block\n"); -// DPRINTF(OoOCPU, "OoOCPU: PC: %#x, cacheBlkPC: %#x, cacheBlkValid: %i", -// PC, cacheBlkPC, cacheBlkValid); -// panic("Instruction not in cache line or cache line invalid!"); return NULL; } diff --git a/cpu/ozone/lw_back_end.hh b/cpu/ozone/lw_back_end.hh index 770b66ad5..1c03ffb73 100644 --- a/cpu/ozone/lw_back_end.hh +++ b/cpu/ozone/lw_back_end.hh @@ -1,3 +1,30 @@ +/* + * Copyright (c) 2006 The Regents of The University of Michigan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ #ifndef __CPU_OZONE_LW_BACK_END_HH__ #define __CPU_OZONE_LW_BACK_END_HH__ @@ -238,10 +265,6 @@ class LWBackEnd Counter funcExeInst; private: -// typedef typename Impl::InstQueue InstQueue; - -// InstQueue IQ; - typedef typename Impl::LdstQueue LdstQueue; LdstQueue LSQ; @@ -342,8 +365,6 @@ class LWBackEnd bool exactFullStall; -// bool fetchRedirect[Impl::MaxThreads]; - // number of cycles stalled for D-cache misses /* Stats::Scalar<> dcacheStallCycles; Counter lastDcacheStall; @@ -438,43 +459,6 @@ template Fault LWBackEnd::read(MemReqPtr &req, T &data, int load_idx) { -/* memReq->reset(addr, sizeof(T), flags); - - // translate to physical address - Fault fault = cpu->translateDataReadReq(memReq); - - // if we have a cache, do cache access too - if (fault == NoFault && dcacheInterface) { - memReq->cmd = Read; - memReq->completionEvent = NULL; - memReq->time = curTick; - memReq->flags &= ~INST_READ; - MemAccessResult result = dcacheInterface->access(memReq); - - // Ugly hack to get an event scheduled *only* if the access is - // a miss. We really should add first-class support for this - // at some point. - if (result != MA_HIT && dcacheInterface->doEvents()) { - // Fix this hack for keeping funcExeInst correct with loads that - // are executed twice. 
- --funcExeInst; - - memReq->completionEvent = &cacheCompletionEvent; - lastDcacheStall = curTick; -// unscheduleTickEvent(); -// status = DcacheMissStall; - DPRINTF(OzoneCPU, "Dcache miss stall!\n"); - } else { - // do functional access - fault = thread->mem->read(memReq, data); - - } - } -*/ -/* - if (!dcacheInterface && (memReq->flags & UNCACHEABLE)) - recordEvent("Uncached Read"); -*/ return LSQ.read(req, data, load_idx); } @@ -483,39 +467,6 @@ template Fault LWBackEnd::write(MemReqPtr &req, T &data, int store_idx) { -/* - memReq->reset(addr, sizeof(T), flags); - - // translate to physical address - Fault fault = cpu->translateDataWriteReq(memReq); - - if (fault == NoFault && dcacheInterface) { - memReq->cmd = Write; - memcpy(memReq->data,(uint8_t *)&data,memReq->size); - memReq->completionEvent = NULL; - memReq->time = curTick; - memReq->flags &= ~INST_READ; - MemAccessResult result = dcacheInterface->access(memReq); - - // Ugly hack to get an event scheduled *only* if the access is - // a miss. We really should add first-class support for this - // at some point. - if (result != MA_HIT && dcacheInterface->doEvents()) { - memReq->completionEvent = &cacheCompletionEvent; - lastDcacheStall = curTick; -// unscheduleTickEvent(); -// status = DcacheMissStall; - DPRINTF(OzoneCPU, "Dcache miss stall!\n"); - } - } - - if (res && (fault == NoFault)) - *res = memReq->result; - */ -/* - if (!dcacheInterface && (memReq->flags & UNCACHEABLE)) - recordEvent("Uncached Write"); -*/ return LSQ.write(req, data, store_idx); } diff --git a/cpu/ozone/lw_back_end_impl.hh b/cpu/ozone/lw_back_end_impl.hh index db0872e52..881d6e6b1 100644 --- a/cpu/ozone/lw_back_end_impl.hh +++ b/cpu/ozone/lw_back_end_impl.hh @@ -1,7 +1,34 @@ +/* + * Copyright (c) 2006 The Regents of The University of Michigan + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ -#include "encumbered/cpu/full/op_class.hh" #include "cpu/checker/cpu.hh" #include "cpu/ozone/lw_back_end.hh" +#include "encumbered/cpu/full/op_class.hh" template void @@ -194,7 +221,6 @@ LWBackEnd::LWBackEnd(Params *params) switchedOut = false; switchPending = false; -// IQ.setBE(this); LSQ.setBE(this); // Setup IQ and LSQ with their parameters here. 
@@ -202,8 +228,6 @@ LWBackEnd::LWBackEnd(Params *params) instsToExecute = i2e.getWire(-1); -// IQ.setIssueExecQueue(&i2e); - dispatchWidth = params->dispatchWidth ? params->dispatchWidth : width; issueWidth = params->issueWidth ? params->issueWidth : width; wbWidth = params->wbWidth ? params->wbWidth : width; @@ -538,8 +562,6 @@ LWBackEnd::regStats() .desc("ROB Occupancy per cycle") .flags(total | cdf) ; - -// IQ.regStats(); } template @@ -652,17 +674,7 @@ LWBackEnd::tick() squashFromTrap(); } else if (xcSquash) { squashFromXC(); - } /*else if (fetchHasFault && robEmpty() && frontEnd->isEmpty() && !LSQ.hasStoresToWB()) { - DPRINTF(BE, "ROB and front end empty, handling fetch fault\n"); - Fault fetch_fault = frontEnd->getFault(); - if (fetch_fault == NoFault) { - DPRINTF(BE, "Fetch no longer has a fault, cancelling out.\n"); - fetchHasFault = false; - } else { - handleFault(fetch_fault); - fetchHasFault = false; - } - }*/ + } #endif if (dispatchStatus != Blocked) { @@ -773,7 +785,8 @@ LWBackEnd::dispatchInsts() inst->iqItValid = true; waitingInsts++; } else { - DPRINTF(BE, "Instruction [sn:%lli] ready, addding to exeList.\n", + DPRINTF(BE, "Instruction [sn:%lli] ready, addding to " + "exeList.\n", inst->seqNum); exeList.push(inst); } @@ -784,7 +797,8 @@ LWBackEnd::dispatchInsts() inst->setExecuted(); inst->setCanCommit(); } else { - DPRINTF(BE, "Instruction [sn:%lli] ready, addding to exeList.\n", + DPRINTF(BE, "Instruction [sn:%lli] ready, addding to " + "exeList.\n", inst->seqNum); exeList.push(inst); } @@ -993,7 +1007,7 @@ LWBackEnd::instToCommit(DynInstPtr &inst) writeback_count[0]++; } - +#if 0 template void LWBackEnd::writebackInsts() @@ -1040,7 +1054,7 @@ LWBackEnd::writebackInsts() consumer_inst[0]+= consumer_insts; writeback_count[0]+= inst_num; } - +#endif template bool LWBackEnd::commitInst(int inst_num) @@ -1219,15 +1233,15 @@ LWBackEnd::commitInst(int inst_num) --numInsts; ++thread->funcExeInst; - // Maybe move this to where the fault is handled; if the 
fault is handled, - // don't try to set this myself as the fault will set it. If not, then - // I set thread->PC = thread->nextPC and thread->nextPC = thread->nextPC + 4. + // Maybe move this to where the fault is handled; if the fault is + // handled, don't try to set this myself as the fault will set it. + // If not, then I set thread->PC = thread->nextPC and + // thread->nextPC = thread->nextPC + 4. thread->setPC(thread->readNextPC()); thread->setNextPC(thread->readNextPC() + sizeof(TheISA::MachInst)); updateComInstStats(inst); // Write the done sequence number here. -// LSQ.commitLoads(inst->seqNum); toIEW->doneSeqNum = inst->seqNum; lastCommitCycle = curTick; @@ -1357,7 +1371,8 @@ LWBackEnd::squash(const InstSeqNum &sn) } while (memBarrier && memBarrier->seqNum > sn) { - DPRINTF(BE, "[sn:%lli] Memory barrier squashed (or previously squashed)\n", memBarrier->seqNum); + DPRINTF(BE, "[sn:%lli] Memory barrier squashed (or previously " + "squashed)\n", memBarrier->seqNum); memBarrier->clearMemDependents(); if (memBarrier->memDepReady()) { DPRINTF(BE, "No previous barrier\n"); diff --git a/cpu/ozone/lw_lsq.hh b/cpu/ozone/lw_lsq.hh index 042610324..6fe343b42 100644 --- a/cpu/ozone/lw_lsq.hh +++ b/cpu/ozone/lw_lsq.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -138,7 +138,6 @@ class OzoneLWLSQ { /** Executes a load instruction. */ Fault executeLoad(DynInstPtr &inst); -// Fault executeLoad(int lq_idx); /** Executes a store instruction. */ Fault executeStore(DynInstPtr &inst); @@ -304,10 +303,8 @@ class OzoneLWLSQ { Status _status; /** The store queue. */ -// std::vector storeQueue; std::list storeQueue; /** The load queue. 
*/ -// std::vector loadQueue; std::list loadQueue; typedef typename std::list::iterator SQIt; @@ -365,7 +362,6 @@ class OzoneLWLSQ { */ InstSeqNum stallingStoreIsn; /** The index of the above store. */ -// int stallingLoadIdx; LQIt stallingLoad; /** Whether or not a load is blocked due to the memory system. It is @@ -398,8 +394,6 @@ class OzoneLWLSQ { template Fault write(MemReqPtr &req, T &data, int store_idx); - /** Returns the index of the head load instruction. */ -// int getLoadHead() { return loadHead; } /** Returns the sequence number of the head load instruction. */ InstSeqNum getLoadHeadSeqNum() { @@ -411,8 +405,6 @@ class OzoneLWLSQ { } - /** Returns the index of the head store instruction. */ -// int getStoreHead() { return storeHead; } /** Returns the sequence number of the head store instruction. */ InstSeqNum getStoreHeadSeqNum() { @@ -604,12 +596,7 @@ OzoneLWLSQ::read(MemReqPtr &req, T &data, int load_idx) DPRINTF(OzoneLSQ, "D-cache: PC:%#x reading from paddr:%#x " "vaddr:%#x flags:%i\n", inst->readPC(), req->paddr, req->vaddr, req->flags); -/* - Addr debug_addr = ULL(0xfffffc0000be81a8); - if (req->vaddr == debug_addr) { - debug_break(); - } -*/ + assert(!req->completionEvent); req->completionEvent = new typename BackEnd::LdWritebackEvent(inst, be); @@ -631,9 +618,6 @@ OzoneLWLSQ::read(MemReqPtr &req, T &data, int load_idx) _status = DcacheMissStall; } else { -// DPRINTF(Activity, "Activity: ld accessing mem hit [sn:%lli]\n", -// inst->seqNum); - DPRINTF(OzoneLSQ, "D-cache hit!\n"); } } else { @@ -664,12 +648,7 @@ OzoneLWLSQ::write(MemReqPtr &req, T &data, int store_idx) assert(!req->data); req->data = new uint8_t[64]; memcpy(req->data, (uint8_t *)&(*sq_it).data, req->size); -/* - Addr debug_addr = ULL(0xfffffc0000be81a8); - if (req->vaddr == debug_addr) { - debug_break(); - } -*/ + // This function only writes the data to the store queue, so no fault // can happen here. 
return NoFault; diff --git a/cpu/ozone/lw_lsq_impl.hh b/cpu/ozone/lw_lsq_impl.hh index fdf6bff07..2f85a0396 100644 --- a/cpu/ozone/lw_lsq_impl.hh +++ b/cpu/ozone/lw_lsq_impl.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -104,12 +104,6 @@ OzoneLWLSQ::init(Params *params, unsigned maxLQEntries, SQIndices.push(i); } - // May want to initialize these entries to NULL - -// loadHead = loadTail = 0; - -// storeHead = storeWBIdx = storeTail = 0; - usedPorts = 0; cachePorts = params->cachePorts; @@ -197,8 +191,6 @@ OzoneLWLSQ::insert(DynInstPtr &inst) } else { insertStore(inst); } - -// inst->setInLSQ(); } template @@ -569,12 +561,9 @@ OzoneLWLSQ::writebackStores() } if (result != MA_HIT && dcacheInterface->doEvents()) { -// Event *wb = NULL; store_event->miss = true; typename BackEnd::LdWritebackEvent *wb = NULL; if (req->flags & LOCKED) { - // Stx_C does not generate a system port transaction. -// req->result=1; wb = new typename BackEnd::LdWritebackEvent(inst, be); store_event->wbEvent = wb; @@ -585,8 +574,6 @@ OzoneLWLSQ::writebackStores() // DPRINTF(Activity, "Active st accessing mem miss [sn:%lli]\n", // inst->seqNum); - // Will stores need their own kind of writeback events? - // Do stores even need writeback events? be->addDcacheMiss(inst); lastDcacheStall = curTick; @@ -604,20 +591,16 @@ OzoneLWLSQ::writebackStores() // inst->seqNum); if (req->flags & LOCKED) { - // Stx_C does not generate a system port transaction. -/* if (req->flags & UNCACHEABLE) { - req->result = 2; - } else { - req->result = 1; - } -*/ + // Stx_C does not generate a system port + // transaction in the 21264, but that might be + // hard to accomplish in this model. 
+ typename BackEnd::LdWritebackEvent *wb = new typename BackEnd::LdWritebackEvent(inst, be); store_event->wbEvent = wb; } sq_it--; -// completeStore(inst->sqIdx); } } else { panic("Must HAVE DCACHE!!!!!\n"); @@ -780,7 +763,7 @@ OzoneLWLSQ::completeStore(int store_idx) SQIndices.push(inst->sqIdx); storeQueue.erase(sq_it); --stores; -// assert(!inst->isCompleted()); + inst->setCompleted(); if (cpu->checker) { cpu->checker->tick(inst); @@ -791,7 +774,6 @@ template void OzoneLWLSQ::switchOut() { -// assert(loads == 0); assert(storesToWB == 0); switchedOut = true; SQIt sq_it = --(storeQueue.end()); @@ -804,8 +786,6 @@ OzoneLWLSQ::switchOut() if ((*sq_it).size == 0 && !(*sq_it).completed) { sq_it--; -// completeStore(inst->sqIdx); - continue; } @@ -817,7 +797,8 @@ OzoneLWLSQ::switchOut() continue; } else if ((*sq_it).req->flags & LOCKED) { sq_it--; - assert(!(*sq_it).canWB || ((*sq_it).canWB && (*sq_it).req->flags & LOCKED)); + assert(!(*sq_it).canWB || + ((*sq_it).canWB && (*sq_it).req->flags & LOCKED)); continue; } @@ -886,12 +867,6 @@ OzoneLWLSQ::takeOverFrom(ExecContext *old_xc) SQIndices.push(i); } - // May want to initialize these entries to NULL - -// loadHead = loadTail = 0; - -// storeHead = storeWBIdx = storeTail = 0; - usedPorts = 0; loadFaultInst = storeFaultInst = memDepViolator = NULL; diff --git a/cpu/ozone/rename_table.hh b/cpu/ozone/rename_table.hh index afbf6ff32..6ee23b21b 100644 --- a/cpu/ozone/rename_table.hh +++ b/cpu/ozone/rename_table.hh @@ -1,3 +1,31 @@ +/* + * Copyright (c) 2006 The Regents of The University of Michigan + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + #ifndef __CPU_OZONE_RENAME_TABLE_HH__ #define __CPU_OZONE_RENAME_TABLE_HH__ diff --git a/cpu/ozone/thread_state.hh b/cpu/ozone/thread_state.hh index 269fc6459..c86c3a720 100644 --- a/cpu/ozone/thread_state.hh +++ b/cpu/ozone/thread_state.hh @@ -1,3 +1,30 @@ +/* + * Copyright (c) 2006 The Regents of The University of Michigan + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ #ifndef __CPU_OZONE_THREAD_STATE_HH__ #define __CPU_OZONE_THREAD_STATE_HH__ @@ -62,19 +89,14 @@ struct OzoneThreadState : public ThreadState { void setStatus(Status new_status) { _status = new_status; } - RenameTable renameTable; // Should I include backend and frontend - // tables here? For the ozone CPU, maybe, for the new full CPU, probably - // not...you wouldn't want threads just accessing the backend/frontend - // rename tables. - Addr PC; // What should these be set to? 
Probably the committed ones. + RenameTable renameTable; + Addr PC; Addr nextPC; - // Current instruction? + // Current instruction TheISA::MachInst inst; TheISA::RegFile regs; - // Front end? Back end? -// MemReqPtr memReq; typename Impl::FullCPU *cpu; From c9ad4a15d6b1460ea2b9c1515739f56f81ea9b57 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Tue, 23 May 2006 16:59:13 -0400 Subject: [PATCH 45/50] Cleanup checker. cpu/checker/cpu.cc: Cleanup checker, give more useful warning messages. Also fix bug cpu/checker/cpu.hh: Cleanup checker, use forward declaration instead of include. --HG-- extra : convert_revision : 8f231199a0a75788218320cdbcc7f70441e5d574 --- cpu/checker/cpu.cc | 321 +++++++++++++++------------------------------ cpu/checker/cpu.hh | 16 +-- 2 files changed, 111 insertions(+), 226 deletions(-) diff --git a/cpu/checker/cpu.cc b/cpu/checker/cpu.cc index f76f1e063..08ab5d5c8 100644 --- a/cpu/checker/cpu.cc +++ b/cpu/checker/cpu.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002-2005 The Regents of The University of Michigan + * Copyright (c) 2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -26,41 +26,17 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -//#include -#include -//#include -#include -#include #include -//#include #include -//#include "base/cprintf.hh" -//#include "base/inifile.hh" -//#include "base/loader/symtab.hh" -#include "base/misc.hh" -//#include "base/pollevent.hh" -//#include "base/range.hh" #include "base/refcnt.hh" -//#include "base/stats/events.hh" #include "cpu/base.hh" #include "cpu/base_dyn_inst.hh" #include "cpu/checker/cpu.hh" #include "cpu/cpu_exec_context.hh" #include "cpu/exec_context.hh" -//#include "cpu/exetrace.hh" -//#include "cpu/profile.hh" -#include "cpu/sampler/sampler.hh" -//#include "cpu/smt.hh" #include "cpu/static_inst.hh" -//#include "kern/kernel_stats.hh" -#include "mem/base_mem.hh" -#include "mem/mem_interface.hh" #include "sim/byteswap.hh" -#include "sim/builder.hh" -//#include "sim/debug.hh" -//#include "sim/host.hh" -//#include "sim/sim_events.hh" #include "sim/sim_object.hh" #include "sim/stats.hh" @@ -72,15 +48,8 @@ #include "cpu/ozone/simple_impl.hh" #if FULL_SYSTEM -#include "base/remote_gdb.hh" -#include "mem/functional/memory_control.hh" -#include "mem/functional/physical.hh" #include "sim/system.hh" -#include "arch/tlb.hh" -#include "arch/stacktrace.hh" #include "arch/vtophys.hh" -#else // !FULL_SYSTEM -#include "mem/functional/functional.hh" #endif // FULL_SYSTEM using namespace std; @@ -90,17 +59,6 @@ using namespace AlphaISA; void CheckerCPU::init() { -/* - BaseCPU::init(); -#if FULL_SYSTEM - for (int i = 0; i < execContexts.size(); ++i) { - ExecContext *xc = execContexts[i]; - - // initialize CPU, including PC - TheISA::initCPU(xc, xc->readCpuId()); - } -#endif -*/ } CheckerCPU::CheckerCPU(Params *p) @@ -151,6 +109,8 @@ CheckerCPU::setMemory(FunctionalMemory *mem) xcProxy = cpuXC->getProxy(); execContexts.push_back(xcProxy); memReq->xc = xcProxy; + delete cpuXC->kernelStats; + cpuXC->kernelStats = NULL; } #endif } @@ -168,6 +128,8 @@ CheckerCPU::setSystem(System *system) xcProxy = cpuXC->getProxy(); execContexts.push_back(xcProxy); memReq->xc = 
xcProxy; + delete cpuXC->kernelStats; + cpuXC->kernelStats = NULL; } } #endif @@ -197,82 +159,15 @@ CheckerCPU::unserialize(Checkpoint *cp, const string §ion) Fault CheckerCPU::copySrcTranslate(Addr src) { - static bool no_warn = true; - int blk_size = 64; - // Only support block sizes of 64 atm. - assert(blk_size == 64); - int offset = src & (blk_size - 1); - - // Make sure block doesn't span page - if (no_warn && - (src & PageMask) != ((src + blk_size) & PageMask) && - (src >> 40) != 0xfffffc) { - warn("Copied block source spans pages %x.", src); - no_warn = false; - } - - memReq->reset(src & ~(blk_size - 1), blk_size); - - // translate to physical address - Fault fault = cpuXC->translateDataReadReq(memReq); - - if (fault == NoFault) { - cpuXC->copySrcAddr = src; - cpuXC->copySrcPhysAddr = memReq->paddr + offset; - } else { - assert(!fault->isAlignmentFault()); - - cpuXC->copySrcAddr = 0; - cpuXC->copySrcPhysAddr = 0; - } - return fault; + panic("Unimplemented!"); } Fault CheckerCPU::copy(Addr dest) { - static bool no_warn = true; - int blk_size = 64; - // Only support block sizes of 64 atm. - assert(blk_size == 64); - uint8_t data[blk_size]; - //assert(cpuXC->copySrcAddr); - int offset = dest & (blk_size - 1); - - // Make sure block doesn't span page - if (no_warn && - (dest & PageMask) != ((dest + blk_size) & PageMask) && - (dest >> 40) != 0xfffffc) { - no_warn = false; - warn("Copied block destination spans pages %x. ", dest); - } - - memReq->reset(dest & ~(blk_size -1), blk_size); - // translate to physical address - Fault fault = cpuXC->translateDataWriteReq(memReq); - - if (fault == NoFault) { - Addr dest_addr = memReq->paddr + offset; - // Need to read straight from memory since we have more than 8 bytes. 
- memReq->paddr = cpuXC->copySrcPhysAddr; - cpuXC->mem->read(memReq, data); - memReq->paddr = dest_addr; - cpuXC->mem->write(memReq, data); - memReq->cmd = Copy; - memReq->completionEvent = NULL; - memReq->paddr = cpuXC->copySrcPhysAddr; - memReq->dest = dest_addr; - memReq->size = 64; - memReq->time = curTick; - memReq->flags &= ~INST_READ; - } - else - assert(!fault->isAlignmentFault()); - - return fault; + panic("Unimplemented!"); } -// precise architected memory state accessor macros template Fault CheckerCPU::read(Addr addr, T &data, unsigned flags) @@ -280,17 +175,15 @@ CheckerCPU::read(Addr addr, T &data, unsigned flags) memReq->reset(addr, sizeof(T), flags); // translate to physical address - // Should I probe the DTB? Or should I just take the physical address - // and assume correct translation? translateDataReadReq(memReq); - // if we have a cache, do cache access too memReq->cmd = Read; memReq->completionEvent = NULL; memReq->time = curTick; memReq->flags &= ~INST_READ; if (!(memReq->flags & UNCACHEABLE)) { + // Access memory to see if we have the same data cpuXC->read(memReq, data); } else { // Assume the data is correct if it's an uncached access @@ -350,29 +243,34 @@ CheckerCPU::write(T data, Addr addr, unsigned flags, uint64_t *res) // translate to physical address cpuXC->translateDataWriteReq(memReq); - if ((!(unverifiedReq->flags & LOCKED) || - ((unverifiedReq->flags & LOCKED) && - unverifiedReq->result == 1)) && - !(unverifiedReq->flags & UNCACHEABLE)) { - // do functional access -// cpuXC->read(memReq, data); - - memReq->cmd = Write; -// memcpy(memReq->data,(uint8_t *)&data,memReq->size); - T inst_data; - memcpy(&inst_data, unverifiedReq->data, sizeof(T)); + // Can compare the write data and result only if it's cacheable, + // not a store conditional, or is a store conditional that + // succeeded. + // @todo: Verify that actual memory matches up with these values. 
+ // Right now it only verifies that the instruction data is the + // same as what was in the request that got sent to memory; there + // is no verification that it is the same as what is in memory. + // This is because the LSQ would have to be snooped in the CPU to + // verify this data. + if (unverifiedReq && + !(unverifiedReq->flags & UNCACHEABLE) && + (!(unverifiedReq->flags & LOCKED) || + ((unverifiedReq->flags & LOCKED) && + unverifiedReq->result == 1))) { +#if 0 + memReq->cmd = Read; memReq->completionEvent = NULL; memReq->time = curTick; memReq->flags &= ~INST_READ; + cpuXC->read(memReq, inst_data); +#endif + T inst_data; + memcpy(&inst_data, unverifiedReq->data, sizeof(T)); - // Hard to verify this as the data writes back after the - // instruction commits. May only be able to check that the - // value produced from execute() matches the value produced - // from the instruction's first execution. if (data != inst_data) { - warn("Store value does not match value in memory! " + warn("%lli: Store value does not match value in memory! " "Instruction: %#x, memory: %#x", - inst_data, data); + curTick, inst_data, data); handleError(); } } @@ -436,19 +334,6 @@ CheckerCPU::dbg_vtophys(Addr addr) } #endif // FULL_SYSTEM -#if FULL_SYSTEM -void -CheckerCPU::post_interrupt(int int_num, int index) -{ - BaseCPU::post_interrupt(int_num, index); - - if (cpuXC->status() == ExecContext::Suspended) { - DPRINTF(IPI,"Suspended Processor awoke\n"); - cpuXC->activate(); - } -} -#endif // FULL_SYSTEM - bool CheckerCPU::translateInstReq(MemReqPtr &req) { @@ -466,15 +351,16 @@ CheckerCPU::translateDataReadReq(MemReqPtr &req) cpuXC->translateDataReadReq(req); if (req->vaddr != unverifiedReq->vaddr) { - warn("Request virtual addresses do not match! Inst: %#x, checker:" - " %#x", - unverifiedReq->vaddr, req->vaddr); + warn("%lli: Request virtual addresses do not match! 
Inst: %#x, " + "checker: %#x", + curTick, unverifiedReq->vaddr, req->vaddr); + handleError(); } req->paddr = unverifiedReq->paddr; if (checkFlags(req)) { - warn("Request flags do not match! Inst: %#x, checker: %#x", - unverifiedReq->flags, req->flags); + warn("%lli: Request flags do not match! Inst: %#x, checker: %#x", + curTick, unverifiedReq->flags, req->flags); handleError(); } } @@ -485,15 +371,16 @@ CheckerCPU::translateDataWriteReq(MemReqPtr &req) cpuXC->translateDataWriteReq(req); if (req->vaddr != unverifiedReq->vaddr) { - warn("Request virtual addresses do not match! Inst: %#x, checker:" - " %#x", - unverifiedReq->vaddr, req->vaddr); + warn("%lli: Request virtual addresses do not match! Inst: %#x, " + "checker: %#x", + curTick, unverifiedReq->vaddr, req->vaddr); + handleError(); } req->paddr = unverifiedReq->paddr; if (checkFlags(req)) { - warn("Request flags do not match! Inst: %#x, checker: %#x", - unverifiedReq->flags, req->flags); + warn("%lli: Request flags do not match! Inst: %#x, checker: %#x", + curTick, unverifiedReq->flags, req->flags); handleError(); } } @@ -512,13 +399,17 @@ CheckerCPU::checkFlags(MemReqPtr &req) } } -/* start simulation, program loaded, processor precise state initialized */ template void Checker::tick(DynInstPtr &completed_inst) { DynInstPtr inst; + // Either check this instruction, or add it to a list of + // instructions waiting to be checked. Instructions must be + // checked in program order, so if a store has committed yet not + // completed, there may be some instructions that are waiting + // behind it that have completed and must be checked. 
if (!instList.empty()) { if (youngestSN < completed_inst->seqNum) { DPRINTF(Checker, "Adding instruction [sn:%lli] PC:%#x to list.\n", @@ -547,16 +438,17 @@ Checker::tick(DynInstPtr &completed_inst) inst = completed_inst; youngestSN = completed_inst->seqNum; } else { -// panic("SN already seen yet the list is empty!"); return; } } } + // Try to check all instructions that are completed, ending if we + // run out of instructions to check or if an instruction is not + // yet completed. while (1) { DPRINTF(Checker, "Processing instruction [sn:%lli] PC:%#x.\n", inst->seqNum, inst->readPC()); -// verifyInst = completed_inst; unverifiedResult.integer = inst->readIntResult(); unverifiedReq = inst->req; numCycles++; @@ -569,15 +461,9 @@ Checker::tick(DynInstPtr &completed_inst) cpuXC->setFloatRegDouble(ZeroReg, 0.0); #endif // TARGET_ALPHA - // Try to fetch an instruction - - // set up memory request for instruction fetch -#if FULL_SYSTEM -#define IFETCH_FLAGS(pc) ((pc) & 1) ? PHYSICAL : 0 -#else -#define IFETCH_FLAGS(pc) 0 -#endif - + // Check if any recent PC changes match up with anything we + // expect to happen. This is mostly to check if traps or + // PC-based events have occurred in both the checker and CPU. if (changedPC) { DPRINTF(Checker, "Changed PC recently to %#x\n", cpuXC->readPC()); @@ -585,9 +471,9 @@ Checker::tick(DynInstPtr &completed_inst) if (newPC == cpuXC->readPC()) { DPRINTF(Checker, "Changed PC matches expected PC\n"); } else { - warn("Changed PC does not match expected PC, changed: %#x, " - "expected: %#x", - cpuXC->readPC(), newPC); + warn("%lli: Changed PC does not match expected PC, " + "changed: %#x, expected: %#x", + curTick, cpuXC->readPC(), newPC); handleError(); } willChangePC = false; @@ -600,6 +486,15 @@ Checker::tick(DynInstPtr &completed_inst) changedNextPC = false; } + // Try to fetch the instruction + +#if FULL_SYSTEM +#define IFETCH_FLAGS(pc) ((pc) & 1) ? 
PHYSICAL : 0 +#else +#define IFETCH_FLAGS(pc) 0 +#endif + + // set up memory request for instruction fetch memReq->cmd = Read; memReq->reset(cpuXC->readPC() & ~3, sizeof(uint32_t), IFETCH_FLAGS(cpuXC->readPC())); @@ -608,8 +503,13 @@ Checker::tick(DynInstPtr &completed_inst) if (!succeeded) { if (inst->getFault() == NoFault) { - warn("Instruction PC %#x was not found in the ITB!", - cpuXC->readPC()); + // In this case the instruction was not a dummy + // instruction carrying an ITB fault. In the single + // threaded case the ITB should still be able to + // translate this instruction; in the SMT case it's + // possible that its ITB entry was kicked out. + warn("%lli: Instruction PC %#x was not found in the ITB!", + curTick, cpuXC->readPC()); handleError(); // go to the next instruction @@ -618,20 +518,18 @@ Checker::tick(DynInstPtr &completed_inst) return; } else { + // The instruction is carrying an ITB fault. Handle + // the fault and see if our results match the CPU on + // the next tick(). fault = inst->getFault(); } } if (fault == NoFault) { -// fault = cpuXC->mem->read(memReq, machInst); cpuXC->mem->read(memReq, machInst); - // If we've got a valid instruction (i.e., no fault on instruction - // fetch), then execute it. - - // keep an instruction count + // keep an instruction count numInst++; -// numInsts++; // decode the instruction machInst = gtoh(machInst); @@ -639,7 +537,8 @@ Checker::tick(DynInstPtr &completed_inst) // Checks both the machine instruction and the PC. validateInst(inst); - curStaticInst = StaticInst::decode(makeExtMI(machInst, cpuXC->readPC())); + curStaticInst = StaticInst::decode(makeExtMI(machInst, + cpuXC->readPC())); #if FULL_SYSTEM cpuXC->setInst(machInst); @@ -660,10 +559,6 @@ Checker::tick(DynInstPtr &completed_inst) // Checks to make sure instrution results are correct. 
validateExecution(inst); -// if (curStaticInst->isMemRef()) { -// numMemRefs++; -// } - if (curStaticInst->isLoad()) { ++numLoad; } @@ -693,6 +588,9 @@ Checker::tick(DynInstPtr &completed_inst) } #if FULL_SYSTEM + // @todo: Determine if these should happen only if the + // instruction hasn't faulted. In the SimpleCPU case this may + // not be true, but in the O3 or Ozone case this may be true. Addr oldpc; int count = 0; do { @@ -707,10 +605,12 @@ Checker::tick(DynInstPtr &completed_inst) } #endif - // Checks PC, next PC. Optionally can check all registers. (Or just those + // @todo: Optionally can check all registers. (Or just those // that have been modified). validateState(); + // Continue verifying instructions if there's another completed + // instruction waiting to be verified. if (instList.empty()) { break; } else if (instList.front()->isCompleted()) { @@ -726,7 +626,6 @@ template void Checker::switchOut(Sampler *s) { - sampler = s; instList.clear(); } @@ -734,15 +633,6 @@ template void Checker::takeOverFrom(BaseCPU *oldCPU) { -// BaseCPU::takeOverFrom(oldCPU); - - // if any of this CPU's ExecContexts are active, mark the CPU as - // running and schedule its tick event. -/* - for (int i = 0; i < execContexts.size(); ++i) { - ExecContext *xc = execContexts[i]; - } -*/ } template @@ -750,20 +640,22 @@ void Checker::validateInst(DynInstPtr &inst) { if (inst->readPC() != cpuXC->readPC()) { - warn("PCs do not match! Inst: %#x, checker: %#x", - inst->readPC(), cpuXC->readPC()); + warn("%lli: PCs do not match! Inst: %#x, checker: %#x", + curTick, inst->readPC(), cpuXC->readPC()); if (changedPC) { - warn("Changed PCs recently, may not be an error"); + warn("%lli: Changed PCs recently, may not be an error", + curTick); } else { handleError(); } } - if (static_cast(inst->staticInst->machInst) != - machInst) { - warn("Binary instructions do not match! 
Inst: %#x, checker: %#x", - static_cast(inst->staticInst->machInst), - machInst); + MachInst mi = static_cast(inst->staticInst->machInst); + + if (mi != machInst) { + warn("%lli: Binary instructions do not match! Inst: %#x, " + "checker: %#x", + curTick, mi, machInst); handleError(); } } @@ -773,10 +665,11 @@ void Checker::validateExecution(DynInstPtr &inst) { if (inst->numDestRegs()) { + // @todo: Support more destination registers. if (inst->isUnverifiable()) { - // @todo: Support more destination registers. - // Grab the result from the instruction and write it to the - // register. + // Unverifiable instructions assume they were executed + // properly by the CPU. Grab the result from the + // instruction and write it to the register. RegIndex idx = inst->destRegIdx(0); if (idx < TheISA::FP_Base_DepTag) { cpuXC->setIntReg(idx, inst->readIntResult()); @@ -786,16 +679,17 @@ Checker::validateExecution(DynInstPtr &inst) cpuXC->setMiscReg(idx, inst->readIntResult()); } } else if (result.integer != inst->readIntResult()) { - warn("Instruction results do not match! (May not be integer results) " - "Inst: %#x, checker: %#x", - inst->readIntResult(), result.integer); + warn("%lli: Instruction results do not match! (Results may not " + "actually be integers) Inst: %#x, checker: %#x", + curTick, inst->readIntResult(), result.integer); handleError(); } } if (inst->readNextPC() != cpuXC->readNextPC()) { - warn("Instruction next PCs do not match! Inst: %#x, checker: %#x", - inst->readNextPC(), cpuXC->readNextPC()); + warn("%lli: Instruction next PCs do not match! Inst: %#x, " + "checker: %#x", + curTick, inst->readNextPC(), cpuXC->readNextPC()); handleError(); } @@ -810,9 +704,10 @@ Checker::validateExecution(DynInstPtr &inst) if (inst->xcBase()->readMiscReg(misc_reg_idx) != cpuXC->readMiscReg(misc_reg_idx)) { - warn("Misc reg idx %i (side effect) does not match! 
Inst: %#x, " - "checker: %#x", - misc_reg_idx, inst->xcBase()->readMiscReg(misc_reg_idx), + warn("%lli: Misc reg idx %i (side effect) does not match! " + "Inst: %#x, checker: %#x", + curTick, misc_reg_idx, + inst->xcBase()->readMiscReg(misc_reg_idx), cpuXC->readMiscReg(misc_reg_idx)); handleError(); } diff --git a/cpu/checker/cpu.hh b/cpu/checker/cpu.hh index 678e888df..37fe59d95 100644 --- a/cpu/checker/cpu.hh +++ b/cpu/checker/cpu.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002-2005 The Regents of The University of Michigan + * Copyright (c) 2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -39,7 +39,6 @@ #include "cpu/base_dyn_inst.hh" #include "cpu/cpu_exec_context.hh" #include "cpu/pc_event.hh" -#include "cpu/sampler/sampler.hh" #include "cpu/static_inst.hh" #include "sim/eventq.hh" @@ -63,6 +62,7 @@ class BaseDynInst; class ExecContext; class MemInterface; class Checkpoint; +class Sampler; class CheckerCPU : public BaseCPU { @@ -86,8 +86,6 @@ class CheckerCPU : public BaseCPU }; public: - void post_interrupt(int int_num, int index); - CheckerCPU(Params *p); virtual ~CheckerCPU(); @@ -111,8 +109,6 @@ class CheckerCPU : public BaseCPU #if FULL_SYSTEM Addr dbg_vtophys(Addr addr); - - bool interval_stats; #endif union Result { @@ -129,11 +125,6 @@ class CheckerCPU : public BaseCPU // Refcounted pointer to the one memory request. MemReqPtr memReq; - // Pointer to the sampler that is telling us to switchover. - // Used to signal the completion of the pipe drain and schedule - // the next switchover - Sampler *sampler; - StaticInstPtr curStaticInst; // number of simulated instructions @@ -284,6 +275,7 @@ class CheckerCPU : public BaseCPU bool simPalCheck(int palFunc) { return cpuXC->simPalCheck(palFunc); } #else // Assume that the normal CPU's call to syscall was successful. + // The checker's state would have already been updated by the syscall. 
void syscall() { } #endif @@ -307,8 +299,6 @@ class CheckerCPU : public BaseCPU bool exitOnError; InstSeqNum youngestSN; -// std::map storeBuff; -// typedef std::map::iterator map_it; }; template From 358cf1b11765024309fe986262bb3a3d16c8a720 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Tue, 23 May 2006 17:03:43 -0400 Subject: [PATCH 46/50] Rework how instructions are scheduled and executed. The "execute" portion of IEW is really just the last cycle of execution, at which point execute() gets called. Execution begins inside the IQ, when it schedules FUs for specific instructions. As a result, the Execute stage should just pull all completing instructions out of the IQ stage and execute them. Limiting the number of writebacks outstanding must still be done. cpu/o3/iew_impl.hh: Rework how instructions are scheduled and executed. There shouldn't be a specific "width" from issue to execute because issue does the scheduling of the functional units (really the beginning of the execution). cpu/o3/inst_queue.hh: cpu/o3/inst_queue_impl.hh: Rework how instructions are scheduled and executed. --HG-- extra : convert_revision : bbf1a8a4c0a2f2a938bdd78d74493048fd3b4b55 --- cpu/o3/iew_impl.hh | 5 +++-- cpu/o3/inst_queue.hh | 4 ++++ cpu/o3/inst_queue_impl.hh | 22 +++++++++++++++++----- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/cpu/o3/iew_impl.hh b/cpu/o3/iew_impl.hh index 59f4055a6..c22850131 100644 --- a/cpu/o3/iew_impl.hh +++ b/cpu/o3/iew_impl.hh @@ -1232,13 +1232,14 @@ DefaultIEW::executeInsts() #endif // Execute/writeback any instructions that are available. 
+ int insts_to_execute = fromIssue->size; int inst_num = 0; - for ( ; inst_num < issueWidth && fromIssue->insts[inst_num]; + for (; inst_num < insts_to_execute; ++inst_num) { DPRINTF(IEW, "Execute: Executing instructions from IQ.\n"); - DynInstPtr inst = fromIssue->insts[inst_num]; + DynInstPtr inst = instQueue.getInstToExecute(); DPRINTF(IEW, "Execute: Processing PC %#x, [tid:%i] [sn:%i].\n", inst->readPC(), inst->threadNumber,inst->seqNum); diff --git a/cpu/o3/inst_queue.hh b/cpu/o3/inst_queue.hh index 6bdf4ddc2..518de73d9 100644 --- a/cpu/o3/inst_queue.hh +++ b/cpu/o3/inst_queue.hh @@ -171,6 +171,8 @@ class InstructionQueue */ void insertBarrier(DynInstPtr &barr_inst); + DynInstPtr getInstToExecute(); + /** * Records the instruction as the producer of a register without * adding it to the rest of the IQ. @@ -272,6 +274,8 @@ class InstructionQueue /** List of all the instructions in the IQ (some of which may be issued). */ std::list instList[Impl::MaxThreads]; + std::list instsToExecute; + /** * Struct for comparing entries to be added to the priority queue. This * gives reverse ordering to the instructions in terms of sequence diff --git a/cpu/o3/inst_queue_impl.hh b/cpu/o3/inst_queue_impl.hh index ed57ac257..412d59768 100644 --- a/cpu/o3/inst_queue_impl.hh +++ b/cpu/o3/inst_queue_impl.hh @@ -588,6 +588,16 @@ InstructionQueue::insertBarrier(DynInstPtr &barr_inst) insertNonSpec(barr_inst); } +template +typename Impl::DynInstPtr +InstructionQueue::getInstToExecute() +{ + assert(!instsToExecute.empty()); + DynInstPtr inst = instsToExecute.front(); + instsToExecute.pop_front(); + return inst; +} + template void InstructionQueue::addToOrderList(OpClass op_class) @@ -662,9 +672,11 @@ InstructionQueue::processFUCompletion(DynInstPtr &inst, int fu_idx) // @todo: This could break if there's multiple multi-cycle ops // finishing on this cycle. Maybe implement something like // instToCommit in iew_impl.hh. 
- int &size = issueToExecuteQueue->access(0)->size; + issueToExecuteQueue->access(0)->size++; + instsToExecute.push_back(inst); +// int &size = issueToExecuteQueue->access(0)->size; - issueToExecuteQueue->access(0)->insts[size++] = inst; +// issueToExecuteQueue->access(0)->insts[size++] = inst; } // @todo: Figure out a better way to remove the squashed items from the @@ -690,9 +702,8 @@ InstructionQueue::scheduleReadyInsts() ListOrderIt order_it = listOrder.begin(); ListOrderIt order_end_it = listOrder.end(); int total_issued = 0; - int exec_queue_slot = i2e_info->size; - while (exec_queue_slot < totalWidth && total_issued < totalWidth && + while (total_issued < totalWidth && order_it != order_end_it) { OpClass op_class = (*order_it).queueType; @@ -733,8 +744,9 @@ InstructionQueue::scheduleReadyInsts() if (idx == -2 || idx != -1) { if (op_latency == 1) { - i2e_info->insts[exec_queue_slot++] = issuing_inst; +// i2e_info->insts[exec_queue_slot++] = issuing_inst; i2e_info->size++; + instsToExecute.push_back(issuing_inst); // Add the FU onto the list of FU's to be freed next // cycle if we used one. From 5d3a1e8f65a741ff8f76e5cf45fa6da894c14d99 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Tue, 23 May 2006 18:18:16 -0400 Subject: [PATCH 47/50] Updates to isa parser to make it see dependencies properly with the new scanner. arch/alpha/isa/main.isa: Use automatic path includes thanks to updates to isa parser. arch/isa_parser.py: Pull changes to isa parser from newmem into m5. This fixes a bug where the files include in main.isa were not being included as dependencies properly. 
--HG-- extra : convert_revision : 8ef1e2e1a64e7a5762baf7a09abc8665d7c2f688 --- arch/alpha/isa/main.isa | 20 ++++---- arch/isa_parser.py | 108 ++++++++++++++++++++-------------------- 2 files changed, 64 insertions(+), 64 deletions(-) diff --git a/arch/alpha/isa/main.isa b/arch/alpha/isa/main.isa index 17c9989ab..80a5e9ca1 100644 --- a/arch/alpha/isa/main.isa +++ b/arch/alpha/isa/main.isa @@ -418,31 +418,31 @@ def format BasicOperateWithNopCheck(code, *opt_args) {{ }}; // Integer instruction templates, formats, etc. -##include "m5/arch/alpha/isa/int.isa" +##include "int.isa" // Floating-point instruction templates, formats, etc. -##include "m5/arch/alpha/isa/fp.isa" +##include "fp.isa" // Memory instruction templates, formats, etc. -##include "m5/arch/alpha/isa/mem.isa" +##include "mem.isa" // Branch/jump instruction templates, formats, etc. -##include "m5/arch/alpha/isa/branch.isa" +##include "branch.isa" // PAL instruction templates, formats, etc. -##include "m5/arch/alpha/isa/pal.isa" +##include "pal.isa" // Opcdec fault instruction templates, formats, etc. -##include "m5/arch/alpha/isa/opcdec.isa" +##include "opcdec.isa" // Unimplemented instruction templates, formats, etc. -##include "m5/arch/alpha/isa/unimp.isa" +##include "unimp.isa" // Unknown instruction templates, formats, etc. -##include "m5/arch/alpha/isa/unknown.isa" +##include "unknown.isa" // Execution utility functions -##include "m5/arch/alpha/isa/util.isa" +##include "util.isa" // The actual decoder -##include "m5/arch/alpha/isa/decoder.isa" +##include "decoder.isa" diff --git a/arch/isa_parser.py b/arch/isa_parser.py index 570110d84..b0f10783f 100755 --- a/arch/isa_parser.py +++ b/arch/isa_parser.py @@ -1,5 +1,3 @@ -#! /usr/bin/env python - # Copyright (c) 2003-2005 The Regents of The University of Michigan # All rights reserved. 
# @@ -162,13 +160,12 @@ def t_CPPDIRECTIVE(t): def t_NEWFILE(t): r'^\#\#newfile\s+"[\w/.-]*"' - global fileNameStack - fileNameStack.append((t.value[11:-1], t.lineno)) + fileNameStack.push((t.value[11:-1], t.lineno)) t.lineno = 0 def t_ENDFILE(t): r'^\#\#endfile' - (filename, t.lineno) = fileNameStack.pop() + (old_filename, t.lineno) = fileNameStack.pop() # # The functions t_NEWLINE, t_ignore, and t_error are @@ -698,7 +695,7 @@ def p_error(t): if t: error(t.lineno, "syntax error at '%s'" % t.value) else: - error_bt(0, "unknown syntax error") + error(0, "unknown syntax error", True) # END OF GRAMMAR RULES # @@ -896,6 +893,12 @@ formatStack = Stack(NoFormat()) # The global default case stack. defaultStack = Stack( None ) +# Global stack that tracks current file and line number. +# Each element is a tuple (filename, lineno) that records the +# *current* filename and the line number in the *previous* file where +# it was included. +fileNameStack = Stack() + ################### # Utility functions @@ -932,25 +935,22 @@ def fixPythonIndentation(s): return s # Error handler. Just call exit. Output formatted to work under -# Emacs compile-mode. This function should be called when errors due -# to user input are detected (as opposed to parser bugs). -def error(lineno, string): +# Emacs compile-mode. Optional 'print_traceback' arg, if set to True, +# prints a Python stack backtrace too (can be handy when trying to +# debug the parser itself). +def error(lineno, string, print_traceback = False): spaces = "" for (filename, line) in fileNameStack[0:-1]: - print spaces + "In file included from " + filename + print spaces + "In file included from " + filename + ":" spaces += " " - # Uncomment the following line to get a Python stack backtrace for - # these errors too. Can be handy when trying to debug the parser. 
- # traceback.print_exc() - sys.exit(spaces + "%s:%d: %s" % (fileNameStack[-1][0], lineno, string)) - -# Like error(), but include a Python stack backtrace (for processing -# Python exceptions). This function should be called for errors that -# appear to be bugs in the parser itself. -def error_bt(lineno, string): - traceback.print_exc() - print >> sys.stderr, "%s:%d: %s" % (input_filename, lineno, string) - sys.exit(1) + # Print a Python stack backtrace if requested. + if (print_traceback): + traceback.print_exc() + if lineno != 0: + line_str = "%d:" % lineno + else: + line_str = "" + sys.exit(spaces + "%s:%s %s" % (fileNameStack[-1][0], line_str, string)) ##################################################################### @@ -1070,7 +1070,7 @@ def buildOperandTypeMap(userDict, lineno): elif size == 64: ctype = 'double' if ctype == '': - error(0, 'Unrecognized type description "%s" in userDict') + error(lineno, 'Unrecognized type description "%s" in userDict') operandTypeMap[ext] = (size, ctype, is_signed) # @@ -1687,47 +1687,47 @@ def update_if_needed(file, contents): f.write(contents) f.close() -# This regular expression matches include directives +# This regular expression matches '##include' directives includeRE = re.compile(r'^\s*##include\s+"(?P[\w/.-]*)".*$', re.MULTILINE) -def preprocess_isa_desc(isa_desc): +# Function to replace a matched '##include' directive with the +# contents of the specified file (with nested ##includes replaced +# recursively). 'matchobj' is an re match object (from a match of +# includeRE) and 'dirname' is the directory relative to which the file +# path should be resolved. +def replace_include(matchobj, dirname): + fname = matchobj.group('filename') + full_fname = os.path.normpath(os.path.join(dirname, fname)) + contents = '##newfile "%s"\n%s\n##endfile\n' % \ + (full_fname, read_and_flatten(full_fname)) + return contents + +# Read a file and recursively flatten nested '##include' files. 
+def read_and_flatten(filename): + current_dir = os.path.dirname(filename) + try: + contents = open(filename).read() + except IOError: + error(0, 'Error including file "%s"' % filename) + fileNameStack.push((filename, 0)) # Find any includes and include them - pos = 0 - while 1: - m = includeRE.search(isa_desc, pos) - if not m: - break - filename = m.group('filename') - print 'Including file "%s"' % filename - try: - isa_desc = isa_desc[:m.start()] + \ - '##newfile "' + filename + '"\n' + \ - open(filename).read() + \ - '##endfile\n' + \ - isa_desc[m.end():] - except IOError: - error(0, 'Error including file "%s"' % (filename)) - pos = m.start() - return isa_desc + contents = includeRE.sub(lambda m: replace_include(m, current_dir), + contents) + fileNameStack.pop() + return contents # # Read in and parse the ISA description. # def parse_isa_desc(isa_desc_file, output_dir): - # set a global var for the input filename... used in error messages - global input_filename - input_filename = isa_desc_file - global fileNameStack - fileNameStack = [(input_filename, 1)] + # Read file and (recursively) all included files into a string. + # PLY requires that the input be in a single string so we have to + # do this up front. + isa_desc = read_and_flatten(isa_desc_file) - # Suck the ISA description file in. - input = open(isa_desc_file) - isa_desc = input.read() - input.close() - - # Perform Preprocessing - isa_desc = preprocess_isa_desc(isa_desc) + # Initialize filename stack with outer file. + fileNameStack.push((isa_desc_file, 0)) # Parse it. (isa_name, namespace, global_code, namespace_code) = yacc.parse(isa_desc) From 3fe35232322daef87a0b85d7f3ca4c18330ed7c4 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Wed, 24 May 2006 14:31:06 -0400 Subject: [PATCH 48/50] Support new flags now used instead of flags in decoder.isa. cpu/ozone/front_end_impl.hh: cpu/ozone/lw_back_end_impl.hh: cpu/ozone/lw_lsq_impl.hh: Support new flags added in. 
--HG-- extra : convert_revision : 2e756fd1913cf600650afc39dd715d59b9b89c42 --- cpu/ozone/front_end_impl.hh | 10 +++++++--- cpu/ozone/lw_back_end_impl.hh | 12 ++++++++---- cpu/ozone/lw_lsq_impl.hh | 7 +++---- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/cpu/ozone/front_end_impl.hh b/cpu/ozone/front_end_impl.hh index 15adae9b4..ffbcf3340 100644 --- a/cpu/ozone/front_end_impl.hh +++ b/cpu/ozone/front_end_impl.hh @@ -503,11 +503,14 @@ FrontEnd::processBarriers(DynInstPtr &inst) if (serializeNext) { inst->setSerializeBefore(); serializeNext = false; - } else if (!inst->isSerializing()) { + } else if (!inst->isSerializing() && + !inst->isIprAccess() && + !inst->isStoreConditional()) { return false; } - if (inst->isSerializeBefore() && !inst->isSerializeHandled()) { + if ((inst->isIprAccess() || inst->isSerializeBefore()) && + !inst->isSerializeHandled()) { DPRINTF(FE, "Serialize before instruction encountered.\n"); if (!inst->isTempSerializeBefore()) { @@ -523,7 +526,8 @@ FrontEnd::processBarriers(DynInstPtr &inst) barrierInst = inst; return true; - } else if (inst->isSerializeAfter() && !inst->isSerializeHandled()) { + } else if ((inst->isStoreConditional() || inst->isSerializeAfter()) + && !inst->isSerializeHandled()) { DPRINTF(FE, "Serialize after instruction encountered.\n"); inst->setSerializeHandled(); diff --git a/cpu/ozone/lw_back_end_impl.hh b/cpu/ozone/lw_back_end_impl.hh index 881d6e6b1..41b4ea24b 100644 --- a/cpu/ozone/lw_back_end_impl.hh +++ b/cpu/ozone/lw_back_end_impl.hh @@ -66,8 +66,9 @@ LWBackEnd::wakeDependents(DynInstPtr &inst, bool memory_deps) DPRINTF(BE, "Marking source reg ready [sn:%lli] in IQ\n", dep_inst->seqNum); if (dep_inst->readyToIssue() && dep_inst->isInROB() && - !dep_inst->isNonSpeculative() && - dep_inst->memDepReady() && !dep_inst->isMemBarrier() && !dep_inst->isWriteBarrier()) { + !dep_inst->isNonSpeculative() && !dep_inst->isStoreConditional() && + dep_inst->memDepReady() && !dep_inst->isMemBarrier() && + 
!dep_inst->isWriteBarrier()) { DPRINTF(BE, "Adding instruction to exeList [sn:%lli]\n", dep_inst->seqNum); exeList.push(dep_inst); @@ -768,7 +769,9 @@ LWBackEnd::dispatchInsts() } memBarrier = inst; inst->setCanCommit(); - } else if (inst->readyToIssue() && !inst->isNonSpeculative()) { + } else if (inst->readyToIssue() && + !inst->isNonSpeculative() && + !inst->isStoreConditional()) { if (inst->isMemRef()) { LSQ.insert(inst); @@ -803,7 +806,7 @@ LWBackEnd::dispatchInsts() exeList.push(inst); } } else { - if (inst->isNonSpeculative()) { + if (inst->isNonSpeculative() || inst->isStoreConditional()) { inst->setCanCommit(); DPRINTF(BE, "Adding non speculative instruction\n"); } @@ -1079,6 +1082,7 @@ LWBackEnd::commitInst(int inst_num) // or store inst. Signal backwards that it should be executed. if (!inst->isExecuted()) { if (inst->isNonSpeculative() || + inst->isStoreConditional() || inst->isMemBarrier() || inst->isWriteBarrier()) { #if !FULL_SYSTEM diff --git a/cpu/ozone/lw_lsq_impl.hh b/cpu/ozone/lw_lsq_impl.hh index 2f85a0396..f72bbb1cc 100644 --- a/cpu/ozone/lw_lsq_impl.hh +++ b/cpu/ozone/lw_lsq_impl.hh @@ -364,10 +364,9 @@ OzoneLWLSQ::executeStore(DynInstPtr &store_inst) if (store_fault != NoFault) { panic("Fault in a store instruction!"); storeFaultInst = store_inst; - } else if (store_inst->isNonSpeculative()) { - // Nonspeculative accesses (namely store conditionals) - // need to set themselves as able to writeback if we - // haven't had a fault by here. + } else if (store_inst->isStoreConditional()) { + // Store conditionals need to set themselves as able to + // writeback if we haven't had a fault by here. (*sq_it).canWB = true; ++storesToWB; From 32509d83878816cd870cea1ccbb8a9eb46a1e3f6 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Thu, 25 May 2006 11:50:42 -0400 Subject: [PATCH 49/50] Fix up kernel stats, allow them to not be used as well. arch/alpha/ev5.cc: Fix up some stuff I missed in the last kernel stats checkin. 
cpu/checker/cpu.cc: Allow the checker to disable its kernel stats. cpu/cpu_exec_context.cc: Allow CPUExecContext to be created without kernelStats. cpu/cpu_exec_context.hh: Allow CPUExecContext to be created without kernelStats. Default usage leaves kernelStats on. --HG-- extra : convert_revision : 8ed5bffd3a5b6275baa07fb4ea385eeab1a0456a --- arch/alpha/ev5.cc | 13 +++++++------ cpu/checker/cpu.cc | 4 ++-- cpu/cpu_exec_context.cc | 15 +++++++++++---- cpu/cpu_exec_context.hh | 3 ++- 4 files changed, 22 insertions(+), 13 deletions(-) diff --git a/arch/alpha/ev5.cc b/arch/alpha/ev5.cc index ad3a9ec4c..f113a2767 100644 --- a/arch/alpha/ev5.cc +++ b/arch/alpha/ev5.cc @@ -146,7 +146,8 @@ CPUExecContext::hwrei() setNextPC(readMiscReg(AlphaISA::IPR_EXC_ADDR)); if (!misspeculating()) { - kernelStats->hwrei(); + if (kernelStats) + kernelStats->hwrei(); cpu->checkInterrupts = true; } @@ -372,10 +373,9 @@ AlphaISA::MiscRegFile::setIpr(int idx, uint64_t val, ExecContext *xc) if (val & 0x18) { if (xc->getKernelStats()) xc->getKernelStats()->mode(Kernel::user, xc); - else { - if (xc->getKernelStats()) - xc->getKernelStats()->mode(Kernel::kernel, xc); - } + } else { + if (xc->getKernelStats()) + xc->getKernelStats()->mode(Kernel::kernel, xc); } case AlphaISA::IPR_ICM: @@ -562,7 +562,8 @@ AlphaISA::MiscRegFile::copyIprs(ExecContext *xc) bool CPUExecContext::simPalCheck(int palFunc) { - kernelStats->callpal(palFunc, proxy); + if (kernelStats) + kernelStats->callpal(palFunc, proxy); switch (palFunc) { case PAL::halt: diff --git a/cpu/checker/cpu.cc b/cpu/checker/cpu.cc index 08ab5d5c8..41ff6e769 100644 --- a/cpu/checker/cpu.cc +++ b/cpu/checker/cpu.cc @@ -103,7 +103,7 @@ CheckerCPU::setMemory(FunctionalMemory *mem) execContexts.push_back(xcProxy); #else if (systemPtr) { - cpuXC = new CPUExecContext(this, 0, systemPtr, itb, dtb, memPtr); + cpuXC = new CPUExecContext(this, 0, systemPtr, itb, dtb, memPtr, false); cpuXC->setStatus(ExecContext::Suspended); xcProxy = cpuXC->getProxy(); @@ 
-122,7 +122,7 @@ CheckerCPU::setSystem(System *system) systemPtr = system; if (memPtr) { - cpuXC = new CPUExecContext(this, 0, systemPtr, itb, dtb, memPtr); + cpuXC = new CPUExecContext(this, 0, systemPtr, itb, dtb, memPtr, false); cpuXC->setStatus(ExecContext::Suspended); xcProxy = cpuXC->getProxy(); diff --git a/cpu/cpu_exec_context.cc b/cpu/cpu_exec_context.cc index 78ce058e8..e30295ef8 100644 --- a/cpu/cpu_exec_context.cc +++ b/cpu/cpu_exec_context.cc @@ -53,8 +53,9 @@ using namespace std; // constructor #if FULL_SYSTEM CPUExecContext::CPUExecContext(BaseCPU *_cpu, int _thread_num, System *_sys, - AlphaITB *_itb, AlphaDTB *_dtb, - FunctionalMemory *_mem) + AlphaITB *_itb, AlphaDTB *_dtb, + FunctionalMemory *_mem, + bool use_kernel_stats) : _status(ExecContext::Unallocated), cpu(_cpu), thread_num(_thread_num), cpu_id(-1), lastActivate(0), lastSuspend(0), mem(_mem), itb(_itb), dtb(_dtb), system(_sys), memctrl(_sys->memctrl), physmem(_sys->physmem), @@ -79,6 +80,12 @@ CPUExecContext::CPUExecContext(BaseCPU *_cpu, int _thread_num, System *_sys, static ProfileNode dummyNode; profileNode = &dummyNode; profilePC = 3; + + if (use_kernel_stats) { + kernelStats = new Kernel::Statistics(system); + } else { + kernelStats = NULL; + } } #else CPUExecContext::CPUExecContext(BaseCPU *_cpu, int _thread_num, @@ -279,8 +286,8 @@ void CPUExecContext::regStats(const string &name) { #if FULL_SYSTEM - kernelStats = new Kernel::Statistics(system); - kernelStats->regStats(name + ".kern"); + if (kernelStats) + kernelStats->regStats(name + ".kern"); #endif } diff --git a/cpu/cpu_exec_context.hh b/cpu/cpu_exec_context.hh index 3d1428933..061fe450a 100644 --- a/cpu/cpu_exec_context.hh +++ b/cpu/cpu_exec_context.hh @@ -193,7 +193,8 @@ class CPUExecContext // constructor: initialize context from given process structure #if FULL_SYSTEM CPUExecContext(BaseCPU *_cpu, int _thread_num, System *_system, - AlphaITB *_itb, AlphaDTB *_dtb, FunctionalMemory *_dem); + AlphaITB *_itb, AlphaDTB *_dtb, 
FunctionalMemory *_mem, + bool use_kernel_stats = true); #else CPUExecContext(BaseCPU *_cpu, int _thread_num, Process *_process, int _asid); CPUExecContext(BaseCPU *_cpu, int _thread_num, FunctionalMemory *_mem, From f1fab2a4469d6cb2e55ebac15da02f8c1fcb7055 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Thu, 25 May 2006 17:56:01 -0400 Subject: [PATCH 50/50] Fix stat typo. --HG-- extra : convert_revision : f23d8c50f586fb8f25d4ce992730213f0c301b0f --- sim/pseudo_inst.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sim/pseudo_inst.cc b/sim/pseudo_inst.cc index 0c20a6a53..2d737c0a2 100644 --- a/sim/pseudo_inst.cc +++ b/sim/pseudo_inst.cc @@ -77,7 +77,7 @@ namespace AlphaPseudo xc->suspend(); if (xc->getKernelStats()) - xc->getKernelStats()->arm(); + xc->getKernelStats()->quiesce(); } void