Merge ktlim@zizzer:/bk/newmem

into zamp.eecs.umich.edu:/z/ktlim2/clean/tmp/clean2 src/cpu/base_dyn_inst.hh: Hand merge. Line is no longer needed because it's handled in the ISA. --HG-- extra : convert_revision : 0be4067aa38759a5631c6940f0167d48fde2b680
2007-03-23 13:20:19 -04:00 · 2007-03-23 13:20:19 -04:00 · 047f77102b
commit 047f77102b
parent 2c47413a7a 2330adfa28
21 changed files with 353 additions and 148 deletions
--- a/src/arch/alpha/isa/decoder.isa
+++ b/src/arch/alpha/isa/decoder.isa
@ -728,8 +728,10 @@ decode OPCODE default Unknown::unknown() {
        0: OpcdecFault::hw_st_quad();
        1: decode HW_LDST_QUAD {
            format HwLoad {
-                0: hw_ld({{ EA = (Rb + disp) & ~3; }}, {{ Ra = Mem.ul; }}, L);
-                1: hw_ld({{ EA = (Rb + disp) & ~7; }}, {{ Ra = Mem.uq; }}, Q);
+                0: hw_ld({{ EA = (Rb + disp) & ~3; }}, {{ Ra = Mem.ul; }},
+                         L, IsSerializing, IsSerializeBefore);
+                1: hw_ld({{ EA = (Rb + disp) & ~7; }}, {{ Ra = Mem.uq; }},
+                         Q, IsSerializing, IsSerializeBefore);
            }
        }
    }
@ -740,9 +742,9 @@ decode OPCODE default Unknown::unknown() {
            1: decode HW_LDST_COND {
                0: decode HW_LDST_QUAD {
                    0: hw_st({{ EA = (Rb + disp) & ~3; }},
-                {{ Mem.ul = Ra<31:0>; }}, L);
+                {{ Mem.ul = Ra<31:0>; }}, L, IsSerializing, IsSerializeBefore);
                    1: hw_st({{ EA = (Rb + disp) & ~7; }},
-                {{ Mem.uq = Ra.uq; }}, Q);
+                {{ Mem.uq = Ra.uq; }}, Q, IsSerializing, IsSerializeBefore);
                }

                1: FailUnimpl::hw_st_cond();
--- a/src/cpu/base.cc
+++ b/src/cpu/base.cc
@ -226,7 +226,8 @@ BaseCPU::startup()
 #endif

    if (params->progress_interval) {
-        new CPUProgressEvent(&mainEventQueue, params->progress_interval,
+        new CPUProgressEvent(&mainEventQueue,
+                             cycles(params->progress_interval),
                             this);
    }
 }
--- a/src/cpu/base_dyn_inst.hh
+++ b/src/cpu/base_dyn_inst.hh
@ -171,15 +171,15 @@ class BaseDynInst : public FastAlloc, public RefCounted
    /** The kind of fault this instruction has generated. */
    Fault fault;

-    /** The memory request. */
-    Request *req;
-
    /** Pointer to the data for the memory access. */
    uint8_t *memData;

    /** The effective virtual address (lds & stores only). */
    Addr effAddr;

+    /** Is the effective virtual address valid. */
+    bool effAddrValid;
+
    /** The effective physical address. */
    Addr physEffAddr;

@ -601,12 +601,18 @@ class BaseDynInst : public FastAlloc, public RefCounted
    /** Returns whether or not this instruction is ready to issue. */
    bool readyToIssue() const { return status[CanIssue]; }

+    /** Clears this instruction being able to issue. */
+    void clearCanIssue() { status.reset(CanIssue); }
+
    /** Sets this instruction as issued from the IQ. */
    void setIssued() { status.set(Issued); }

    /** Returns whether or not this instruction has issued. */
    bool isIssued() const { return status[Issued]; }

+    /** Clears this instruction as being issued. */
+    void clearIssued() { status.reset(Issued); }
+
    /** Sets this instruction as executed. */
    void setExecuted() { status.set(Executed); }

@ -729,6 +735,12 @@ class BaseDynInst : public FastAlloc, public RefCounted
     */
    bool eaCalcDone;

+    /** Is this instruction's memory access uncacheable. */
+    bool isUncacheable;
+
+    /** Has this instruction generated a memory request. */
+    bool reqMade;
+
  public:
    /** Sets the effective address. */
    void setEA(Addr &ea) { instEffAddr = ea; eaCalcDone = true; }
@ -745,6 +757,12 @@ class BaseDynInst : public FastAlloc, public RefCounted
    /** Whether or not the memory operation is done. */
    bool memOpDone;

+    /** Is this instruction's memory access uncacheable. */
+    bool uncacheable() { return isUncacheable; }
+
+    /** Has this instruction generated a memory request. */
+    bool hasRequest() { return reqMade; }
+
  public:
    /** Load queue index. */
    int16_t lqIdx;
@ -776,25 +794,25 @@ template<class T>
 inline Fault
 BaseDynInst<Impl>::read(Addr addr, T &data, unsigned flags)
 {
-    // Sometimes reads will get retried, so they may come through here
-    // twice.
-    if (!req) {
-        req = new Request();
+    reqMade = true;
+    Request *req = new Request();
    req->setVirt(asid, addr, sizeof(T), flags, this->PC);
    req->setThreadContext(thread->readCpuId(), threadNumber);
-    } else {
-        assert(addr == req->getVaddr());
-    }

    if ((req->getVaddr() & (TheISA::VMPageSize - 1)) + req->getSize() >
        TheISA::VMPageSize) {
+        delete req;
        return TheISA::genAlignmentFault();
    }

    fault = cpu->translateDataReadReq(req, thread);

+    if (req->isUncacheable())
+        isUncacheable = true;
+
    if (fault == NoFault) {
        effAddr = req->getVaddr();
+        effAddrValid = true;
        physEffAddr = req->getPaddr();
        memReqFlags = req->getFlags();

@ -817,6 +835,7 @@ BaseDynInst<Impl>::read(Addr addr, T &data, unsigned flags)
        // Commit will have to clean up whatever happened.  Set this
        // instruction as executed.
        this->setExecuted();
+        delete req;
    }

    if (traceData) {
@ -837,21 +856,25 @@ BaseDynInst<Impl>::write(T data, Addr addr, unsigned flags, uint64_t *res)
        traceData->setData(data);
    }

-    assert(req == NULL);
-
-    req = new Request();
+    reqMade = true;
+    Request *req = new Request();
    req->setVirt(asid, addr, sizeof(T), flags, this->PC);
    req->setThreadContext(thread->readCpuId(), threadNumber);

    if ((req->getVaddr() & (TheISA::VMPageSize - 1)) + req->getSize() >
        TheISA::VMPageSize) {
+        delete req;
        return TheISA::genAlignmentFault();
    }

    fault = cpu->translateDataWriteReq(req, thread);

+    if (req->isUncacheable())
+        isUncacheable = true;
+
    if (fault == NoFault) {
        effAddr = req->getVaddr();
+        effAddrValid = true;
        physEffAddr = req->getPaddr();
        memReqFlags = req->getFlags();
 #if 0
@ -863,12 +886,8 @@ BaseDynInst<Impl>::write(T data, Addr addr, unsigned flags, uint64_t *res)
 #else
        fault = cpu->write(req, data, sqIdx);
 #endif
-    }
-
-    if (res) {
-        // always return some result to keep misspeculated paths
-        // (which will ignore faults) deterministic
-        *res = (fault == NoFault) ? req->getExtraData() : 0;
+    } else {
+        delete req;
    }

    return fault;
--- a/src/cpu/base_dyn_inst_impl.hh
+++ b/src/cpu/base_dyn_inst_impl.hh
@ -92,11 +92,13 @@ template <class Impl>
 void
 BaseDynInst<Impl>::initVars()
 {
-    req = NULL;
    memData = NULL;
    effAddr = 0;
+    effAddrValid = false;
    physEffAddr = 0;

+    isUncacheable = false;
+    reqMade = false;
    readyRegs = 0;

    instResult.integer = 0;
@ -140,10 +142,6 @@ BaseDynInst<Impl>::initVars()
 template <class Impl>
 BaseDynInst<Impl>::~BaseDynInst()
 {
-    if (req) {
-        delete req;
-    }
-
    if (memData) {
        delete [] memData;
    }
@ -271,7 +269,7 @@ void
 BaseDynInst<Impl>::markSrcRegReady()
 {
    if (++readyRegs == numSrcRegs()) {
-        status.set(CanIssue);
+        setCanIssue();
    }
 }

--- a/src/cpu/o3/alpha/cpu_builder.cc
+++ b/src/cpu/o3/alpha/cpu_builder.cc
@ -50,11 +50,11 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(DerivO3CPU)
    Param<int> clock;
    Param<int> phase;
    Param<int> numThreads;
+Param<int> cpu_id;
 Param<int> activity;

 #if FULL_SYSTEM
 SimObjectParam<System *> system;
-Param<int> cpu_id;
 SimObjectParam<AlphaISA::ITB *> itb;
 SimObjectParam<AlphaISA::DTB *> dtb;
 Param<Tick> profile;
@ -161,11 +161,11 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivO3CPU)
    INIT_PARAM(clock, "clock speed"),
    INIT_PARAM_DFLT(phase, "clock phase", 0),
    INIT_PARAM(numThreads, "number of HW thread contexts"),
+    INIT_PARAM(cpu_id, "processor ID"),
    INIT_PARAM_DFLT(activity, "Initial activity count", 0),

 #if FULL_SYSTEM
    INIT_PARAM(system, "System object"),
-    INIT_PARAM(cpu_id, "processor ID"),
    INIT_PARAM(itb, "Instruction translation buffer"),
    INIT_PARAM(dtb, "Data translation buffer"),
    INIT_PARAM(profile, ""),
@ -305,14 +305,15 @@ CREATE_SIM_OBJECT(DerivO3CPU)
    AlphaSimpleParams *params = new AlphaSimpleParams;

    params->clock = clock;
+    params->phase = phase;

    params->name = getInstanceName();
    params->numberOfThreads = actual_num_threads;
+    params->cpu_id = cpu_id;
    params->activity = activity;

 #if FULL_SYSTEM
    params->system = system;
-    params->cpu_id = cpu_id;
    params->itb = itb;
    params->dtb = dtb;
    params->profile = profile;
--- a/src/cpu/o3/alpha/cpu_impl.hh
+++ b/src/cpu/o3/alpha/cpu_impl.hh
@ -114,6 +114,7 @@ AlphaO3CPU<Impl>::AlphaO3CPU(Params *params) : FullO3CPU<Impl>(params)
 #endif
        // Give the thread the TC.
        this->thread[i]->tc = tc;
+        this->thread[i]->setCpuId(params->cpu_id);

        // Add the TC to the CPU's list of TC's.
        this->threadContexts.push_back(tc);
--- a/src/cpu/o3/commit.hh
+++ b/src/cpu/o3/commit.hh
@ -247,6 +247,11 @@ class DefaultCommit
    /** Handles squashing due to a TC write. */
    void squashFromTC(unsigned tid);

+#if FULL_SYSTEM
+    /** Handles processing an interrupt. */
+    void handleInterrupt();
+#endif // FULL_SYSTEM
+
    /** Commits as many instructions as possible. */
    void commitInsts();

@ -409,6 +414,16 @@ class DefaultCommit
    /** The sequence number of the youngest valid instruction in the ROB. */
    InstSeqNum youngestSeqNum[Impl::MaxThreads];

+    /** Records if there is a trap currently in flight. */
+    bool trapInFlight[Impl::MaxThreads];
+
+    /** Records if there were any stores committed this cycle. */
+    bool committedStores[Impl::MaxThreads];
+
+    /** Records if commit should check if the ROB is truly empty (see
+        commit_impl.hh). */
+    bool checkEmptyROB[Impl::MaxThreads];
+
    /** Pointer to the list of active threads. */
    std::list<unsigned> *activeThreads;

--- a/src/cpu/o3/commit_impl.hh
+++ b/src/cpu/o3/commit_impl.hh
@ -118,6 +118,9 @@ DefaultCommit<Impl>::DefaultCommit(Params *params)
    for (int i=0; i < numThreads; i++) {
        commitStatus[i] = Idle;
        changedROBNumEntries[i] = false;
+        checkEmptyROB[i] = false;
+        trapInFlight[i] = false;
+        committedStores[i] = false;
        trapSquash[i] = false;
        tcSquash[i] = false;
        PC[i] = nextPC[i] = nextNPC[i] = 0;
@ -335,6 +338,7 @@ DefaultCommit<Impl>::initStage()
    for (int i=0; i < numThreads; i++) {
        toIEW->commitInfo[i].usedROB = true;
        toIEW->commitInfo[i].freeROBEntries = rob->numFreeEntries(i);
+        toIEW->commitInfo[i].emptyROB = true;
    }

    cpu->activityThisCycle();
@ -473,14 +477,14 @@ DefaultCommit<Impl>::generateTrapEvent(unsigned tid)
    TrapEvent *trap = new TrapEvent(this, tid);

    trap->schedule(curTick + trapLatency);
-
-    thread[tid]->trapPending = true;
+    trapInFlight[tid] = true;
 }

 template <class Impl>
 void
 DefaultCommit<Impl>::generateTCEvent(unsigned tid)
 {
+    assert(!trapInFlight[tid]);
    DPRINTF(Commit, "Generating TC squash event for [tid:%i]\n", tid);

    tcSquash[tid] = true;
@ -495,7 +499,7 @@ DefaultCommit<Impl>::squashAll(unsigned tid)
    // Hopefully this doesn't mess things up.  Basically I want to squash
    // all instructions of this thread.
    InstSeqNum squashed_inst = rob->isEmpty() ?
-        0 : rob->readHeadInst(tid)->seqNum - 1;;
+        0 : rob->readHeadInst(tid)->seqNum - 1;

    // All younger instructions will be squashed. Set the sequence
    // number as the youngest instruction in the ROB (0 in this case.
@ -532,6 +536,7 @@ DefaultCommit<Impl>::squashFromTrap(unsigned tid)

    thread[tid]->trapPending = false;
    thread[tid]->inSyscall = false;
+    trapInFlight[tid] = false;

    trapSquash[tid] = false;

@ -580,6 +585,10 @@ DefaultCommit<Impl>::tick()
    while (threads != end) {
        unsigned tid = *threads++;

+        // Clear the bit saying if the thread has committed stores
+        // this cycle.
+        committedStores[tid] = false;
+
        if (commitStatus[tid] == ROBSquashing) {

            if (rob->isDoneSquashing(tid)) {
@ -635,16 +644,11 @@ DefaultCommit<Impl>::tick()
    updateStatus();
 }

+#if FULL_SYSTEM
 template <class Impl>
 void
-DefaultCommit<Impl>::commit()
+DefaultCommit<Impl>::handleInterrupt()
 {
-
-    //////////////////////////////////////
-    // Check for interrupts
-    //////////////////////////////////////
-
-#if FULL_SYSTEM
    if (interrupt != NoFault) {
        // Wait until the ROB is empty and all stores have drained in
        // order to enter the interrupt.
@ -653,6 +657,12 @@ DefaultCommit<Impl>::commit()
            // an interrupt needed to be handled.
            DPRINTF(Commit, "Interrupt detected.\n");

+            Fault new_interrupt = cpu->getInterrupts();
+            assert(new_interrupt == interrupt);
+
+            // Clear the interrupt now that it's going to be handled
+            toIEW->commitInfo[0].clearInterrupt = true;
+
            assert(!thread[0]->inSyscall);
            thread[0]->inSyscall = true;

@ -666,14 +676,12 @@ DefaultCommit<Impl>::commit()
            // Generate trap squash event.
            generateTrapEvent(0);

-            // Clear the interrupt now that it's been handled
-            toIEW->commitInfo[0].clearInterrupt = true;
            interrupt = NoFault;
        } else {
            DPRINTF(Commit, "Interrupt pending, waiting for ROB to empty.\n");
        }
-    } else if (cpu->check_interrupts(cpu->tcBase(0)) &&
-        commitStatus[0] != TrapPending &&
+    } else if (commitStatus[0] != TrapPending &&
+               cpu->check_interrupts(cpu->tcBase(0)) &&
               !trapSquash[0] &&
               !tcSquash[0]) {
        // Process interrupts if interrupts are enabled, not in PAL
@ -691,7 +699,21 @@ DefaultCommit<Impl>::commit()
            toIEW->commitInfo[0].interruptPending = true;
        }
    }
+}
+#endif // FULL_SYSTEM

+template <class Impl>
+void
+DefaultCommit<Impl>::commit()
+{
+
+#if FULL_SYSTEM
+    // Check for any interrupt, and start processing it.  Or if we
+    // have an outstanding interrupt and are at a point when it is
+    // valid to take an interrupt, process it.
+    if (cpu->check_interrupts(cpu->tcBase(0))) {
+        handleInterrupt();
+    }
 #endif // FULL_SYSTEM

    ////////////////////////////////////
@ -709,6 +731,7 @@ DefaultCommit<Impl>::commit()
            assert(!tcSquash[tid]);
            squashFromTrap(tid);
        } else if (tcSquash[tid] == true) {
+            assert(commitStatus[tid] != TrapPending);
            squashFromTC(tid);
        }

@ -753,6 +776,7 @@ DefaultCommit<Impl>::commit()
                bdelay_done_seq_num--;
 #endif
            }
+
            // All younger instructions will be squashed. Set the sequence
            // number as the youngest instruction in the ROB.
            youngestSeqNum[tid] = squashed_inst;
@ -817,13 +841,29 @@ DefaultCommit<Impl>::commit()
            toIEW->commitInfo[tid].usedROB = true;
            toIEW->commitInfo[tid].freeROBEntries = rob->numFreeEntries(tid);

-            if (rob->isEmpty(tid)) {
-                toIEW->commitInfo[tid].emptyROB = true;
-            }
-
            wroteToTimeBuffer = true;
            changedROBNumEntries[tid] = false;
+            if (rob->isEmpty(tid))
+                checkEmptyROB[tid] = true;
        }
+
+        // ROB is only considered "empty" for previous stages if: a)
+        // ROB is empty, b) there are no outstanding stores, c) IEW
+        // stage has received any information regarding stores that
+        // committed.
+        // c) is checked by making sure to not consider the ROB empty
+        // on the same cycle as when stores have been committed.
+        // @todo: Make this handle multi-cycle communication between
+        // commit and IEW.
+        if (checkEmptyROB[tid] && rob->isEmpty(tid) &&
+            !iewStage->hasStoresToWB() && !committedStores[tid]) {
+            checkEmptyROB[tid] = false;
+            toIEW->commitInfo[tid].usedROB = true;
+            toIEW->commitInfo[tid].emptyROB = true;
+            toIEW->commitInfo[tid].freeROBEntries = rob->numFreeEntries(tid);
+            wroteToTimeBuffer = true;
+        }
+
    }
 }

@ -966,8 +1006,6 @@ DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
        // and committed this instruction.
        thread[tid]->funcExeInst--;

-        head_inst->setAtCommit();
-
        if (head_inst->isNonSpeculative() ||
            head_inst->isStoreConditional() ||
            head_inst->isMemBarrier() ||
@ -977,19 +1015,9 @@ DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
                    "instruction [sn:%lli] at the head of the ROB, PC %#x.\n",
                    head_inst->seqNum, head_inst->readPC());

-            // Hack to make sure syscalls/memory barriers/quiesces
-            // aren't executed until all stores write back their data.
-            // This direct communication shouldn't be used for
-            // anything other than this.
-            if ((head_inst->isMemBarrier() || head_inst->isWriteBarrier() ||
-                    head_inst->isQuiesce()) &&
-                iewStage->hasStoresToWB())
-            {
+            if (inst_num > 0 || iewStage->hasStoresToWB()) {
                DPRINTF(Commit, "Waiting for all stores to writeback.\n");
                return false;
-            } else if (inst_num > 0 || iewStage->hasStoresToWB()) {
-                DPRINTF(Commit, "Waiting to become head of commit.\n");
-                return false;
            }

            toIEW->commitInfo[tid].nonSpecSeqNum = head_inst->seqNum;
@ -1002,6 +1030,12 @@ DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)

            return false;
        } else if (head_inst->isLoad()) {
+            if (inst_num > 0 || iewStage->hasStoresToWB()) {
+                DPRINTF(Commit, "Waiting for all stores to writeback.\n");
+                return false;
+            }
+
+            assert(head_inst->uncacheable());
            DPRINTF(Commit, "[sn:%lli]: Uncached load, PC %#x.\n",
                    head_inst->seqNum, head_inst->readPC());

@ -1025,8 +1059,11 @@ DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
        panic("Thread sync instructions are not handled yet.\n");
    }

+    // Check if the instruction caused a fault.  If so, trap.
+    Fault inst_fault = head_inst->getFault();
+
    // Stores mark themselves as completed.
-    if (!head_inst->isStore()) {
+    if (!head_inst->isStore() && inst_fault == NoFault) {
        head_inst->setCompleted();
    }

@ -1038,9 +1075,6 @@ DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
    }
 #endif

-    // Check if the instruction caused a fault.  If so, trap.
-    Fault inst_fault = head_inst->getFault();
-
    // DTB will sometimes need the machine instruction for when
    // faults happen.  So we will set it here, prior to the DTB
    // possibly needing it for its fault.
@ -1048,7 +1082,6 @@ DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
        static_cast<TheISA::MachInst>(head_inst->staticInst->machInst));

    if (inst_fault != NoFault) {
-        head_inst->setCompleted();
        DPRINTF(Commit, "Inst [sn:%lli] PC %#x has a fault\n",
                head_inst->seqNum, head_inst->readPC());

@ -1057,6 +1090,8 @@ DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
            return false;
        }

+        head_inst->setCompleted();
+
 #if USE_CHECKER
        if (cpu->checker && head_inst->isStore()) {
            cpu->checker->verify(head_inst);
@ -1082,6 +1117,13 @@ DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)

        commitStatus[tid] = TrapPending;

+        if (head_inst->traceData) {
+            head_inst->traceData->setFetchSeq(head_inst->seqNum);
+            head_inst->traceData->setCPSeq(thread[tid]->numInst);
+            head_inst->traceData->finalize();
+            head_inst->traceData = NULL;
+        }
+
        // Generate trap squash event.
        generateTrapEvent(tid);
 //        warn("%lli fault (%d) handled @ PC %08p", curTick, inst_fault->name(), head_inst->readPC());
@ -1123,6 +1165,10 @@ DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
    // Finally clear the head ROB entry.
    rob->retireHead(tid);

+    // If this was a store, record it for this cycle.
+    if (head_inst->isStore())
+        committedStores[tid] = true;
+
    // Return true to indicate that we have committed an instruction.
    return true;
 }
@ -1167,7 +1213,8 @@ DefaultCommit<Impl>::getInsts()
        int tid = inst->threadNumber;

        if (!inst->isSquashed() &&
-            commitStatus[tid] != ROBSquashing) {
+            commitStatus[tid] != ROBSquashing &&
+            commitStatus[tid] != TrapPending) {
            changedROBNumEntries[tid] = true;

            DPRINTF(Commit, "Inserting PC %#x [sn:%i] [tid:%i] into ROB.\n",
--- a/src/cpu/o3/cpu.cc
+++ b/src/cpu/o3/cpu.cc
@ -466,7 +466,7 @@ FullO3CPU<Impl>::tick()
            lastRunningCycle = curTick;
            timesIdled++;
        } else {
-            tickEvent.schedule(curTick + cycles(1));
+            tickEvent.schedule(nextCycle(curTick + cycles(1)));
            DPRINTF(O3CPU, "Scheduling next tick!\n");
        }
    }
@ -886,7 +886,7 @@ FullO3CPU<Impl>::resume()
 #endif

    if (!tickEvent.scheduled())
-        tickEvent.schedule(curTick);
+        tickEvent.schedule(nextCycle());
    _status = Running;
 }

@ -979,11 +979,11 @@ FullO3CPU<Impl>::takeOverFrom(BaseCPU *oldCPU)
        ThreadContext *tc = threadContexts[i];
        if (tc->status() == ThreadContext::Active && _status != Running) {
            _status = Running;
-            tickEvent.schedule(curTick);
+            tickEvent.schedule(nextCycle());
        }
    }
    if (!tickEvent.scheduled())
-        tickEvent.schedule(curTick);
+        tickEvent.schedule(nextCycle());
 }

 template <class Impl>
@ -1393,7 +1393,7 @@ FullO3CPU<Impl>::wakeCPU()

    idleCycles += (curTick - 1) - lastRunningCycle;

-    tickEvent.schedule(curTick);
+    tickEvent.schedule(nextCycle());
 }

 template <class Impl>
--- a/src/cpu/o3/cpu.hh
+++ b/src/cpu/o3/cpu.hh
@ -146,9 +146,9 @@ class FullO3CPU : public BaseO3CPU
    void scheduleTickEvent(int delay)
    {
        if (tickEvent.squashed())
-            tickEvent.reschedule(curTick + cycles(delay));
+            tickEvent.reschedule(nextCycle(curTick + cycles(delay)));
        else if (!tickEvent.scheduled())
-            tickEvent.schedule(curTick + cycles(delay));
+            tickEvent.schedule(nextCycle(curTick + cycles(delay)));
    }

    /** Unschedule tick event, regardless of its current state. */
@ -186,9 +186,11 @@ class FullO3CPU : public BaseO3CPU
    {
        // Schedule thread to activate, regardless of its current state.
        if (activateThreadEvent[tid].squashed())
-            activateThreadEvent[tid].reschedule(curTick + cycles(delay));
+            activateThreadEvent[tid].
+                reschedule(nextCycle(curTick + cycles(delay)));
        else if (!activateThreadEvent[tid].scheduled())
-            activateThreadEvent[tid].schedule(curTick + cycles(delay));
+            activateThreadEvent[tid].
+                schedule(nextCycle(curTick + cycles(delay)));
    }

    /** Unschedule activate thread event, regardless of its current state. */
@ -235,9 +237,11 @@ class FullO3CPU : public BaseO3CPU
    {
        // Schedule thread to activate, regardless of its current state.
        if (deallocateContextEvent[tid].squashed())
-            deallocateContextEvent[tid].reschedule(curTick + cycles(delay));
+            deallocateContextEvent[tid].
+                reschedule(nextCycle(curTick + cycles(delay)));
        else if (!deallocateContextEvent[tid].scheduled())
-            deallocateContextEvent[tid].schedule(curTick + cycles(delay));
+            deallocateContextEvent[tid].
+                schedule(nextCycle(curTick + cycles(delay)));
    }

    /** Unschedule thread deallocation in CPU */
--- a/src/cpu/o3/fetch_impl.hh
+++ b/src/cpu/o3/fetch_impl.hh
@ -620,6 +620,7 @@ DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid
                fault = TheISA::genMachineCheckFault();
                delete mem_req;
                memReq[tid] = NULL;
+                warn("Bad address!\n");
            }
            assert(retryPkt == NULL);
            assert(retryTid == -1);
@ -670,11 +671,12 @@ DefaultFetch<Impl>::doSquash(const Addr &new_PC,
    // Get rid of the retrying packet if it was from this thread.
    if (retryTid == tid) {
        assert(cacheBlocked);
-        cacheBlocked = false;
-        retryTid = -1;
+        if (retryPkt) {
            delete retryPkt->req;
            delete retryPkt;
+        }
        retryPkt = NULL;
+        retryTid = -1;
    }

    fetchStatus[tid] = Squashing;
@ -1150,7 +1152,7 @@ DefaultFetch<Impl>::fetch(bool &status_change)

            ///FIXME This needs to be more robust in dealing with delay slots
 #if !ISA_HAS_DELAY_SLOT
-            predicted_branch |=
+//	    predicted_branch |=
 #endif
            lookupAndUpdateNextPC(instruction, next_PC, next_NPC);
            predicted_branch |= (next_PC != fetch_NPC);
@ -1221,7 +1223,7 @@ DefaultFetch<Impl>::fetch(bool &status_change)
        // until commit handles the fault.  The only other way it can
        // wake up is if a squash comes along and changes the PC.
 #if FULL_SYSTEM
-        assert(numInst != fetchWidth);
+        assert(numInst < fetchWidth);
        // Get a sequence number.
        inst_seq = cpu->getAndIncrementInstSeq();
        // We will use a nop in order to carry the fault.
--- a/src/cpu/o3/iew_impl.hh
+++ b/src/cpu/o3/iew_impl.hh
@ -1152,19 +1152,6 @@ DefaultIEW<Impl>::dispatchInsts(unsigned tid)
            // Same as non-speculative stores.
            inst->setCanCommit();
            instQueue.insertBarrier(inst);
-            add_to_iq = false;
-        } else if (inst->isNonSpeculative()) {
-            DPRINTF(IEW, "[tid:%i]: Issue: Nonspeculative instruction "
-                    "encountered, skipping.\n", tid);
-
-            // Same as non-speculative stores.
-            inst->setCanCommit();
-
-            // Specifically insert it as nonspeculative.
-            instQueue.insertNonSpec(inst);
-
-            ++iewDispNonSpecInsts;
-
            add_to_iq = false;
        } else if (inst->isNop()) {
            DPRINTF(IEW, "[tid:%i]: Issue: Nop instruction encountered, "
@ -1193,6 +1180,20 @@ DefaultIEW<Impl>::dispatchInsts(unsigned tid)
        } else {
            add_to_iq = true;
        }
+        if (inst->isNonSpeculative()) {
+            DPRINTF(IEW, "[tid:%i]: Issue: Nonspeculative instruction "
+                    "encountered, skipping.\n", tid);
+
+            // Same as non-speculative stores.
+            inst->setCanCommit();
+
+            // Specifically insert it as nonspeculative.
+            instQueue.insertNonSpec(inst);
+
+            ++iewDispNonSpecInsts;
+
+            add_to_iq = false;
+        }

        // If the instruction queue is not full, then add the
        // instruction.
@ -1379,6 +1380,7 @@ DefaultIEW<Impl>::executeInsts()
                    predictedNotTakenIncorrect++;
                }
            } else if (ldstQueue.violation(tid)) {
+                assert(inst->isMemRef());
                // If there was an ordering violation, then get the
                // DynInst that caused the violation.  Note that this
                // clears the violation signal.
@ -1391,10 +1393,10 @@ DefaultIEW<Impl>::executeInsts()

                // Ensure the violating instruction is older than
                // current squash
-                if (fetchRedirect[tid] &&
-                    violator->seqNum >= toCommit->squashedSeqNum[tid])
+/*                if (fetchRedirect[tid] &&
+                    violator->seqNum >= toCommit->squashedSeqNum[tid] + 1)
                    continue;
-
+*/
                fetchRedirect[tid] = true;

                // Tell the instruction queue that a violation has occurred.
@ -1414,6 +1416,33 @@ DefaultIEW<Impl>::executeInsts()

                squashDueToMemBlocked(inst, tid);
            }
+        } else {
+            // Reset any state associated with redirects that will not
+            // be used.
+            if (ldstQueue.violation(tid)) {
+                assert(inst->isMemRef());
+
+                DynInstPtr violator = ldstQueue.getMemDepViolator(tid);
+
+                DPRINTF(IEW, "LDSTQ detected a violation.  Violator PC: "
+                        "%#x, inst PC: %#x.  Addr is: %#x.\n",
+                        violator->readPC(), inst->readPC(), inst->physEffAddr);
+                DPRINTF(IEW, "Violation will not be handled because "
+                        "already squashing\n");
+
+                ++memOrderViolationEvents;
+            }
+            if (ldstQueue.loadBlocked(tid) &&
+                !ldstQueue.isLoadBlockedHandled(tid)) {
+                DPRINTF(IEW, "Load operation couldn't execute because the "
+                        "memory system is blocked.  PC: %#x [sn:%lli]\n",
+                        inst->readPC(), inst->seqNum);
+                DPRINTF(IEW, "Blocked load will not be handled because "
+                        "already squashing\n");
+
+                ldstQueue.setLoadBlockedHandled(tid);
+            }
+
        }
    }

@ -1563,6 +1592,7 @@ DefaultIEW<Impl>::tick()
            //DPRINTF(IEW,"NonspecInst from thread %i",tid);
            if (fromCommit->commitInfo[tid].uncached) {
                instQueue.replayMemInst(fromCommit->commitInfo[tid].uncachedLoad);
+                fromCommit->commitInfo[tid].uncachedLoad->setAtCommit();
            } else {
                instQueue.scheduleNonSpec(
                    fromCommit->commitInfo[tid].nonSpecSeqNum);
--- a/src/cpu/o3/inst_queue_impl.hh
+++ b/src/cpu/o3/inst_queue_impl.hh
@ -829,6 +829,8 @@ InstructionQueue<Impl>::scheduleNonSpec(const InstSeqNum &inst)

    unsigned tid = (*inst_it).second->threadNumber;

+    (*inst_it).second->setAtCommit();
+
    (*inst_it).second->setCanIssue();

    if (!(*inst_it).second->isMemRef()) {
@ -960,6 +962,8 @@ template <class Impl>
 void
 InstructionQueue<Impl>::rescheduleMemInst(DynInstPtr &resched_inst)
 {
+    DPRINTF(IQ, "Rescheduling mem inst [sn:%lli]\n", resched_inst->seqNum);
+    resched_inst->clearCanIssue();
    memDepUnit[resched_inst->threadNumber].reschedule(resched_inst);
 }

@ -984,7 +988,6 @@ InstructionQueue<Impl>::completeMemInst(DynInstPtr &completed_inst)
    completed_inst->memOpDone = true;

    memDepUnit[tid].completed(completed_inst);
-
    count[tid]--;
 }

@ -1084,10 +1087,14 @@ InstructionQueue<Impl>::doSquash(unsigned tid)

                    ++iqSquashedOperandsExamined;
                }
-            } else if (!squashed_inst->isStoreConditional() || !squashed_inst->isCompleted()) {
+            } else if (!squashed_inst->isStoreConditional() ||
+                       !squashed_inst->isCompleted()) {
                NonSpecMapIt ns_inst_it =
                    nonSpecInsts.find(squashed_inst->seqNum);
                assert(ns_inst_it != nonSpecInsts.end());
+                if (ns_inst_it == nonSpecInsts.end()) {
+                    assert(squashed_inst->getFault() != NoFault);
+                } else {

                    (*ns_inst_it).second = NULL;

@ -1095,6 +1102,7 @@ InstructionQueue<Impl>::doSquash(unsigned tid)

                    ++iqSquashedNonSpecRemoved;
                }
+            }

            // Might want to also clear out the head of the dependency graph.

--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@ -497,6 +497,11 @@ LSQUnit<Impl>::read(Request *req, T &data, int load_idx)
        (load_idx != loadHead || !load_inst->isAtCommit())) {
        iewStage->rescheduleMemInst(load_inst);
        ++lsqRescheduledLoads;
+
+        // Must delete request now that it wasn't handed off to
+        // memory.  This is quite ugly.  @todo: Figure out the proper
+        // place to really handle request deletes.
+        delete req;
        return TheISA::genMachineCheckFault();
    }

@ -534,6 +539,10 @@ LSQUnit<Impl>::read(Request *req, T &data, int load_idx)

        if (store_size == 0)
            continue;
+        else if (storeQueue[store_idx].inst->uncacheable())
+            continue;
+
+        assert(storeQueue[store_idx].inst->effAddrValid);

        // Check if the store data is within the lower and upper bounds of
        // addresses that the request needs.
@ -550,7 +559,7 @@ LSQUnit<Impl>::read(Request *req, T &data, int load_idx)
            storeQueue[store_idx].inst->effAddr;

        // If the store's data has all of the data needed, we can forward.
-        if (store_has_lower_limit && store_has_upper_limit) {
+        if ((store_has_lower_limit && store_has_upper_limit)) {
            // Get shift amount for offset into the store's data.
            int shift_amt = req->getVaddr() & (store_size - 1);
            // @todo: Magic number, assumes byte addressing
@ -596,6 +605,7 @@ LSQUnit<Impl>::read(Request *req, T &data, int load_idx)
            // If it's already been written back, then don't worry about
            // stalling on it.
            if (storeQueue[store_idx].completed) {
+                panic("Should not check one of these");
                continue;
            }

@ -614,6 +624,7 @@ LSQUnit<Impl>::read(Request *req, T &data, int load_idx)
            // rescheduled eventually
            iewStage->rescheduleMemInst(load_inst);
            iewStage->decrWb(load_inst->seqNum);
+            load_inst->clearIssued();
            ++lsqRescheduledLoads;

            // Do not generate a writeback event as this instruction is not
@ -622,7 +633,11 @@ LSQUnit<Impl>::read(Request *req, T &data, int load_idx)
                    "Store idx %i to load addr %#x\n",
                    store_idx, req->getVaddr());

-            ++lsqBlockedLoads;
+            // Must delete request now that it wasn't handed off to
+            // memory.  This is quite ugly.  @todo: Figure out the
+            // proper place to really handle request deletes.
+            delete req;
+
            return NoFault;
        }
    }
@ -654,8 +669,11 @@ LSQUnit<Impl>::read(Request *req, T &data, int load_idx)
            // Delete state and data packet because a load retry
            // initiates a pipeline restart; it does not retry.
            delete state;
+            delete data_pkt->req;
            delete data_pkt;

+            req = NULL;
+
            if (result == Packet::BadAddress) {
                return TheISA::genMachineCheckFault();
            }
@ -669,6 +687,9 @@ LSQUnit<Impl>::read(Request *req, T &data, int load_idx)
    // If the cache was blocked, or has become blocked due to the access,
    // handle it.
    if (lsq->cacheBlocked()) {
+        if (req)
+            delete req;
+
        ++lsqCacheBlocked;

        iewStage->decrWb(load_inst->seqNum);
--- a/src/cpu/o3/lsq_unit_impl.hh
+++ b/src/cpu/o3/lsq_unit_impl.hh
@ -81,6 +81,7 @@ LSQUnit<Impl>::completeDataAccess(PacketPtr pkt)
    if (isSwitchedOut() || inst->isSquashed()) {
        iewStage->decrWb(inst->seqNum);
        delete state;
+        delete pkt->req;
        delete pkt;
        return;
    } else {
@ -94,6 +95,7 @@ LSQUnit<Impl>::completeDataAccess(PacketPtr pkt)
    }

    delete state;
+    delete pkt->req;
    delete pkt;
 }

@ -403,12 +405,15 @@ template <class Impl>
 Fault
 LSQUnit<Impl>::executeLoad(DynInstPtr &inst)
 {
+    using namespace TheISA;
    // Execute a specific load.
    Fault load_fault = NoFault;

    DPRINTF(LSQUnit, "Executing load PC %#x, [sn:%lli]\n",
            inst->readPC(),inst->seqNum);

+    assert(!inst->isSquashed());
+
    load_fault = inst->initiateAcc();

    // If the instruction faulted, then we need to send it along to commit
@ -418,12 +423,44 @@ LSQUnit<Impl>::executeLoad(DynInstPtr &inst)
        // realizes there is activity.
        // Mark it as executed unless it is an uncached load that
        // needs to hit the head of commit.
-        if (!(inst->req && inst->req->isUncacheable()) ||
+        if (!(inst->hasRequest() && inst->uncacheable()) ||
            inst->isAtCommit()) {
            inst->setExecuted();
        }
        iewStage->instToCommit(inst);
        iewStage->activityThisCycle();
+    } else if (!loadBlocked()) {
+        assert(inst->effAddrValid);
+        int load_idx = inst->lqIdx;
+        incrLdIdx(load_idx);
+        while (load_idx != loadTail) {
+            // Really only need to check loads that have actually executed
+
+            // @todo: For now this is extra conservative, detecting a
+            // violation if the addresses match assuming all accesses
+            // are quad word accesses.
+
+            // @todo: Fix this, magic number being used here
+            if (loadQueue[load_idx]->effAddrValid &&
+                (loadQueue[load_idx]->effAddr >> 8) ==
+                (inst->effAddr >> 8)) {
+                // A load incorrectly passed this load.  Squash and refetch.
+                // For now return a fault to show that it was unsuccessful.
+                DynInstPtr violator = loadQueue[load_idx];
+                if (!memDepViolator ||
+                    (violator->seqNum < memDepViolator->seqNum)) {
+                    memDepViolator = violator;
+                } else {
+                    break;
+                }
+
+                ++lsqMemOrderViolation;
+
+                return genMachineCheckFault();
+            }
+
+            incrLdIdx(load_idx);
+        }
    }

    return load_fault;
@ -442,6 +479,8 @@ LSQUnit<Impl>::executeStore(DynInstPtr &store_inst)
    DPRINTF(LSQUnit, "Executing store PC %#x [sn:%lli]\n",
            store_inst->readPC(), store_inst->seqNum);

+    assert(!store_inst->isSquashed());
+
    // Check the recently completed loads to see if any match this store's
    // address.  If so, then we have a memory ordering violation.
    int load_idx = store_inst->lqIdx;
@ -465,7 +504,7 @@ LSQUnit<Impl>::executeStore(DynInstPtr &store_inst)
        ++storesToWB;
    }

-    if (!memDepViolator) {
+    assert(store_inst->effAddrValid);
    while (load_idx != loadTail) {
        // Really only need to check loads that have actually executed
        // It's safe to check all loads because effAddr is set to
@ -476,11 +515,19 @@ LSQUnit<Impl>::executeStore(DynInstPtr &store_inst)
        // are quad word accesses.

        // @todo: Fix this, magic number being used here
-            if ((loadQueue[load_idx]->effAddr >> 8) ==
+        if (loadQueue[load_idx]->effAddrValid &&
+            (loadQueue[load_idx]->effAddr >> 8) ==
            (store_inst->effAddr >> 8)) {
            // A load incorrectly passed this store.  Squash and refetch.
            // For now return a fault to show that it was unsuccessful.
-                memDepViolator = loadQueue[load_idx];
+            DynInstPtr violator = loadQueue[load_idx];
+            if (!memDepViolator ||
+                (violator->seqNum < memDepViolator->seqNum)) {
+                memDepViolator = violator;
+            } else {
+                break;
+            }
+
            ++lsqMemOrderViolation;

            return genMachineCheckFault();
@ -489,10 +536,6 @@ LSQUnit<Impl>::executeStore(DynInstPtr &store_inst)
        incrLdIdx(load_idx);
    }

-        // If we've reached this point, there was no violation.
-        memDepViolator = NULL;
-    }
-
    return store_fault;
 }

@ -660,7 +703,7 @@ LSQUnit<Impl>::writebackStores()
                panic("LSQ sent out a bad address for a completed store!");
            }
            // Need to handle becoming blocked on a store.
-            DPRINTF(IEW, "D-Cache became blcoked when writing [sn:%lli], will"
+            DPRINTF(IEW, "D-Cache became blocked when writing [sn:%lli], will"
                    "retry later\n",
                    inst->seqNum);
            isStoreBlocked = true;
@ -735,6 +778,10 @@ LSQUnit<Impl>::squash(const InstSeqNum &squashed_num)
        }
    }

+    if (memDepViolator && squashed_num < memDepViolator->seqNum) {
+        memDepViolator = NULL;
+    }
+
    int store_idx = storeTail;
    decrStIdx(store_idx);

@ -764,6 +811,11 @@ LSQUnit<Impl>::squash(const InstSeqNum &squashed_num)
        storeQueue[store_idx].inst = NULL;
        storeQueue[store_idx].canWB = 0;

+        // Must delete request now that it wasn't handed off to
+        // memory.  This is quite ugly.  @todo: Figure out the proper
+        // place to really handle request deletes.
+        delete storeQueue[store_idx].req;
+
        storeQueue[store_idx].req = NULL;
        --stores;

--- a/src/cpu/o3/mem_dep_unit_impl.hh
+++ b/src/cpu/o3/mem_dep_unit_impl.hh
@ -214,6 +214,9 @@ MemDepUnit<MemDepPred, Impl>::insert(DynInstPtr &inst)
            inst_entry->regsReady = true;
        }

+        // Clear the bit saying this instruction can issue.
+        inst->clearCanIssue();
+
        // Add this instruction to the list of dependents.
        store_entry->dependInsts.push_back(inst_entry);

@ -357,7 +360,6 @@ void
 MemDepUnit<MemDepPred, Impl>::replay(DynInstPtr &inst)
 {
    DynInstPtr temp_inst;
-    bool found_inst = false;

    // For now this replay function replays all waiting memory ops.
    while (!instsToReplay.empty()) {
@ -371,14 +373,8 @@ MemDepUnit<MemDepPred, Impl>::replay(DynInstPtr &inst)

        moveToReady(inst_entry);

-        if (temp_inst == inst) {
-            found_inst = true;
-        }
-
        instsToReplay.pop_front();
    }
-
-    assert(found_inst);
 }

 template <class MemDepPred, class Impl>
--- a/src/cpu/o3/rename_map.cc
+++ b/src/cpu/o3/rename_map.cc
@ -192,8 +192,6 @@ SimpleRenameMap::rename(RegIndex arch_reg)
        // known that the prev reg was outside the range of normal registers
        // so the free list can avoid adding it.
        prev_reg = renamed_reg;
-
-        assert(renamed_reg < numPhysicalRegs + numMiscRegs);
    }

    DPRINTF(Rename, "Renamed reg %d to physical reg %d old mapping was %d\n",
--- a/src/mem/bus.cc
+++ b/src/mem/bus.cc
@ -171,8 +171,12 @@ Bus::recvTiming(PacketPtr pkt)
    }

    short dest = pkt->getDest();
+
+    // Make sure to clear the snoop commit flag so it doesn't think an
+    // access has been handled twice.
    if (dest == Packet::Broadcast) {
        port = findPort(pkt->getAddr(), pkt->getSrc());
+        pkt->flags &= ~SNOOP_COMMIT;
        if (timingSnoop(pkt, port ? port : interfaces[pkt->getSrc()])) {
            bool success;

--- a/src/mem/cache/cache_impl.hh
+++ b/src/mem/cache/cache_impl.hh
@ -545,8 +545,13 @@ Cache<TagStore,Coherence>::access(PacketPtr &pkt)
        //We are determining prefetches on access stream, call prefetcher
        prefetcher->handleMiss(pkt, curTick);
    }
+
+    Addr blk_addr = pkt->getAddr() & ~(Addr(blkSize-1));
+
    if (!pkt->req->isUncacheable()) {
+        if (!missQueue->findMSHR(blk_addr)) {
            blk = handleAccess(pkt, lat, writebacks);
+        }
    } else {
        size = pkt->getSize();
    }
--- a/src/mem/cache/miss/miss_queue.cc
+++ b/src/mem/cache/miss/miss_queue.cc
@ -599,6 +599,7 @@ MissQueue::handleResponse(PacketPtr &pkt, Tick time)
            MemCmd cmd = mshr->getTarget()->cmd;
            mshr->pkt->setDest(Packet::Broadcast);
            mshr->pkt->result = Packet::Unknown;
+            mshr->pkt->req = mshr->getTarget()->req;
            mq.markPending(mshr, cmd);
            mshr->order = order++;
            cache->setMasterRequest(Request_MSHR, time);
--- a/tests/configs/o3-timing.py
+++ b/tests/configs/o3-timing.py
@ -1,4 +1,4 @@
-# Copyright (c) 2006 The Regents of The University of Michigan
+# Copyright (c) 2006-2007 The Regents of The University of Michigan
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@ -37,7 +37,7 @@ class MyCache(BaseCache):
    mshrs = 10
    tgts_per_mshr = 5

-cpu = DerivO3CPU()
+cpu = DerivO3CPU(cpu_id=0)
 cpu.addTwoLevelCacheHierarchy(MyCache(size = '128kB'), MyCache(size = '256kB'),
                              MyCache(size = '2MB'))