From 649c239ceef2d107fae253b1008c6f214f242d73 Mon Sep 17 00:00:00 2001 From: Ali Saidi Date: Tue, 13 Sep 2011 12:58:08 -0400 Subject: [PATCH] LSQ: Only trigger a memory violation with a load/load if the value changes. Only create a memory ordering violation when the value could have changed between two subsequent loads, instead of just when loads go out-of-order to the same address. While not very common in the case of Alpha, with an architecture with a hardware table walker this can happen reasonably frequently beacuse a translation will miss and start a table walk and before the CPU re-schedules the faulting instruction another one will pass it to the same address (or cache block depending on the dendency checking). This patch has been tested with a couple of self-checking hand crafted programs to stress ordering between two cores. The performance improvement on SPEC benchmarks can be substantial (2-10%). --- src/arch/arm/faults.cc | 14 ----- src/arch/arm/faults.hh | 10 ---- src/cpu/base_dyn_inst.hh | 13 ++++ src/cpu/base_dyn_inst_impl.hh | 2 + src/cpu/o3/lsq_impl.hh | 14 +++-- src/cpu/o3/lsq_unit.hh | 13 +++- src/cpu/o3/lsq_unit_impl.hh | 109 ++++++++++++++++++++++++++++++---- src/sim/faults.cc | 6 ++ src/sim/faults.hh | 10 ++++ 9 files changed, 150 insertions(+), 41 deletions(-) diff --git a/src/arch/arm/faults.cc b/src/arch/arm/faults.cc index 3c361404e..68c5fa0e8 100644 --- a/src/arch/arm/faults.cc +++ b/src/arch/arm/faults.cc @@ -75,9 +75,6 @@ template<> ArmFault::FaultVals ArmFaultVals::vals = template<> ArmFault::FaultVals ArmFaultVals::vals = {"Pipe Flush", 0x00, MODE_SVC, 0, 0, true, true}; // some dummy values -template<> ArmFault::FaultVals ArmFaultVals::vals = - {"ReExec Flush", 0x00, MODE_SVC, 0, 0, true, true}; // some dummy values - template<> ArmFault::FaultVals ArmFaultVals::vals = {"ArmSev Flush", 0x00, MODE_SVC, 0, 0, true, true}; // some dummy values Addr @@ -240,17 +237,6 @@ FlushPipe::invoke(ThreadContext *tc, StaticInstPtr inst) { tc->pcState(pc); } -void -ReExec::invoke(ThreadContext *tc, StaticInstPtr inst) { - DPRINTF(Faults, "Invoking ReExec Fault\n"); - - // Set the PC to then the faulting instruction. - // Net effect is simply squashing all instructions including this - // instruction and refetching/rexecuting current instruction - PCState pc = tc->pcState(); - tc->pcState(pc); -} - template void AbortFault::invoke(ThreadContext *tc, StaticInstPtr inst); template void AbortFault::invoke(ThreadContext *tc, diff --git a/src/arch/arm/faults.hh b/src/arch/arm/faults.hh index 54edb336b..fe1258a16 100644 --- a/src/arch/arm/faults.hh +++ b/src/arch/arm/faults.hh @@ -242,16 +242,6 @@ class FlushPipe : public ArmFaultVals StaticInstPtr inst = StaticInst::nullStaticInstPtr); }; -// A fault that flushes the pipe, including the faulting instructions -class ReExec : public ArmFaultVals -{ - public: - ReExec() {} - void invoke(ThreadContext *tc, - StaticInstPtr inst = StaticInst::nullStaticInstPtr); -}; - - static inline Fault genMachineCheckFault() { return new Reset(); diff --git a/src/cpu/base_dyn_inst.hh b/src/cpu/base_dyn_inst.hh index 5f0a6106e..5719fc84d 100644 --- a/src/cpu/base_dyn_inst.hh +++ b/src/cpu/base_dyn_inst.hh @@ -146,6 +146,19 @@ class BaseDynInst : public FastAlloc, public RefCounted /** True if the DTB address translation has completed. */ bool translationCompleted; + /** True if this address was found to match a previous load and they issued + * out of order. If that happend, then it's only a problem if an incoming + * snoop invalidate modifies the line, in which case we need to squash. + * If nothing modified the line the order doesn't matter. + */ + bool possibleLoadViolation; + + /** True if the address hit a external snoop while sitting in the LSQ. + * If this is true and a older instruction sees it, this instruction must + * reexecute + */ + bool hitExternalSnoop; + /** * Returns true if the DTB address translation is being delayed due to a hw * page table walk. diff --git a/src/cpu/base_dyn_inst_impl.hh b/src/cpu/base_dyn_inst_impl.hh index b2819bb26..a37ec5e25 100644 --- a/src/cpu/base_dyn_inst_impl.hh +++ b/src/cpu/base_dyn_inst_impl.hh @@ -110,6 +110,8 @@ BaseDynInst::initVars() translationStarted = false; translationCompleted = false; + possibleLoadViolation = false; + hitExternalSnoop = false; isUncacheable = false; reqMade = false; diff --git a/src/cpu/o3/lsq_impl.hh b/src/cpu/o3/lsq_impl.hh index 41b0f3041..ef9167d8c 100644 --- a/src/cpu/o3/lsq_impl.hh +++ b/src/cpu/o3/lsq_impl.hh @@ -90,11 +90,17 @@ LSQ::DcachePort::recvTiming(PacketPtr pkt) DPRINTF(LSQ, "Got error packet back for address: %#X\n", pkt->getAddr()); if (pkt->isResponse()) { lsq->thread[pkt->req->threadId()].completeDataAccess(pkt); - } - else { - // must be a snoop + } else { + DPRINTF(LSQ, "received pkt for addr:%#x %s\n", pkt->getAddr(), + pkt->cmdString()); - // @TODO someday may need to process invalidations in LSQ here + // must be a snoop + if (pkt->isInvalidate()) { + DPRINTF(LSQ, "received invalidation for addr:%#x\n", pkt->getAddr()); + for (ThreadID tid = 0; tid < lsq->numThreads; tid++) { + lsq->thread[tid].checkSnoop(pkt); + } + } // to provide stronger consistency model } return true; diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh index 693bee82c..af926759c 100644 --- a/src/cpu/o3/lsq_unit.hh +++ b/src/cpu/o3/lsq_unit.hh @@ -115,12 +115,20 @@ class LSQUnit { /** Inserts a store instruction. */ void insertStore(DynInstPtr &store_inst); - /** Check for ordering violations in the LSQ + /** Check for ordering violations in the LSQ. For a store squash if we + * ever find a conflicting load. For a load, only squash if we + * an external snoop invalidate has been seen for that load address * @param load_idx index to start checking at * @param inst the instruction to check */ Fault checkViolations(int load_idx, DynInstPtr &inst); + /** Check if an incoming invalidate hits in the lsq on a load + * that might have issued out of order wrt another load beacuse + * of the intermediate invalidate. + */ + void checkSnoop(PacketPtr pkt); + /** Executes a load instruction. */ Fault executeLoad(DynInstPtr &inst); @@ -417,6 +425,9 @@ class LSQUnit { //list mshrSeqNums; + /** Address Mask for a cache block (e.g. ~(cache_block_size-1)) */ + Addr cacheBlockMask; + /** Wire to read information from the issue stage time queue. */ typename TimeBuffer::wire fromIssue; diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh index 79a20a673..f934e2032 100644 --- a/src/cpu/o3/lsq_unit_impl.hh +++ b/src/cpu/o3/lsq_unit_impl.hh @@ -135,7 +135,7 @@ LSQUnit::completeDataAccess(PacketPtr pkt) template LSQUnit::LSQUnit() - : loads(0), stores(0), storesToWB(0), stalled(false), + : loads(0), stores(0), storesToWB(0), cacheBlockMask(0), stalled(false), isStoreBlocked(false), isLoadBlocked(false), loadBlockedHandled(false), hasPendingPkt(false) { @@ -154,6 +154,8 @@ LSQUnit::init(O3CPU *cpu_ptr, IEW *iew_ptr, DerivO3CPUParams *params, switchedOut = false; + cacheBlockMask = 0; + lsq = lsq_ptr; lsqID = id; @@ -297,6 +299,9 @@ LSQUnit::takeOverFrom() stalled = false; isLoadBlocked = false; loadBlockedHandled = false; + + // Just incase the memory system changed out from under us + cacheBlockMask = 0; } template @@ -442,6 +447,60 @@ LSQUnit::numLoadsReady() return retval; } +template +void +LSQUnit::checkSnoop(PacketPtr pkt) +{ + int load_idx = loadHead; + + if (!cacheBlockMask) { + assert(dcachePort); + Addr bs = dcachePort->peerBlockSize(); + + // Make sure we actually got a size + assert(bs != 0); + + cacheBlockMask = ~(bs - 1); + } + + // If this is the only load in the LSQ we don't care + if (load_idx == loadTail) + return; + incrLdIdx(load_idx); + + DPRINTF(LSQUnit, "Got snoop for address %#x\n", pkt->getAddr()); + Addr invalidate_addr = pkt->getAddr() & cacheBlockMask; + while (load_idx != loadTail) { + DynInstPtr ld_inst = loadQueue[load_idx]; + + if (!ld_inst->effAddrValid || ld_inst->uncacheable()) { + incrLdIdx(load_idx); + continue; + } + + Addr load_addr = ld_inst->physEffAddr & cacheBlockMask; + DPRINTF(LSQUnit, "-- inst [sn:%lli] load_addr: %#x to pktAddr:%#x\n", + ld_inst->seqNum, load_addr, invalidate_addr); + + if (load_addr == invalidate_addr) { + if (ld_inst->possibleLoadViolation) { + DPRINTF(LSQUnit, "Conflicting load at addr %#x [sn:%lli]\n", + ld_inst->physEffAddr, pkt->getAddr(), ld_inst->seqNum); + + // Mark the load for re-execution + ld_inst->fault = new ReExec; + } else { + // If a older load checks this and it's true + // then we might have missed the snoop + // in which case we need to invalidate to be sure + ld_inst->hitExternalSnoop = true; + } + } + incrLdIdx(load_idx); + } + return; +} + template Fault LSQUnit::checkViolations(int load_idx, DynInstPtr &inst) @@ -466,20 +525,46 @@ LSQUnit::checkViolations(int load_idx, DynInstPtr &inst) (ld_inst->effAddr + ld_inst->effSize - 1) >> depCheckShift; if (inst_eff_addr2 >= ld_eff_addr1 && inst_eff_addr1 <= ld_eff_addr2) { - // A load/store incorrectly passed this load/store. - // Check if we already have a violator, or if it's newer - // squash and refetch. - if (memDepViolator && ld_inst->seqNum > memDepViolator->seqNum) - break; + if (inst->isLoad()) { + // If this load is to the same block as an external snoop + // invalidate that we've observed then the load needs to be + // squashed as it could have newer data + if (ld_inst->hitExternalSnoop) { + if (!memDepViolator || + ld_inst->seqNum < memDepViolator->seqNum) { + DPRINTF(LSQUnit, "Detected fault with inst [sn:%lli] " + " and [sn:%lli] at address %#x\n", inst->seqNum, + ld_inst->seqNum, ld_eff_addr1); + memDepViolator = ld_inst; - DPRINTF(LSQUnit, "Detected fault with inst [sn:%lli] and [sn:%lli]" - " at address %#x\n", inst->seqNum, ld_inst->seqNum, - ld_eff_addr1); - memDepViolator = ld_inst; + ++lsqMemOrderViolation; - ++lsqMemOrderViolation; + return TheISA::genMachineCheckFault(); + } + } - return TheISA::genMachineCheckFault(); + // Otherwise, mark the load has a possible load violation + // and if we see a snoop before it's commited, we need to squash + ld_inst->possibleLoadViolation = true; + DPRINTF(LSQUnit, "Found possible load violaiton at addr: %#x" + " between instructions [sn:%lli] and [sn:%lli]\n", + inst_eff_addr1, inst->seqNum, ld_inst->seqNum); + } else { + // A load/store incorrectly passed this store. + // Check if we already have a violator, or if it's newer + // squash and refetch. + if (memDepViolator && ld_inst->seqNum > memDepViolator->seqNum) + break; + + DPRINTF(LSQUnit, "Detected fault with inst [sn:%lli] and [sn:%lli]" + " at address %#x\n", inst->seqNum, ld_inst->seqNum, + ld_eff_addr1); + memDepViolator = ld_inst; + + ++lsqMemOrderViolation; + + return TheISA::genMachineCheckFault(); + } } incrLdIdx(load_idx); diff --git a/src/sim/faults.cc b/src/sim/faults.cc index 3f1369bcc..bd05df834 100644 --- a/src/sim/faults.cc +++ b/src/sim/faults.cc @@ -56,6 +56,12 @@ void UnimpFault::invoke(ThreadContext * tc, StaticInstPtr inst) panic("Unimpfault: %s\n", panicStr.c_str()); } +void ReExec::invoke(ThreadContext *tc, StaticInstPtr inst) +{ + tc->pcState(tc->pcState()); +} + + #if !FULL_SYSTEM void GenericPageTableFault::invoke(ThreadContext *tc, StaticInstPtr inst) { diff --git a/src/sim/faults.hh b/src/sim/faults.hh index 7f431b313..f5e9aded1 100644 --- a/src/sim/faults.hh +++ b/src/sim/faults.hh @@ -75,6 +75,16 @@ class UnimpFault : public FaultBase StaticInstPtr inst = StaticInst::nullStaticInstPtr); }; +class ReExec : public FaultBase +{ + public: + virtual FaultName name() const { return "Re-execution fault";} + ReExec() {} + void invoke(ThreadContext *tc, + StaticInstPtr inst = StaticInst::nullStaticInstPtr); +}; + + #if !FULL_SYSTEM class GenericPageTableFault : public FaultBase {