From 7dde557fdc51140988092962137e1006d1609bea Mon Sep 17 00:00:00 2001 From: Ali Saidi Date: Mon, 4 Apr 2011 11:42:23 -0500 Subject: [PATCH] O3: Tighten memory order violation checking to 16 bytes. The comment in the code suggests that the checking granularity should be 16 bytes, however in reality the shift by 8 is 256 bytes which seems much larger than required. --- src/cpu/base_dyn_inst.hh | 5 ++ src/cpu/o3/O3CPU.py | 3 + src/cpu/o3/lsq_unit.hh | 14 ++++ src/cpu/o3/lsq_unit_impl.hh | 124 ++++++++++++++++-------------------- 4 files changed, 77 insertions(+), 69 deletions(-) diff --git a/src/cpu/base_dyn_inst.hh b/src/cpu/base_dyn_inst.hh index 8b6662d70..a5357a7b0 100644 --- a/src/cpu/base_dyn_inst.hh +++ b/src/cpu/base_dyn_inst.hh @@ -243,6 +243,9 @@ class BaseDynInst : public FastAlloc, public RefCounted /** The effective virtual address (lds & stores only). */ Addr effAddr; + /** The size of the request */ + Addr effSize; + /** Is the effective virtual address valid. */ bool effAddrValid; @@ -892,6 +895,7 @@ BaseDynInst::readBytes(Addr addr, uint8_t *data, if (translationCompleted) { if (fault == NoFault) { effAddr = req->getVaddr(); + effSize = size; effAddrValid = true; fault = cpu->read(req, sreqLow, sreqHigh, data, lqIdx); } else { @@ -962,6 +966,7 @@ BaseDynInst::writeBytes(uint8_t *data, unsigned size, if (fault == NoFault && translationCompleted) { effAddr = req->getVaddr(); + effSize = size; effAddrValid = true; fault = cpu->write(req, sreqLow, sreqHigh, data, sqIdx); } diff --git a/src/cpu/o3/O3CPU.py b/src/cpu/o3/O3CPU.py index f7602cb86..f379fcd8a 100644 --- a/src/cpu/o3/O3CPU.py +++ b/src/cpu/o3/O3CPU.py @@ -118,6 +118,9 @@ class DerivO3CPU(BaseCPU): LQEntries = Param.Unsigned(32, "Number of load queue entries") SQEntries = Param.Unsigned(32, "Number of store queue entries") + LSQDepCheckShift = Param.Unsigned(4, "Number of places to shift addr before check") + LSQCheckLoads = Param.Bool(True, + "Should dependency violations be checked for loads & stores or just stores") LFSTSize = Param.Unsigned(1024, "Last fetched store table size") SSITSize = Param.Unsigned(1024, "Store set ID table size") diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh index 2bb42cadc..bdc524dec 100644 --- a/src/cpu/o3/lsq_unit.hh +++ b/src/cpu/o3/lsq_unit.hh @@ -111,6 +111,12 @@ class LSQUnit { /** Inserts a store instruction. */ void insertStore(DynInstPtr &store_inst); + /** Check for ordering violations in the LSQ + * @param load_idx index to start checking at + * @param inst the instruction to check + */ + Fault checkViolations(int load_idx, DynInstPtr &inst); + /** Executes a load instruction. */ Fault executeLoad(DynInstPtr &inst); @@ -366,6 +372,14 @@ class LSQUnit { */ unsigned SQEntries; + /** The number of places to shift addresses in the LSQ before checking + * for dependency violations + */ + unsigned depCheckShift; + + /** Should loads be checked for dependency issues */ + bool checkLoads; + /** The number of load instructions in the LQ. */ int loads; /** The number of store instructions in the SQ. */ diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh index 1a4e686a3..70b87ff26 100644 --- a/src/cpu/o3/lsq_unit_impl.hh +++ b/src/cpu/o3/lsq_unit_impl.hh @@ -162,6 +162,9 @@ LSQUnit::init(O3CPU *cpu_ptr, IEW *iew_ptr, DerivO3CPUParams *params, loadQueue.resize(LQEntries); storeQueue.resize(SQEntries); + depCheckShift = params->LSQDepCheckShift; + checkLoads = params->LSQCheckLoads; + loadHead = loadTail = 0; storeHead = storeWBIdx = storeTail = 0; @@ -436,6 +439,55 @@ LSQUnit::numLoadsReady() return retval; } +template +Fault +LSQUnit::checkViolations(int load_idx, DynInstPtr &inst) +{ + Addr inst_eff_addr1 = inst->effAddr >> depCheckShift; + Addr inst_eff_addr2 = (inst->effAddr + inst->effSize - 1) >> depCheckShift; + + /** @todo in theory you only need to check an instruction that has executed + * however, there isn't a good way in the pipeline at the moment to check + * all instructions that will execute before the store writes back. Thus, + * like the implementation that came before it, we're overly conservative. + */ + while (load_idx != loadTail) { + DynInstPtr ld_inst = loadQueue[load_idx]; + if (!ld_inst->effAddrValid || ld_inst->uncacheable()) { + incrLdIdx(load_idx); + continue; + } + + Addr ld_eff_addr1 = ld_inst->effAddr >> depCheckShift; + Addr ld_eff_addr2 = + (ld_inst->effAddr + ld_inst->effSize - 1) >> depCheckShift; + + if ((inst_eff_addr2 > ld_eff_addr1 && inst_eff_addr1 < ld_eff_addr2) || + inst_eff_addr1 == ld_eff_addr1) { + // A load/store incorrectly passed this load/store. + // Check if we already have a violator, or if it's newer + // squash and refetch. + if (memDepViolator && ld_inst->seqNum > memDepViolator->seqNum) + break; + + DPRINTF(LSQUnit, "Detected fault with inst [sn:%lli] and [sn:%lli]" + " at address %#x\n", inst->seqNum, ld_inst->seqNum, + ld_eff_addr1); + memDepViolator = ld_inst; + + ++lsqMemOrderViolation; + + return TheISA::genMachineCheckFault(); + } + + incrLdIdx(load_idx); + } + return NoFault; +} + + + + template Fault LSQUnit::executeLoad(DynInstPtr &inst) @@ -477,39 +529,9 @@ LSQUnit::executeLoad(DynInstPtr &inst) assert(inst->effAddrValid); int load_idx = inst->lqIdx; incrLdIdx(load_idx); - while (load_idx != loadTail) { - // Really only need to check loads that have actually executed - // @todo: For now this is extra conservative, detecting a - // violation if the addresses match assuming all accesses - // are quad word accesses. - - // @todo: Fix this, magic number being used here - - // @todo: Uncachable load is not executed until it reaches - // the head of the ROB. Once this if checks only the executed - // loads(as noted above), this check can be removed - if (loadQueue[load_idx]->effAddrValid && - ((loadQueue[load_idx]->effAddr >> 8) - == (inst->effAddr >> 8)) && - !loadQueue[load_idx]->uncacheable()) { - // A load incorrectly passed this load. Squash and refetch. - // For now return a fault to show that it was unsuccessful. - DynInstPtr violator = loadQueue[load_idx]; - if (!memDepViolator || - (violator->seqNum < memDepViolator->seqNum)) { - memDepViolator = violator; - } else { - break; - } - - ++lsqMemOrderViolation; - - return genMachineCheckFault(); - } - - incrLdIdx(load_idx); - } + if (checkLoads) + return checkViolations(load_idx, inst); } return load_fault; @@ -564,44 +586,8 @@ LSQUnit::executeStore(DynInstPtr &store_inst) ++storesToWB; } - assert(store_inst->effAddrValid); - while (load_idx != loadTail) { - // Really only need to check loads that have actually executed - // It's safe to check all loads because effAddr is set to - // InvalAddr when the dyn inst is created. + return checkViolations(load_idx, store_inst); - // @todo: For now this is extra conservative, detecting a - // violation if the addresses match assuming all accesses - // are quad word accesses. - - // @todo: Fix this, magic number being used here - - // @todo: Uncachable load is not executed until it reaches - // the head of the ROB. Once this if checks only the executed - // loads(as noted above), this check can be removed - if (loadQueue[load_idx]->effAddrValid && - ((loadQueue[load_idx]->effAddr >> 8) - == (store_inst->effAddr >> 8)) && - !loadQueue[load_idx]->uncacheable()) { - // A load incorrectly passed this store. Squash and refetch. - // For now return a fault to show that it was unsuccessful. - DynInstPtr violator = loadQueue[load_idx]; - if (!memDepViolator || - (violator->seqNum < memDepViolator->seqNum)) { - memDepViolator = violator; - } else { - break; - } - - ++lsqMemOrderViolation; - - return genMachineCheckFault(); - } - - incrLdIdx(load_idx); - } - - return store_fault; } template