From 8b9b85e92cde81ef9eb0cf6595be59c96fd13f97 Mon Sep 17 00:00:00 2001 From: Gabe Black Date: Mon, 15 Nov 2010 19:37:03 -0800 Subject: [PATCH] O3: Make O3 support variably lengthed instructions. --- src/arch/alpha/predecoder.hh | 11 +- src/arch/mips/predecoder.hh | 11 +- src/arch/power/predecoder.hh | 8 +- src/arch/sparc/predecoder.hh | 13 +- src/cpu/base.hh | 3 + src/cpu/o3/fetch.hh | 13 +- src/cpu/o3/fetch_impl.hh | 272 +++++++++++++++++++---------------- src/cpu/simple/base.hh | 3 - 8 files changed, 196 insertions(+), 138 deletions(-) diff --git a/src/arch/alpha/predecoder.hh b/src/arch/alpha/predecoder.hh index f9a716b7f..a8788051f 100644 --- a/src/arch/alpha/predecoder.hh +++ b/src/arch/alpha/predecoder.hh @@ -47,10 +47,11 @@ class Predecoder // The extended machine instruction being generated ExtMachInst ext_inst; + bool emiIsReady; public: Predecoder(ThreadContext * _tc) - : tc(_tc) + : tc(_tc), emiIsReady(false) {} ThreadContext * @@ -71,7 +72,9 @@ class Predecoder void reset() - { } + { + emiIsReady = false; + } // Use this to give data to the predecoder. This should be used // when there is control flow. @@ -79,6 +82,7 @@ class Predecoder moreBytes(const PCState &pc, Addr fetchPC, MachInst inst) { ext_inst = inst; + emiIsReady = true; #if FULL_SYSTEM ext_inst |= (static_cast(pc.pc() & 0x1) << 32); #endif @@ -93,13 +97,14 @@ class Predecoder bool extMachInstReady() { - return true; + return emiIsReady; } // This returns a constant reference to the ExtMachInst to avoid a copy const ExtMachInst & getExtMachInst(PCState &pc) { + emiIsReady = false; return ext_inst; } }; diff --git a/src/arch/mips/predecoder.hh b/src/arch/mips/predecoder.hh index f059710e5..4220b768c 100644 --- a/src/arch/mips/predecoder.hh +++ b/src/arch/mips/predecoder.hh @@ -47,9 +47,10 @@ class Predecoder ThreadContext * tc; //The extended machine instruction being generated ExtMachInst emi; + bool emiIsReady; public: - Predecoder(ThreadContext * _tc) : tc(_tc) + Predecoder(ThreadContext * _tc) : tc(_tc), emiIsReady(false) {} ThreadContext *getTC() @@ -70,7 +71,9 @@ class Predecoder void reset() - {} + { + emiIsReady = false; + } //Use this to give data to the predecoder. This should be used //when there is control flow. @@ -78,6 +81,7 @@ class Predecoder moreBytes(const PCState &pc, Addr fetchPC, MachInst inst) { emi = inst; + emiIsReady = true; } bool @@ -89,13 +93,14 @@ class Predecoder bool extMachInstReady() { - return true; + return emiIsReady; } //This returns a constant reference to the ExtMachInst to avoid a copy const ExtMachInst & getExtMachInst(PCState &pc) { + emiIsReady = false; return emi; } }; diff --git a/src/arch/power/predecoder.hh b/src/arch/power/predecoder.hh index b1f2b6e38..431c5d1b7 100644 --- a/src/arch/power/predecoder.hh +++ b/src/arch/power/predecoder.hh @@ -51,10 +51,11 @@ class Predecoder // The extended machine instruction being generated ExtMachInst emi; + bool emiIsReady; public: Predecoder(ThreadContext * _tc) - : tc(_tc) + : tc(_tc), emiIsReady(false) { } @@ -78,6 +79,7 @@ class Predecoder void reset() { + emiIsReady = false; } // Use this to give data to the predecoder. This should be used @@ -86,6 +88,7 @@ class Predecoder moreBytes(const PCState &pc, Addr fetchPC, MachInst inst) { emi = inst; + emiIsReady = true; } // Use this to give data to the predecoder. This should be used @@ -105,13 +108,14 @@ class Predecoder bool extMachInstReady() { - return true; + return emiIsReady; } // This returns a constant reference to the ExtMachInst to avoid a copy const ExtMachInst & getExtMachInst(PCState &pcState) { + emiIsReady = false; return emi; } }; diff --git a/src/arch/sparc/predecoder.hh b/src/arch/sparc/predecoder.hh index f7c7c90b4..670c547d0 100644 --- a/src/arch/sparc/predecoder.hh +++ b/src/arch/sparc/predecoder.hh @@ -49,9 +49,10 @@ class Predecoder ThreadContext * tc; // The extended machine instruction being generated ExtMachInst emi; + bool emiIsReady; public: - Predecoder(ThreadContext * _tc) : tc(_tc) + Predecoder(ThreadContext * _tc) : tc(_tc), emiIsReady(false) {} ThreadContext * @@ -67,7 +68,11 @@ class Predecoder } void process() {} - void reset() {} + void + reset() + { + emiIsReady = false; + } // Use this to give data to the predecoder. This should be used // when there is control flow. @@ -87,6 +92,7 @@ class Predecoder emi |= (static_cast(bits(inst, 12, 5)) << (sizeof(MachInst) * 8)); } + emiIsReady = true; } bool @@ -98,13 +104,14 @@ class Predecoder bool extMachInstReady() { - return true; + return emiIsReady; } // This returns a constant reference to the ExtMachInst to avoid a copy const ExtMachInst & getExtMachInst(PCState &pcState) { + emiIsReady = false; return emi; } }; diff --git a/src/cpu/base.hh b/src/cpu/base.hh index b96a8adb2..5b03d904f 100644 --- a/src/cpu/base.hh +++ b/src/cpu/base.hh @@ -180,6 +180,9 @@ class BaseCPU : public MemObject public: + // Mask to align PCs to MachInst sized boundaries + static const Addr PCMask = ~((Addr)sizeof(TheISA::MachInst) - 1); + /// Provide access to the tracer pointer Trace::InstTracer * getTracer() { return tracer; } diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh index 22e9e51b4..56f97e463 100644 --- a/src/cpu/o3/fetch.hh +++ b/src/cpu/o3/fetch.hh @@ -235,13 +235,14 @@ class DefaultFetch * Fetches the cache line that contains fetch_PC. Returns any * fault that happened. Puts the data into the class variable * cacheData. - * @param fetch_PC The PC address that is being fetched from. + * @param vaddr The memory address that is being fetched from. * @param ret_fault The fault reference that will be set to the result of * the icache access. * @param tid Thread id. + * @param pc The actual PC of the current instruction. * @return Any fault that occured. */ - bool fetchCacheLine(Addr fetch_PC, Fault &ret_fault, ThreadID tid); + bool fetchCacheLine(Addr vaddr, Fault &ret_fault, ThreadID tid, Addr pc); /** Squashes a specific thread and resets the PC. */ inline void doSquash(const TheISA::PCState &newPC, ThreadID tid); @@ -291,6 +292,10 @@ class DefaultFetch } private: + DynInstPtr buildInst(ThreadID tid, StaticInstPtr staticInst, + StaticInstPtr curMacroop, TheISA::PCState thisPC, + TheISA::PCState nextPC, bool trace); + /** Handles retrying the fetch access. */ void recvRetry(); @@ -347,6 +352,10 @@ class DefaultFetch TheISA::PCState pc[Impl::MaxThreads]; + Addr fetchOffset[Impl::MaxThreads]; + + StaticInstPtr macroop[Impl::MaxThreads]; + /** Memory request used to access cache. */ RequestPtr memReq[Impl::MaxThreads]; diff --git a/src/cpu/o3/fetch_impl.hh b/src/cpu/o3/fetch_impl.hh index bbd9ce4a2..cca6b7a57 100644 --- a/src/cpu/o3/fetch_impl.hh +++ b/src/cpu/o3/fetch_impl.hh @@ -317,6 +317,8 @@ DefaultFetch::initStage() // Setup PC and nextPC with initial state. for (ThreadID tid = 0; tid < numThreads; tid++) { pc[tid] = cpu->pcState(tid); + fetchOffset[tid] = 0; + macroop[tid] = NULL; } for (ThreadID tid = 0; tid < numThreads; tid++) { @@ -534,7 +536,8 @@ DefaultFetch::lookupAndUpdateNextPC( template bool -DefaultFetch::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, ThreadID tid) +DefaultFetch::fetchCacheLine(Addr vaddr, Fault &ret_fault, ThreadID tid, + Addr pc) { Fault fault = NoFault; @@ -547,7 +550,7 @@ DefaultFetch::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, ThreadID tid DPRINTF(Fetch, "[tid:%i] Can't fetch cache line, switched out\n", tid); return false; - } else if (interruptPending && !(fetch_PC & 0x3)) { + } else if (interruptPending && !(pc & 0x3)) { // Hold off fetch from getting new instructions when: // Cache is blocked, or // while an interrupt is pending and we're not in PAL mode, or @@ -557,8 +560,8 @@ DefaultFetch::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, ThreadID tid return false; } - // Align the fetch PC so it's at the start of a cache block. - Addr block_PC = icacheBlockAlignPC(fetch_PC); + // Align the fetch address so it's at the start of a cache block. + Addr block_PC = icacheBlockAlignPC(vaddr); // If we've already got the block, no need to try to fetch it again. if (cacheDataValid[tid] && block_PC == cacheDataPC[tid]) { @@ -570,7 +573,7 @@ DefaultFetch::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, ThreadID tid // Build request here. RequestPtr mem_req = new Request(tid, block_PC, cacheBlkSize, Request::INST_FETCH, - fetch_PC, cpu->thread[tid]->contextId(), tid); + pc, cpu->thread[tid]->contextId(), tid); memReq[tid] = mem_req; @@ -645,6 +648,9 @@ DefaultFetch::doSquash(const TheISA::PCState &newPC, ThreadID tid) tid, newPC); pc[tid] = newPC; + fetchOffset[tid] = 0; + macroop[tid] = NULL; + predecoder.reset(); // Clear the icache miss if it's outstanding. if (fetchStatus[tid] == IcacheWaitResponse) { @@ -957,6 +963,53 @@ DefaultFetch::checkSignalsAndUpdate(ThreadID tid) return false; } +template +typename Impl::DynInstPtr +DefaultFetch::buildInst(ThreadID tid, StaticInstPtr staticInst, + StaticInstPtr curMacroop, TheISA::PCState thisPC, + TheISA::PCState nextPC, bool trace) +{ + // Get a sequence number. + InstSeqNum seq = cpu->getAndIncrementInstSeq(); + + // Create a new DynInst from the instruction fetched. + DynInstPtr instruction = + new DynInst(staticInst, thisPC, nextPC, seq, cpu); + instruction->setTid(tid); + + instruction->setASID(tid); + + instruction->setThreadState(cpu->thread[tid]); + + DPRINTF(Fetch, "[tid:%i]: Instruction PC %#x (%d) created " + "[sn:%lli]\n", tid, thisPC.instAddr(), + thisPC.microPC(), seq); + + DPRINTF(Fetch, "[tid:%i]: Instruction is: %s\n", tid, + instruction->staticInst-> + disassemble(thisPC.instAddr())); + +#if TRACING_ON + if (trace) { + instruction->traceData = + cpu->getTracer()->getInstRecord(curTick, cpu->tcBase(tid), + instruction->staticInst, thisPC, curMacroop); + } +#else + instruction->traceData = NULL; +#endif + + // Add instruction to the CPU's list of instructions. + instruction->setInstListIt(cpu->addInst(instruction)); + + // Write the instruction to the first slot in the queue + // that heads to decode. + assert(numInst < fetchWidth); + toDecode->insts[toDecode->size++] = instruction; + + return instruction; +} + template void DefaultFetch::fetch(bool &status_change) @@ -977,25 +1030,28 @@ DefaultFetch::fetch(bool &status_change) DPRINTF(Fetch, "Attempting to fetch from [tid:%i]\n", tid); // The current PC. - TheISA::PCState fetchPC = pc[tid]; + TheISA::PCState thisPC = pc[tid]; // Fault code for memory access. Fault fault = NoFault; + Addr pcOffset = fetchOffset[tid]; + Addr fetchAddr = (thisPC.instAddr() + pcOffset) & BaseCPU::PCMask; + // If returning from the delay of a cache miss, then update the status // to running, otherwise do the cache access. Possibly move this up // to tick() function. if (fetchStatus[tid] == IcacheAccessComplete) { - DPRINTF(Fetch, "[tid:%i]: Icache miss is complete.\n", - tid); + DPRINTF(Fetch, "[tid:%i]: Icache miss is complete.\n",tid); fetchStatus[tid] = Running; status_change = true; } else if (fetchStatus[tid] == Running) { DPRINTF(Fetch, "[tid:%i]: Attempting to translate and read " - "instruction, starting at PC %s.\n", tid, fetchPC); + "instruction, starting at PC %#x.\n", tid, fetchAddr); - bool fetch_success = fetchCacheLine(fetchPC.instAddr(), fault, tid); + bool fetch_success = fetchCacheLine(fetchAddr, fault, tid, + thisPC.instAddr()); if (!fetch_success) { if (cacheBlocked) { ++icacheStallCycles; @@ -1033,143 +1089,133 @@ DefaultFetch::fetch(bool &status_change) return; } - TheISA::PCState nextPC = fetchPC; - - InstSeqNum inst_seq; - MachInst inst; - ExtMachInst ext_inst; + TheISA::PCState nextPC = thisPC; StaticInstPtr staticInst = NULL; - StaticInstPtr macroop = NULL; + StaticInstPtr curMacroop = macroop[tid]; if (fault == NoFault) { - //XXX Masking out pal mode bit. This will break x86. Alpha needs - //to pull the pal mode bit ouf ot the instruction address. - unsigned offset = (fetchPC.instAddr() & ~1) - cacheDataPC[tid]; - assert(offset < cacheBlkSize); // If the read of the first instruction was successful, then grab the // instructions from the rest of the cache line and put them into the // queue heading to decode. - DPRINTF(Fetch, "[tid:%i]: Adding instructions to queue to " - "decode.\n",tid); + DPRINTF(Fetch, + "[tid:%i]: Adding instructions to queue to decode.\n", tid); // Need to keep track of whether or not a predicted branch // ended this fetch block. - bool predicted_branch = false; + bool predictedBranch = false; - while (offset < cacheBlkSize && + TheISA::MachInst *cacheInsts = + reinterpret_cast(cacheData[tid]); + + const unsigned numInsts = cacheBlkSize / instSize; + unsigned blkOffset = (fetchAddr - cacheDataPC[tid]) / instSize; + + // Loop through instruction memory from the cache. + while (blkOffset < numInsts && numInst < fetchWidth && - !predicted_branch) { + !predictedBranch) { - // Make sure this is a valid index. - assert(offset <= cacheBlkSize - instSize); - - if (!macroop) { - // Get the instruction from the array of the cache line. - inst = TheISA::gtoh(*reinterpret_cast - (&cacheData[tid][offset])); + // If we need to process more memory, do it now. + if (!curMacroop && !predecoder.extMachInstReady()) { + if (ISA_HAS_DELAY_SLOT && pcOffset == 0) { + // Walk past any annulled delay slot instructions. + Addr pcAddr = thisPC.instAddr() & BaseCPU::PCMask; + while (fetchAddr != pcAddr && blkOffset < numInsts) { + blkOffset++; + fetchAddr += instSize; + } + if (blkOffset >= numInsts) + break; + } + MachInst inst = TheISA::gtoh(cacheInsts[blkOffset]); predecoder.setTC(cpu->thread[tid]->getTC()); - predecoder.moreBytes(fetchPC, fetchPC.instAddr(), inst); + predecoder.moreBytes(thisPC, fetchAddr, inst); - ext_inst = predecoder.getExtMachInst(fetchPC); - staticInst = StaticInstPtr(ext_inst, fetchPC.instAddr()); - if (staticInst->isMacroop()) - macroop = staticInst; + if (predecoder.needMoreBytes()) { + blkOffset++; + fetchAddr += instSize; + pcOffset += instSize; + } } + + // Extract as many instructions and/or microops as we can from + // the memory we've processed so far. do { - if (macroop) { - staticInst = macroop->fetchMicroop(fetchPC.microPC()); + if (!curMacroop) { + if (predecoder.extMachInstReady()) { + ExtMachInst extMachInst; + + extMachInst = predecoder.getExtMachInst(thisPC); + pcOffset = 0; + staticInst = StaticInstPtr(extMachInst, + thisPC.instAddr()); + + // Increment stat of fetched instructions. + ++fetchedInsts; + + if (staticInst->isMacroop()) + curMacroop = staticInst; + } else { + // We need more bytes for this instruction. + break; + } + } + if (curMacroop) { + staticInst = curMacroop->fetchMicroop(thisPC.microPC()); if (staticInst->isLastMicroop()) - macroop = NULL; + curMacroop = NULL; } - // Get a sequence number. - inst_seq = cpu->getAndIncrementInstSeq(); + DynInstPtr instruction = + buildInst(tid, staticInst, curMacroop, + thisPC, nextPC, true); - // Create a new DynInst from the instruction fetched. - DynInstPtr instruction = new DynInst(staticInst, - fetchPC, nextPC, - inst_seq, cpu); - instruction->setTid(tid); + numInst++; - instruction->setASID(tid); - - instruction->setThreadState(cpu->thread[tid]); - - DPRINTF(Fetch, "[tid:%i]: Instruction PC %s (%d) created " - "[sn:%lli]\n", tid, instruction->pcState(), - instruction->microPC(), inst_seq); - - //DPRINTF(Fetch, "[tid:%i]: MachInst is %#x\n", tid, ext_inst); - - DPRINTF(Fetch, "[tid:%i]: Instruction is: %s\n", tid, - instruction->staticInst-> - disassemble(fetchPC.instAddr())); - -#if TRACING_ON - instruction->traceData = - cpu->getTracer()->getInstRecord(curTick, cpu->tcBase(tid), - instruction->staticInst, fetchPC, macroop); -#else - instruction->traceData = NULL; -#endif + nextPC = thisPC; // If we're branching after this instruction, quite fetching // from the same block then. - predicted_branch = fetchPC.branching(); - predicted_branch |= + predictedBranch |= thisPC.branching(); + predictedBranch |= lookupAndUpdateNextPC(instruction, nextPC); - if (predicted_branch) { - DPRINTF(Fetch, "Branch detected with PC = %s\n", fetchPC); + if (predictedBranch) { + DPRINTF(Fetch, "Branch detected with PC = %s\n", thisPC); } - // Add instruction to the CPU's list of instructions. - instruction->setInstListIt(cpu->addInst(instruction)); - - // Write the instruction to the first slot in the queue - // that heads to decode. - toDecode->insts[numInst] = instruction; - - toDecode->size++; - - // Increment stat of fetched instructions. - ++fetchedInsts; - // Move to the next instruction, unless we have a branch. - fetchPC = nextPC; + thisPC = nextPC; if (instruction->isQuiesce()) { - DPRINTF(Fetch, "Quiesce instruction encountered, halting fetch!", - curTick); + DPRINTF(Fetch, + "Quiesce instruction encountered, halting fetch!"); fetchStatus[tid] = QuiescePending; - ++numInst; status_change = true; break; } - - ++numInst; - } while (staticInst->isMicroop() && - !staticInst->isLastMicroop() && + } while ((curMacroop || predecoder.extMachInstReady()) && numInst < fetchWidth); - //XXX Masking out pal mode bit. - offset = (fetchPC.instAddr() & ~1) - cacheDataPC[tid]; } - if (predicted_branch) { + if (predictedBranch) { DPRINTF(Fetch, "[tid:%i]: Done fetching, predicted branch " "instruction encountered.\n", tid); } else if (numInst >= fetchWidth) { DPRINTF(Fetch, "[tid:%i]: Done fetching, reached fetch bandwidth " "for this cycle.\n", tid); - } else if (offset >= cacheBlkSize) { + } else if (blkOffset >= cacheBlkSize) { DPRINTF(Fetch, "[tid:%i]: Done fetching, reached the end of cache " "block.\n", tid); } } + macroop[tid] = curMacroop; + fetchOffset[tid] = pcOffset; + if (numInst > 0) { wroteToTimeBuffer = true; } @@ -1188,42 +1234,24 @@ DefaultFetch::fetch(bool &status_change) // Send the fault to commit. This thread will not do anything // until commit handles the fault. The only other way it can - // wake up is if a squash comes along and changes the PC. - assert(numInst < fetchWidth); - // Get a sequence number. - inst_seq = cpu->getAndIncrementInstSeq(); - // We will use a nop in order to carry the fault. - ext_inst = TheISA::NoopMachInst; + // wake up is if a squash comes along and changes the PC. Send the + // fault on a dummy nop. + staticInst = StaticInstPtr(TheISA::NoopMachInst, thisPC.instAddr()); - // Create a new DynInst from the dummy nop. - DynInstPtr instruction = new DynInst(ext_inst, fetchPC, nextPC, - inst_seq, cpu); - TheISA::advancePC(nextPC, instruction->staticInst); + DynInstPtr instruction = + buildInst(tid, staticInst, NULL, thisPC, nextPC, false); + + TheISA::advancePC(nextPC, staticInst); instruction->setPredTarg(nextPC); - instruction->setTid(tid); - - instruction->setASID(tid); - - instruction->setThreadState(cpu->thread[tid]); - - instruction->traceData = NULL; - - instruction->setInstListIt(cpu->addInst(instruction)); - instruction->fault = fault; - toDecode->insts[numInst] = instruction; - toDecode->size++; - - wroteToTimeBuffer = true; - DPRINTF(Fetch, "[tid:%i]: Blocked, need to handle the trap.\n",tid); fetchStatus[tid] = TrapPending; status_change = true; DPRINTF(Fetch, "[tid:%i]: fault (%s) detected @ PC %s", - tid, fault->name(), pc[tid]); + tid, fault->name(), thisPC); } } diff --git a/src/cpu/simple/base.hh b/src/cpu/simple/base.hh index ed5d0b1a6..f7dcd4a86 100644 --- a/src/cpu/simple/base.hh +++ b/src/cpu/simple/base.hh @@ -191,9 +191,6 @@ class BaseSimpleCPU : public BaseCPU return numInst - startNumInst; } - // Mask to align PCs to MachInst sized boundaries - static const Addr PCMask = ~((Addr)sizeof(TheISA::MachInst) - 1); - // number of simulated memory references Stats::Scalar numMemRefs;