cpu: Add a fetch queue to the o3 cpu

This patch adds a fetch queue that sits between fetch and decode to the
o3 cpu.  This effectively decouples fetch from decode stalls allowing it
to be more aggressive, running further ahead in the instruction stream.
This commit is contained in:
Mitch Hayenga 2014-09-03 07:42:35 -04:00
parent 1716749c8c
commit ecd5300971
3 changed files with 55 additions and 21 deletions

View file

@ -61,6 +61,7 @@ class DerivO3CPU(BaseCPU):
commitToFetchDelay = Param.Cycles(1, "Commit to fetch delay")
fetchWidth = Param.Unsigned(8, "Fetch width")
fetchBufferSize = Param.Unsigned(64, "Fetch buffer size in bytes")
fetchQueueSize = Param.Unsigned(32, "Fetch queue size in micro-ops")
renameToDecodeDelay = Param.Cycles(1, "Rename to decode delay")
iewToDecodeDelay = Param.Cycles(1, "Issue/Execute/Writeback to decode "

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010-2012 ARM Limited
* Copyright (c) 2010-2012, 2014 ARM Limited
* All rights reserved
*
* The license below extends only to copyright in the software and shall
@ -401,9 +401,6 @@ class DefaultFetch
/** Wire to get commit's information from backwards time buffer. */
typename TimeBuffer<TimeStruct>::wire fromCommit;
/** Internal fetch instruction queue. */
TimeBuffer<FetchStruct> *fetchQueue;
//Might be annoying how this name is different than the queue.
/** Wire used to write any information heading to decode. */
typename TimeBuffer<FetchStruct>::wire toDecode;
@ -455,6 +452,9 @@ class DefaultFetch
/** The width of fetch in instructions. */
unsigned fetchWidth;
/** The width of decode in instructions. */
unsigned decodeWidth;
/** Is the cache blocked? If so no threads can access it. */
bool cacheBlocked;
@ -481,6 +481,12 @@ class DefaultFetch
/** The PC of the first instruction loaded into the fetch buffer. */
Addr fetchBufferPC[Impl::MaxThreads];
/** The size of the fetch queue in micro-ops */
unsigned fetchQueueSize;
/** Queue of fetched instructions */
std::deque<DynInstPtr> fetchQueue;
/** Whether or not the fetch buffer data is valid. */
bool fetchBufferValid[Impl::MaxThreads];

View file

@ -82,11 +82,13 @@ DefaultFetch<Impl>::DefaultFetch(O3CPU *_cpu, DerivO3CPUParams *params)
iewToFetchDelay(params->iewToFetchDelay),
commitToFetchDelay(params->commitToFetchDelay),
fetchWidth(params->fetchWidth),
decodeWidth(params->decodeWidth),
retryPkt(NULL),
retryTid(InvalidThreadID),
cacheBlkSize(cpu->cacheLineSize()),
fetchBufferSize(params->fetchBufferSize),
fetchBufferMask(fetchBufferSize - 1),
fetchQueueSize(params->fetchQueueSize),
numThreads(params->numThreads),
numFetchingThreads(params->smtNumFetchingThreads),
finishTranslationEvent(this)
@ -313,12 +315,10 @@ DefaultFetch<Impl>::setActiveThreads(std::list<ThreadID> *at_ptr)
template<class Impl>
void
DefaultFetch<Impl>::setFetchQueue(TimeBuffer<FetchStruct> *fq_ptr)
DefaultFetch<Impl>::setFetchQueue(TimeBuffer<FetchStruct> *ftb_ptr)
{
fetchQueue = fq_ptr;
// Create wire to write information to proper place in fetch queue.
toDecode = fetchQueue->getWire(0);
// Create wire to write information to proper place in fetch time buf.
toDecode = ftb_ptr->getWire(0);
}
template<class Impl>
@ -342,6 +342,7 @@ DefaultFetch<Impl>::resetStage()
cacheBlocked = false;
priorityList.clear();
fetchQueue.clear();
// Setup PC and nextPC with initial state.
for (ThreadID tid = 0; tid < numThreads; ++tid) {
@ -454,6 +455,10 @@ DefaultFetch<Impl>::isDrained() const
return false;
}
// Not drained if fetch queue contains entries
if (!fetchQueue.empty())
return false;
/* The pipeline might start up again in the middle of the drain
* cycle if the finish translation event is scheduled, so make
* sure that's not the case.
@ -673,11 +678,8 @@ DefaultFetch<Impl>::finishTranslation(Fault fault, RequestPtr mem_req)
fetchStatus[tid] = IcacheWaitResponse;
}
} else {
// Don't send an instruction to decode if it can't handle it.
// Asynchronous nature of this function's calling means we have to
// check 2 signals to see if decode is stalled.
if (!(numInst < fetchWidth) || stalls[tid].decode ||
fromDecode->decodeBlock[tid]) {
// Don't send an instruction to decode if we can't handle it.
if (!(numInst < fetchWidth) || !(fetchQueue.size() < fetchQueueSize)) {
assert(!finishTranslationEvent.scheduled());
finishTranslationEvent.setFault(fault);
finishTranslationEvent.setReq(mem_req);
@ -758,6 +760,15 @@ DefaultFetch<Impl>::doSquash(const TheISA::PCState &newPC,
fetchStatus[tid] = Squashing;
// Empty fetch queue
auto inst_itr = fetchQueue.begin();
while (inst_itr != fetchQueue.end()) {
if ((*inst_itr)->threadNumber == tid)
inst_itr = fetchQueue.erase(inst_itr);
else
++inst_itr;
}
// microops are being squashed, it is not known whether the
// youngest non-squashed microop was marked delayed commit
// or not. Setting the flag to true ensures that the
@ -796,9 +807,6 @@ DefaultFetch<Impl>::checkStall(ThreadID tid) const
assert(cpu->isDraining());
DPRINTF(Fetch,"[tid:%i]: Drain stall detected.\n",tid);
ret_val = true;
} else if (stalls[tid].decode) {
DPRINTF(Fetch,"[tid:%i]: Stall from Decode stage detected.\n",tid);
ret_val = true;
}
return ret_val;
@ -921,6 +929,21 @@ DefaultFetch<Impl>::tick()
}
}
// Send instructions enqueued into the fetch queue to decode.
// Limit rate by fetchWidth. Stall if decode is stalled.
unsigned instsToDecode = 0;
while(!fetchQueue.empty() &&
instsToDecode < decodeWidth &&
!stalls[fetchQueue.front()->threadNumber].decode) {
auto inst = fetchQueue.front();
toDecode->insts[toDecode->size++] = inst;
DPRINTF(Fetch, "[tid:%i][sn:%i]: Sending instruction to decode from "
"fetch queue. Fetch queue size: %i.\n",
inst->threadNumber, inst->seqNum, fetchQueue.size());
fetchQueue.pop_front();
instsToDecode++;
}
// Reset the number of the instruction we've fetched.
numInst = 0;
}
@ -1072,7 +1095,11 @@ DefaultFetch<Impl>::buildInst(ThreadID tid, StaticInstPtr staticInst,
// Write the instruction to the first slot in the queue
// that heads to decode.
assert(numInst < fetchWidth);
toDecode->insts[toDecode->size++] = instruction;
fetchQueue.push_back(instruction);
assert(fetchQueue.size() <= fetchQueueSize);
DPRINTF(Fetch, "[tid:%i]: Fetch queue entry created (%i/%i).\n",
tid, fetchQueue.size(), fetchQueueSize);
//toDecode->insts[toDecode->size++] = instruction;
// Keep track of if we can take an interrupt at this boundary
delayedCommit[tid] = instruction->isDelayedCommit();
@ -1186,8 +1213,8 @@ DefaultFetch<Impl>::fetch(bool &status_change)
// Loop through instruction memory from the cache.
// Keep issuing while fetchWidth is available and branch is not
// predicted taken
while (numInst < fetchWidth && !predictedBranch) {
while (numInst < fetchWidth && fetchQueue.size() < fetchQueueSize
&& !predictedBranch) {
// We need to process more memory if we aren't going to get a
// StaticInst from the rom, the current macroop, or what's already
// in the decoder.
@ -1310,7 +1337,7 @@ DefaultFetch<Impl>::fetch(bool &status_change)
break;
}
} while ((curMacroop || decoder[tid]->instReady()) &&
numInst < fetchWidth);
numInst < fetchWidth && fetchQueue.size() < fetchQueueSize);
}
if (predictedBranch) {