cpu: allow the fetch buffer to be smaller than a cache line

The current implementation of the fetch buffer in the O3 CPU
requires the buffer to be exactly the size of a cache line. Some
architectures, e.g., ARM, have fetch buffers smaller than a cache
line; see slide 22 at:
http://www.arm.com/files/pdf/at-exploring_the_design_of_the_cortex-a15.pdf

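This patch allows the fetch buffer size to be set to values smaller
than a cache line. The fetch buffer may not be larger than a cache
line, and the cache line size must be a multiple of the fetch buffer
size.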
This commit is contained in:
Anthony Gutierrez 2013-11-15 13:21:15 -05:00
parent f028da7af7
commit 8a53da22c2
5 changed files with 73 additions and 56 deletions

View file

@ -119,6 +119,7 @@ class O3_ARM_v7a_3(DerivO3CPU):
commitToRenameDelay = 1 commitToRenameDelay = 1
commitToIEWDelay = 1 commitToIEWDelay = 1
fetchWidth = 3 fetchWidth = 3
fetchBufferSize = 16
fetchToDecodeDelay = 3 fetchToDecodeDelay = 3
decodeWidth = 3 decodeWidth = 3
decodeToRenameDelay = 2 decodeToRenameDelay = 2

View file

@ -148,7 +148,7 @@ class SourceFile(object):
def __ge__(self, other): return self.filename >= other.filename def __ge__(self, other): return self.filename >= other.filename
def __eq__(self, other): return self.filename == other.filename def __eq__(self, other): return self.filename == other.filename
def __ne__(self, other): return self.filename != other.filename def __ne__(self, other): return self.filename != other.filename
class Source(SourceFile): class Source(SourceFile):
'''Add a c/c++ source file to the build''' '''Add a c/c++ source file to the build'''
def __init__(self, source, Werror=True, swig=False, **guards): def __init__(self, source, Werror=True, swig=False, **guards):
@ -164,7 +164,7 @@ class PySource(SourceFile):
modules = {} modules = {}
tnodes = {} tnodes = {}
symnames = {} symnames = {}
def __init__(self, package, source, **guards): def __init__(self, package, source, **guards):
'''specify the python package, the source file, and any guards''' '''specify the python package, the source file, and any guards'''
super(PySource, self).__init__(source, **guards) super(PySource, self).__init__(source, **guards)

View file

@ -60,6 +60,7 @@ class DerivO3CPU(BaseCPU):
"delay") "delay")
commitToFetchDelay = Param.Cycles(1, "Commit to fetch delay") commitToFetchDelay = Param.Cycles(1, "Commit to fetch delay")
fetchWidth = Param.Unsigned(8, "Fetch width") fetchWidth = Param.Unsigned(8, "Fetch width")
fetchBufferSize = Param.Unsigned(64, "Fetch buffer size in bytes")
renameToDecodeDelay = Param.Cycles(1, "Rename to decode delay") renameToDecodeDelay = Param.Cycles(1, "Rename to decode delay")
iewToDecodeDelay = Param.Cycles(1, "Issue/Execute/Writeback to decode " iewToDecodeDelay = Param.Cycles(1, "Issue/Execute/Writeback to decode "

View file

@ -274,9 +274,9 @@ class DefaultFetch
bool lookupAndUpdateNextPC(DynInstPtr &inst, TheISA::PCState &pc); bool lookupAndUpdateNextPC(DynInstPtr &inst, TheISA::PCState &pc);
/** /**
* Fetches the cache line that contains fetch_PC. Returns any * Fetches the cache line that contains the fetch PC. Returns any
* fault that happened. Puts the data into the class variable * fault that happened. Puts the data into the class variable
* cacheData. * fetchBuffer, which may not hold the entire fetched cache line.
* @param vaddr The memory address that is being fetched from. * @param vaddr The memory address that is being fetched from.
* @param ret_fault The fault reference that will be set to the result of * @param ret_fault The fault reference that will be set to the result of
* the icache access. * the icache access.
@ -339,10 +339,10 @@ class DefaultFetch
*/ */
void fetch(bool &status_change); void fetch(bool &status_change);
/** Align a PC to the start of an I-cache block. */ /** Align a PC to the start of a fetch buffer block. */
Addr icacheBlockAlignPC(Addr addr) Addr fetchBufferAlignPC(Addr addr)
{ {
return (addr & ~(cacheBlkMask)); return (addr & ~(fetchBufferMask));
} }
/** The decoder. */ /** The decoder. */
@ -463,17 +463,22 @@ class DefaultFetch
/** Cache block size. */ /** Cache block size. */
unsigned int cacheBlkSize; unsigned int cacheBlkSize;
/** Mask to get a cache block's address. */ /** The size of the fetch buffer in bytes. The fetch buffer
Addr cacheBlkMask; * itself may be smaller than a cache line.
*/
unsigned fetchBufferSize;
/** The cache line being fetched. */ /** Mask to align a fetch address to a fetch buffer boundary. */
uint8_t *cacheData[Impl::MaxThreads]; Addr fetchBufferMask;
/** The PC of the cacheline that has been loaded. */ /** The fetch data that is being fetched and buffered. */
Addr cacheDataPC[Impl::MaxThreads]; uint8_t *fetchBuffer[Impl::MaxThreads];
/** Whether or not the cache data is valid. */ /** The PC of the first instruction loaded into the fetch buffer. */
bool cacheDataValid[Impl::MaxThreads]; Addr fetchBufferPC[Impl::MaxThreads];
/** Whether or not the fetch buffer data is valid. */
bool fetchBufferValid[Impl::MaxThreads];
/** Size of instructions. */ /** Size of instructions. */
int instSize; int instSize;

View file

@ -85,7 +85,8 @@ DefaultFetch<Impl>::DefaultFetch(O3CPU *_cpu, DerivO3CPUParams *params)
retryPkt(NULL), retryPkt(NULL),
retryTid(InvalidThreadID), retryTid(InvalidThreadID),
cacheBlkSize(cpu->cacheLineSize()), cacheBlkSize(cpu->cacheLineSize()),
cacheBlkMask(cacheBlkSize - 1), fetchBufferSize(params->fetchBufferSize),
fetchBufferMask(fetchBufferSize - 1),
numThreads(params->numThreads), numThreads(params->numThreads),
numFetchingThreads(params->smtNumFetchingThreads), numFetchingThreads(params->smtNumFetchingThreads),
finishTranslationEvent(this) finishTranslationEvent(this)
@ -98,6 +99,12 @@ DefaultFetch<Impl>::DefaultFetch(O3CPU *_cpu, DerivO3CPUParams *params)
fatal("fetchWidth (%d) is larger than compiled limit (%d),\n" fatal("fetchWidth (%d) is larger than compiled limit (%d),\n"
"\tincrease MaxWidth in src/cpu/o3/impl.hh\n", "\tincrease MaxWidth in src/cpu/o3/impl.hh\n",
fetchWidth, static_cast<int>(Impl::MaxWidth)); fetchWidth, static_cast<int>(Impl::MaxWidth));
if (fetchBufferSize > cacheBlkSize)
fatal("fetch buffer size (%u bytes) is greater than the cache "
"block size (%u bytes)\n", fetchBufferSize, cacheBlkSize);
if (cacheBlkSize % fetchBufferSize)
fatal("cache block (%u bytes) is not a multiple of the "
"fetch buffer (%u bytes)\n", cacheBlkSize, fetchBufferSize);
std::string policy = params->smtFetchPolicy; std::string policy = params->smtFetchPolicy;
@ -131,16 +138,19 @@ DefaultFetch<Impl>::DefaultFetch(O3CPU *_cpu, DerivO3CPUParams *params)
instSize = sizeof(TheISA::MachInst); instSize = sizeof(TheISA::MachInst);
for (int i = 0; i < Impl::MaxThreads; i++) { for (int i = 0; i < Impl::MaxThreads; i++) {
decoder[i] = new TheISA::Decoder; decoder[i] = NULL;
fetchBuffer[i] = NULL;
fetchBufferPC[i] = 0;
fetchBufferValid[i] = false;
} }
branchPred = params->branchPred; branchPred = params->branchPred;
for (ThreadID tid = 0; tid < numThreads; tid++) { for (ThreadID tid = 0; tid < numThreads; tid++) {
// Create space to store a cache line. decoder[tid] = new TheISA::Decoder;
cacheData[tid] = new uint8_t[cacheBlkSize]; // Create space to buffer the cache line data,
cacheDataPC[tid] = 0; // which may not hold the entire cache line.
cacheDataValid[tid] = false; fetchBuffer[tid] = new uint8_t[fetchBufferSize];
} }
} }
@ -327,7 +337,7 @@ DefaultFetch<Impl>::resetStage()
priorityList.clear(); priorityList.clear();
// Setup PC and nextPC with initial state. // Setup PC and nextPC with initial state.
for (ThreadID tid = 0; tid < numThreads; tid++) { for (ThreadID tid = 0; tid < numThreads; ++tid) {
fetchStatus[tid] = Running; fetchStatus[tid] = Running;
pc[tid] = cpu->pcState(tid); pc[tid] = cpu->pcState(tid);
fetchOffset[tid] = 0; fetchOffset[tid] = 0;
@ -342,16 +352,14 @@ DefaultFetch<Impl>::resetStage()
stalls[tid].commit = false; stalls[tid].commit = false;
stalls[tid].drain = false; stalls[tid].drain = false;
fetchBufferPC[tid] = 0;
fetchBufferValid[tid] = false;
priorityList.push_back(tid); priorityList.push_back(tid);
} }
wroteToTimeBuffer = false; wroteToTimeBuffer = false;
_status = Inactive; _status = Inactive;
for (ThreadID tid = 0; tid < numThreads; tid++) {
cacheDataPC[tid] = 0;
cacheDataValid[tid] = false;
}
} }
template<class Impl> template<class Impl>
@ -373,8 +381,8 @@ DefaultFetch<Impl>::processCacheCompletion(PacketPtr pkt)
return; return;
} }
memcpy(cacheData[tid], pkt->getPtr<uint8_t>(), cacheBlkSize); memcpy(fetchBuffer[tid], pkt->getPtr<uint8_t>(), fetchBufferSize);
cacheDataValid[tid] = true; fetchBufferValid[tid] = true;
// Wake up the CPU (if it went to sleep and was waiting on // Wake up the CPU (if it went to sleep and was waiting on
// this completion event). // this completion event).
@ -573,18 +581,19 @@ DefaultFetch<Impl>::fetchCacheLine(Addr vaddr, ThreadID tid, Addr pc)
return false; return false;
} }
// Align the fetch address so it's at the start of a cache block. // Align the fetch address to the start of a fetch buffer segment.
Addr block_PC = icacheBlockAlignPC(vaddr); Addr fetchBufferBlockPC = fetchBufferAlignPC(vaddr);
DPRINTF(Fetch, "[tid:%i] Fetching cache line %#x for addr %#x\n", DPRINTF(Fetch, "[tid:%i] Fetching cache line %#x for addr %#x\n",
tid, block_PC, vaddr); tid, fetchBufferBlockPC, vaddr);
// Setup the memReq to do a read of the first instruction's address. // Setup the memReq to do a read of the first instruction's address.
// Set the appropriate read size and flags as well. // Set the appropriate read size and flags as well.
// Build request here. // Build request here.
RequestPtr mem_req = RequestPtr mem_req =
new Request(tid, block_PC, cacheBlkSize, Request::INST_FETCH, new Request(tid, fetchBufferBlockPC, fetchBufferSize,
cpu->instMasterId(), pc, cpu->thread[tid]->contextId(), tid); Request::INST_FETCH, cpu->instMasterId(), pc,
cpu->thread[tid]->contextId(), tid);
memReq[tid] = mem_req; memReq[tid] = mem_req;
@ -601,7 +610,7 @@ void
DefaultFetch<Impl>::finishTranslation(Fault fault, RequestPtr mem_req) DefaultFetch<Impl>::finishTranslation(Fault fault, RequestPtr mem_req)
{ {
ThreadID tid = mem_req->threadId(); ThreadID tid = mem_req->threadId();
Addr block_PC = mem_req->getVaddr(); Addr fetchBufferBlockPC = mem_req->getVaddr();
assert(!cpu->switchedOut()); assert(!cpu->switchedOut());
@ -634,10 +643,10 @@ DefaultFetch<Impl>::finishTranslation(Fault fault, RequestPtr mem_req)
// Build packet here. // Build packet here.
PacketPtr data_pkt = new Packet(mem_req, MemCmd::ReadReq); PacketPtr data_pkt = new Packet(mem_req, MemCmd::ReadReq);
data_pkt->dataDynamicArray(new uint8_t[cacheBlkSize]); data_pkt->dataDynamicArray(new uint8_t[fetchBufferSize]);
cacheDataPC[tid] = block_PC; fetchBufferPC[tid] = fetchBufferBlockPC;
cacheDataValid[tid] = false; fetchBufferValid[tid] = false;
DPRINTF(Fetch, "Fetch: Doing instruction read.\n"); DPRINTF(Fetch, "Fetch: Doing instruction read.\n");
fetchedCacheLines++; fetchedCacheLines++;
@ -1154,13 +1163,13 @@ DefaultFetch<Impl>::fetch(bool &status_change)
fetchStatus[tid] = Running; fetchStatus[tid] = Running;
status_change = true; status_change = true;
} else if (fetchStatus[tid] == Running) { } else if (fetchStatus[tid] == Running) {
// Align the fetch PC so its at the start of a cache block. // Align the fetch PC so its at the start of a fetch buffer segment.
Addr block_PC = icacheBlockAlignPC(fetchAddr); Addr fetchBufferBlockPC = fetchBufferAlignPC(fetchAddr);
// If buffer is no longer valid or fetchAddr has moved to point // If buffer is no longer valid or fetchAddr has moved to point
// to the next cache block, AND we have no remaining ucode // to the next cache block, AND we have no remaining ucode
// from a macro-op, then start fetch from icache. // from a macro-op, then start fetch from icache.
if (!(cacheDataValid[tid] && block_PC == cacheDataPC[tid]) if (!(fetchBufferValid[tid] && fetchBufferBlockPC == fetchBufferPC[tid])
&& !inRom && !macroop[tid]) { && !inRom && !macroop[tid]) {
DPRINTF(Fetch, "[tid:%i]: Attempting to translate and read " DPRINTF(Fetch, "[tid:%i]: Attempting to translate and read "
"instruction, starting at PC %s.\n", tid, thisPC); "instruction, starting at PC %s.\n", tid, thisPC);
@ -1211,10 +1220,10 @@ DefaultFetch<Impl>::fetch(bool &status_change)
bool predictedBranch = false; bool predictedBranch = false;
TheISA::MachInst *cacheInsts = TheISA::MachInst *cacheInsts =
reinterpret_cast<TheISA::MachInst *>(cacheData[tid]); reinterpret_cast<TheISA::MachInst *>(fetchBuffer[tid]);
const unsigned numInsts = cacheBlkSize / instSize; const unsigned numInsts = fetchBufferSize / instSize;
unsigned blkOffset = (fetchAddr - cacheDataPC[tid]) / instSize; unsigned blkOffset = (fetchAddr - fetchBufferPC[tid]) / instSize;
// Loop through instruction memory from the cache. // Loop through instruction memory from the cache.
// Keep issuing while fetchWidth is available and branch is not // Keep issuing while fetchWidth is available and branch is not
@ -1227,12 +1236,13 @@ DefaultFetch<Impl>::fetch(bool &status_change)
bool needMem = !inRom && !curMacroop && bool needMem = !inRom && !curMacroop &&
!decoder[tid]->instReady(); !decoder[tid]->instReady();
fetchAddr = (thisPC.instAddr() + pcOffset) & BaseCPU::PCMask; fetchAddr = (thisPC.instAddr() + pcOffset) & BaseCPU::PCMask;
Addr block_PC = icacheBlockAlignPC(fetchAddr); Addr fetchBufferBlockPC = fetchBufferAlignPC(fetchAddr);
if (needMem) { if (needMem) {
// If buffer is no longer valid or fetchAddr has moved to point // If buffer is no longer valid or fetchAddr has moved to point
// to the next cache block then start fetch from icache. // to the next cache block then start fetch from icache.
if (!cacheDataValid[tid] || block_PC != cacheDataPC[tid]) if (!fetchBufferValid[tid] ||
fetchBufferBlockPC != fetchBufferPC[tid])
break; break;
if (blkOffset >= numInsts) { if (blkOffset >= numInsts) {
@ -1328,7 +1338,7 @@ DefaultFetch<Impl>::fetch(bool &status_change)
if (newMacro) { if (newMacro) {
fetchAddr = thisPC.instAddr() & BaseCPU::PCMask; fetchAddr = thisPC.instAddr() & BaseCPU::PCMask;
blkOffset = (fetchAddr - cacheDataPC[tid]) / instSize; blkOffset = (fetchAddr - fetchBufferPC[tid]) / instSize;
pcOffset = 0; pcOffset = 0;
curMacroop = NULL; curMacroop = NULL;
} }
@ -1350,9 +1360,9 @@ DefaultFetch<Impl>::fetch(bool &status_change)
} else if (numInst >= fetchWidth) { } else if (numInst >= fetchWidth) {
DPRINTF(Fetch, "[tid:%i]: Done fetching, reached fetch bandwidth " DPRINTF(Fetch, "[tid:%i]: Done fetching, reached fetch bandwidth "
"for this cycle.\n", tid); "for this cycle.\n", tid);
} else if (blkOffset >= cacheBlkSize) { } else if (blkOffset >= fetchBufferSize) {
DPRINTF(Fetch, "[tid:%i]: Done fetching, reached the end of cache " DPRINTF(Fetch, "[tid:%i]: Done fetching, reached the end of the"
"block.\n", tid); "fetch buffer.\n", tid);
} }
macroop[tid] = curMacroop; macroop[tid] = curMacroop;
@ -1364,11 +1374,11 @@ DefaultFetch<Impl>::fetch(bool &status_change)
pc[tid] = thisPC; pc[tid] = thisPC;
// pipeline a fetch if we're crossing a cache boundary and not in // pipeline a fetch if we're crossing a fetch buffer boundary and not in
// a state that would preclude fetching // a state that would preclude fetching
fetchAddr = (thisPC.instAddr() + pcOffset) & BaseCPU::PCMask; fetchAddr = (thisPC.instAddr() + pcOffset) & BaseCPU::PCMask;
Addr block_PC = icacheBlockAlignPC(fetchAddr); Addr fetchBufferBlockPC = fetchBufferAlignPC(fetchAddr);
issuePipelinedIfetch[tid] = block_PC != cacheDataPC[tid] && issuePipelinedIfetch[tid] = fetchBufferBlockPC != fetchBufferPC[tid] &&
fetchStatus[tid] != IcacheWaitResponse && fetchStatus[tid] != IcacheWaitResponse &&
fetchStatus[tid] != ItlbWait && fetchStatus[tid] != ItlbWait &&
fetchStatus[tid] != IcacheWaitRetry && fetchStatus[tid] != IcacheWaitRetry &&
@ -1575,11 +1585,11 @@ DefaultFetch<Impl>::pipelineIcacheAccesses(ThreadID tid)
Addr pcOffset = fetchOffset[tid]; Addr pcOffset = fetchOffset[tid];
Addr fetchAddr = (thisPC.instAddr() + pcOffset) & BaseCPU::PCMask; Addr fetchAddr = (thisPC.instAddr() + pcOffset) & BaseCPU::PCMask;
// Align the fetch PC so its at the start of a cache block. // Align the fetch PC so its at the start of a fetch buffer segment.
Addr block_PC = icacheBlockAlignPC(fetchAddr); Addr fetchBufferBlockPC = fetchBufferAlignPC(fetchAddr);
// Unless buffer already got the block, fetch it from icache. // Unless buffer already got the block, fetch it from icache.
if (!(cacheDataValid[tid] && block_PC == cacheDataPC[tid])) { if (!(fetchBufferValid[tid] && fetchBufferBlockPC == fetchBufferPC[tid])) {
DPRINTF(Fetch, "[tid:%i]: Issuing a pipelined I-cache access, " DPRINTF(Fetch, "[tid:%i]: Issuing a pipelined I-cache access, "
"starting at PC %s.\n", tid, thisPC); "starting at PC %s.\n", tid, thisPC);