O3: Enhance data address translation by supporting hardware page table walkers.

Some ISAs (like ARM) relies on hardware page table walkers.  For those ISAs,
when a TLB miss occurs, initiateTranslation() can return with NoFault but with
the translation unfinished.

Instructions experiencing a delayed translation due to a hardware page table
walk are deferred until the translation completes and kept into the IQ.  In
order to keep track of them, the IQ has been augmented with a queue of the
outstanding delayed memory instructions.  When their translation completes,
instructions are re-executed (only their initiateAccess() was already
executed; their DTB translation is now skipped).  The IEW stage has been
modified to support such a 2-pass execution.
This commit is contained in:
Giacomo Gabrielli 2011-02-11 18:29:35 -06:00
parent 453dbc772d
commit e2507407b1
11 changed files with 277 additions and 37 deletions

View file

@ -696,6 +696,8 @@ TLB::translateTiming(RequestPtr req, ThreadContext *tc,
#endif
if (!delay)
translation->finish(fault, req, tc, mode);
else
translation->markDelayed();
return fault;
}

View file

@ -1,4 +1,16 @@
/*
* Copyright (c) 2011 ARM Limited
* All rights reserved.
*
* The license below extends only to copyright in the software and shall
* not be construed as granting a license to any other intellectual
* property including but not limited to intellectual property relating
* to a hardware implementation of the functionality of the software
* licensed hereunder. You may use the software subject to the license
* terms below provided that you ensure that this notice is replicated
* unmodified and in its entirety in all distributions of the software,
* modified or unmodified, in source code or in binary form.
*
* Copyright (c) 2004-2006 The Regents of The University of Michigan
* Copyright (c) 2009 The University of Edinburgh
* All rights reserved.
@ -150,6 +162,29 @@ class BaseDynInst : public FastAlloc, public RefCounted
/** Finish a DTB address translation. */
void finishTranslation(WholeTranslationState *state);
/** True if the DTB address translation has started. */
bool translationStarted;
/** True if the DTB address translation has completed. */
bool translationCompleted;
/**
* Returns true if the DTB address translation is being delayed due to a hw
* page table walk.
*/
bool isTranslationDelayed() const
{
return (translationStarted && !translationCompleted);
}
/**
* Saved memory requests (needed when the DTB address translation is
* delayed due to a hw page table walk).
*/
RequestPtr savedReq;
RequestPtr savedSreqLow;
RequestPtr savedSreqHigh;
/** @todo: Consider making this private. */
public:
/** The sequence number of the instruction. */
@ -835,33 +870,42 @@ BaseDynInst<Impl>::readBytes(Addr addr, uint8_t *data,
unsigned size, unsigned flags)
{
reqMade = true;
Request *req = new Request(asid, addr, size, flags, this->pc.instAddr(),
thread->contextId(), threadNumber);
Request *req = NULL;
Request *sreqLow = NULL;
Request *sreqHigh = NULL;
// Only split the request if the ISA supports unaligned accesses.
if (TheISA::HasUnalignedMemAcc) {
splitRequest(req, sreqLow, sreqHigh);
}
initiateTranslation(req, sreqLow, sreqHigh, NULL, BaseTLB::Read);
if (fault == NoFault) {
effAddr = req->getVaddr();
effAddrValid = true;
fault = cpu->read(req, sreqLow, sreqHigh, data, lqIdx);
if (reqMade && translationStarted) {
req = savedReq;
sreqLow = savedSreqLow;
sreqHigh = savedSreqHigh;
} else {
// Commit will have to clean up whatever happened. Set this
// instruction as executed.
this->setExecuted();
req = new Request(asid, addr, size, flags, this->pc.instAddr(),
thread->contextId(), threadNumber);
// Only split the request if the ISA supports unaligned accesses.
if (TheISA::HasUnalignedMemAcc) {
splitRequest(req, sreqLow, sreqHigh);
}
initiateTranslation(req, sreqLow, sreqHigh, NULL, BaseTLB::Read);
}
if (fault != NoFault) {
// Return a fixed value to keep simulation deterministic even
// along misspeculated paths.
if (data)
bzero(data, size);
if (translationCompleted) {
if (fault == NoFault) {
effAddr = req->getVaddr();
effAddrValid = true;
fault = cpu->read(req, sreqLow, sreqHigh, data, lqIdx);
} else {
// Commit will have to clean up whatever happened. Set this
// instruction as executed.
this->setExecuted();
}
if (fault != NoFault) {
// Return a fixed value to keep simulation deterministic even
// along misspeculated paths.
if (data)
bzero(data, size);
}
}
if (traceData) {
@ -897,19 +941,26 @@ BaseDynInst<Impl>::writeBytes(uint8_t *data, unsigned size,
}
reqMade = true;
Request *req = new Request(asid, addr, size, flags, this->pc.instAddr(),
thread->contextId(), threadNumber);
Request *req = NULL;
Request *sreqLow = NULL;
Request *sreqHigh = NULL;
// Only split the request if the ISA supports unaligned accesses.
if (TheISA::HasUnalignedMemAcc) {
splitRequest(req, sreqLow, sreqHigh);
}
initiateTranslation(req, sreqLow, sreqHigh, res, BaseTLB::Write);
if (reqMade && translationStarted) {
req = savedReq;
sreqLow = savedSreqLow;
sreqHigh = savedSreqHigh;
} else {
req = new Request(asid, addr, size, flags, this->pc.instAddr(),
thread->contextId(), threadNumber);
if (fault == NoFault) {
// Only split the request if the ISA supports unaligned accesses.
if (TheISA::HasUnalignedMemAcc) {
splitRequest(req, sreqLow, sreqHigh);
}
initiateTranslation(req, sreqLow, sreqHigh, res, BaseTLB::Write);
}
if (fault == NoFault && translationCompleted) {
effAddr = req->getVaddr();
effAddrValid = true;
fault = cpu->write(req, sreqLow, sreqHigh, data, sqIdx);
@ -953,6 +1004,8 @@ BaseDynInst<Impl>::initiateTranslation(RequestPtr req, RequestPtr sreqLow,
RequestPtr sreqHigh, uint64_t *res,
BaseTLB::Mode mode)
{
translationStarted = true;
if (!TheISA::HasUnalignedMemAcc || sreqLow == NULL) {
WholeTranslationState *state =
new WholeTranslationState(req, NULL, res, mode);
@ -961,6 +1014,12 @@ BaseDynInst<Impl>::initiateTranslation(RequestPtr req, RequestPtr sreqLow,
DataTranslation<BaseDynInst<Impl> > *trans =
new DataTranslation<BaseDynInst<Impl> >(this, state);
cpu->dtb->translateTiming(req, thread->getTC(), trans, mode);
if (!translationCompleted) {
// Save memory requests.
savedReq = state->mainReq;
savedSreqLow = state->sreqLow;
savedSreqHigh = state->sreqHigh;
}
} else {
WholeTranslationState *state =
new WholeTranslationState(req, sreqLow, sreqHigh, NULL, res, mode);
@ -973,6 +1032,12 @@ BaseDynInst<Impl>::initiateTranslation(RequestPtr req, RequestPtr sreqLow,
cpu->dtb->translateTiming(sreqLow, thread->getTC(), stransLow, mode);
cpu->dtb->translateTiming(sreqHigh, thread->getTC(), stransHigh, mode);
if (!translationCompleted) {
// Save memory requests.
savedReq = state->mainReq;
savedSreqLow = state->sreqLow;
savedSreqHigh = state->sreqHigh;
}
}
}
@ -998,6 +1063,8 @@ BaseDynInst<Impl>::finishTranslation(WholeTranslationState *state)
state->deleteReqs();
}
delete state;
translationCompleted = true;
}
#endif // __CPU_BASE_DYN_INST_HH__

View file

@ -1,4 +1,16 @@
/*
* Copyright (c) 2011 ARM Limited
* All rights reserved.
*
* The license below extends only to copyright in the software and shall
* not be construed as granting a license to any other intellectual
* property including but not limited to intellectual property relating
* to a hardware implementation of the functionality of the software
* licensed hereunder. You may use the software subject to the license
* terms below provided that you ensure that this notice is replicated
* unmodified and in its entirety in all distributions of the software,
* modified or unmodified, in source code or in binary form.
*
* Copyright (c) 2004-2006 The Regents of The University of Michigan
* All rights reserved.
*
@ -107,6 +119,9 @@ BaseDynInst<Impl>::initVars()
effAddrValid = false;
physEffAddr = 0;
translationStarted = false;
translationCompleted = false;
isUncacheable = false;
reqMade = false;
readyRegs = 0;

View file

@ -136,6 +136,10 @@ class DefaultFetch
: fetch(_fetch)
{}
void
markDelayed()
{}
void
finish(Fault fault, RequestPtr req, ThreadContext *tc,
BaseTLB::Mode mode)

View file

@ -1241,12 +1241,33 @@ DefaultIEW<Impl>::executeInsts()
// Loads will mark themselves as executed, and their writeback
// event adds the instruction to the queue to commit
fault = ldstQueue.executeLoad(inst);
if (inst->isTranslationDelayed() &&
fault == NoFault) {
// A hw page table walk is currently going on; the
// instruction must be deferred.
DPRINTF(IEW, "Execute: Delayed translation, deferring "
"load.\n");
instQueue.deferMemInst(inst);
continue;
}
if (inst->isDataPrefetch() || inst->isInstPrefetch()) {
fault = NoFault;
}
} else if (inst->isStore()) {
fault = ldstQueue.executeStore(inst);
if (inst->isTranslationDelayed() &&
fault == NoFault) {
// A hw page table walk is currently going on; the
// instruction must be deferred.
DPRINTF(IEW, "Execute: Delayed translation, deferring "
"store.\n");
instQueue.deferMemInst(inst);
continue;
}
// If the store had a fault then it may not have a mem req
if (fault != NoFault || inst->readPredicate() == false ||
!inst->isStoreConditional()) {

View file

@ -1,4 +1,16 @@
/*
* Copyright (c) 2011 ARM Limited
* All rights reserved.
*
* The license below extends only to copyright in the software and shall
* not be construed as granting a license to any other intellectual
* property including but not limited to intellectual property relating
* to a hardware implementation of the functionality of the software
* licensed hereunder. You may use the software subject to the license
* terms below provided that you ensure that this notice is replicated
* unmodified and in its entirety in all distributions of the software,
* modified or unmodified, in source code or in binary form.
*
* Copyright (c) 2004-2006 The Regents of The University of Michigan
* All rights reserved.
*
@ -180,6 +192,11 @@ class InstructionQueue
*/
DynInstPtr getInstToExecute();
/** Returns a memory instruction that was referred due to a delayed DTB
* translation if it is now ready to execute.
*/
DynInstPtr getDeferredMemInstToExecute();
/**
* Records the instruction as the producer of a register without
* adding it to the rest of the IQ.
@ -223,6 +240,12 @@ class InstructionQueue
/** Completes a memory operation. */
void completeMemInst(DynInstPtr &completed_inst);
/**
* Defers a memory instruction when its DTB translation incurs a hw
* page table walk.
*/
void deferMemInst(DynInstPtr &deferred_inst);
/** Indicates an ordering violation between a store and a load. */
void violation(DynInstPtr &store, DynInstPtr &faulting_load);
@ -284,6 +307,11 @@ class InstructionQueue
/** List of instructions that are ready to be executed. */
std::list<DynInstPtr> instsToExecute;
/** List of instructions waiting for their DTB translation to
* complete (hw page table walk in progress).
*/
std::list<DynInstPtr> deferredMemInsts;
/**
* Struct for comparing entries to be added to the priority queue.
* This gives reverse ordering to the instructions in terms of

View file

@ -1,4 +1,16 @@
/*
* Copyright (c) 2011 ARM Limited
* All rights reserved.
*
* The license below extends only to copyright in the software and shall
* not be construed as granting a license to any other intellectual
* property including but not limited to intellectual property relating
* to a hardware implementation of the functionality of the software
* licensed hereunder. You may use the software subject to the license
* terms below provided that you ensure that this notice is replicated
* unmodified and in its entirety in all distributions of the software,
* modified or unmodified, in source code or in binary form.
*
* Copyright (c) 2004-2006 The Regents of The University of Michigan
* All rights reserved.
*
@ -397,6 +409,7 @@ InstructionQueue<Impl>::resetState()
}
nonSpecInsts.clear();
listOrder.clear();
deferredMemInsts.clear();
}
template <class Impl>
@ -733,6 +746,15 @@ InstructionQueue<Impl>::scheduleReadyInsts()
IssueStruct *i2e_info = issueToExecuteQueue->access(0);
DynInstPtr deferred_mem_inst;
int total_deferred_mem_issued = 0;
while (total_deferred_mem_issued < totalWidth &&
(deferred_mem_inst = getDeferredMemInstToExecute()) != NULL) {
issueToExecuteQueue->access(0)->size++;
instsToExecute.push_back(deferred_mem_inst);
total_deferred_mem_issued++;
}
// Have iterator to head of the list
// While I haven't exceeded bandwidth or reached the end of the list,
// Try to get a FU that can do what this op needs.
@ -745,7 +767,7 @@ InstructionQueue<Impl>::scheduleReadyInsts()
ListOrderIt order_end_it = listOrder.end();
int total_issued = 0;
while (total_issued < totalWidth &&
while (total_issued < (totalWidth - total_deferred_mem_issued) &&
iewStage->canIssue() &&
order_it != order_end_it) {
OpClass op_class = (*order_it).queueType;
@ -858,7 +880,7 @@ InstructionQueue<Impl>::scheduleReadyInsts()
iqInstsIssued+= total_issued;
// If we issued any instructions, tell the CPU we had activity.
if (total_issued) {
if (total_issued || total_deferred_mem_issued) {
cpu->activityThisCycle();
} else {
DPRINTF(IQ, "Not able to schedule any instructions.\n");
@ -1021,6 +1043,11 @@ void
InstructionQueue<Impl>::rescheduleMemInst(DynInstPtr &resched_inst)
{
DPRINTF(IQ, "Rescheduling mem inst [sn:%lli]\n", resched_inst->seqNum);
// Reset DTB translation state
resched_inst->translationStarted = false;
resched_inst->translationCompleted = false;
resched_inst->clearCanIssue();
memDepUnit[resched_inst->threadNumber].reschedule(resched_inst);
}
@ -1049,6 +1076,28 @@ InstructionQueue<Impl>::completeMemInst(DynInstPtr &completed_inst)
count[tid]--;
}
template <class Impl>
void
InstructionQueue<Impl>::deferMemInst(DynInstPtr &deferred_inst)
{
deferredMemInsts.push_back(deferred_inst);
}
template <class Impl>
typename Impl::DynInstPtr
InstructionQueue<Impl>::getDeferredMemInstToExecute()
{
for (ListIt it = deferredMemInsts.begin(); it != deferredMemInsts.end();
++it) {
if ((*it)->translationCompleted) {
DynInstPtr ret = *it;
deferredMemInsts.erase(it);
return ret;
}
}
return NULL;
}
template <class Impl>
void
InstructionQueue<Impl>::violation(DynInstPtr &store,

View file

@ -445,12 +445,16 @@ LSQUnit<Impl>::executeLoad(DynInstPtr &inst)
Fault load_fault = NoFault;
DPRINTF(LSQUnit, "Executing load PC %s, [sn:%lli]\n",
inst->pcState(),inst->seqNum);
inst->pcState(), inst->seqNum);
assert(!inst->isSquashed());
load_fault = inst->initiateAcc();
if (inst->isTranslationDelayed() &&
load_fault == NoFault)
return load_fault;
// If the instruction faulted or predicated false, then we need to send it
// along to commit without the instruction completing.
if (load_fault != NoFault || inst->readPredicate() == false) {
@ -532,6 +536,10 @@ LSQUnit<Impl>::executeStore(DynInstPtr &store_inst)
Fault store_fault = store_inst->initiateAcc();
if (store_inst->isTranslationDelayed() &&
store_fault == NoFault)
return store_fault;
if (store_inst->readPredicate() == false)
store_inst->forwardOldRegs();

View file

@ -107,6 +107,10 @@ class TimingSimpleCPU : public BaseSimpleCPU
: cpu(_cpu)
{}
void
markDelayed()
{}
void
finish(Fault fault, RequestPtr req, ThreadContext *tc,
BaseTLB::Mode mode)

View file

@ -1,4 +1,16 @@
/*
* Copyright (c) 2011 ARM Limited
* All rights reserved.
*
* The license below extends only to copyright in the software and shall
* not be construed as granting a license to any other intellectual
* property including but not limited to intellectual property relating
* to a hardware implementation of the functionality of the software
* licensed hereunder. You may use the software subject to the license
* terms below provided that you ensure that this notice is replicated
* unmodified and in its entirety in all distributions of the software,
* modified or unmodified, in source code or in binary form.
*
* Copyright (c) 2002-2005 The Regents of The University of Michigan
* Copyright (c) 2009 The University of Edinburgh
* All rights reserved.
@ -53,6 +65,7 @@ class WholeTranslationState
Fault faults[2];
public:
bool delay;
bool isSplit;
RequestPtr mainReq;
RequestPtr sreqLow;
@ -67,8 +80,8 @@ class WholeTranslationState
*/
WholeTranslationState(RequestPtr _req, uint8_t *_data, uint64_t *_res,
BaseTLB::Mode _mode)
: outstanding(1), isSplit(false), mainReq(_req), sreqLow(NULL),
sreqHigh(NULL), data(_data), res(_res), mode(_mode)
: outstanding(1), delay(false), isSplit(false), mainReq(_req),
sreqLow(NULL), sreqHigh(NULL), data(_data), res(_res), mode(_mode)
{
faults[0] = faults[1] = NoFault;
assert(mode == BaseTLB::Read || mode == BaseTLB::Write);
@ -82,8 +95,9 @@ class WholeTranslationState
WholeTranslationState(RequestPtr _req, RequestPtr _sreqLow,
RequestPtr _sreqHigh, uint8_t *_data, uint64_t *_res,
BaseTLB::Mode _mode)
: outstanding(2), isSplit(true), mainReq(_req), sreqLow(_sreqLow),
sreqHigh(_sreqHigh), data(_data), res(_res), mode(_mode)
: outstanding(2), delay(false), isSplit(true), mainReq(_req),
sreqLow(_sreqLow), sreqHigh(_sreqHigh), data(_data), res(_res),
mode(_mode)
{
faults[0] = faults[1] = NoFault;
assert(mode == BaseTLB::Read || mode == BaseTLB::Write);
@ -220,6 +234,16 @@ class DataTranslation : public BaseTLB::Translation
{
}
/**
* Signal the translation state that the translation has been delayed due
* to a hw page table walk. Split requests are transparently handled.
*/
void
markDelayed()
{
state->delay = true;
}
/**
* Finish this part of the translation and indicate that the whole
* translation is complete if the state says so.

View file

@ -1,4 +1,16 @@
/*
* Copyright (c) 2011 ARM Limited
* All rights reserved.
*
* The license below extends only to copyright in the software and shall
* not be construed as granting a license to any other intellectual
* property including but not limited to intellectual property relating
* to a hardware implementation of the functionality of the software
* licensed hereunder. You may use the software subject to the license
* terms below provided that you ensure that this notice is replicated
* unmodified and in its entirety in all distributions of the software,
* modified or unmodified, in source code or in binary form.
*
* Copyright (c) 2006 The Regents of The University of Michigan
* All rights reserved.
*
@ -64,6 +76,12 @@ class BaseTLB : public SimObject
virtual ~Translation()
{}
/**
* Signal that the translation has been delayed due to a hw page table
* walk.
*/
virtual void markDelayed() = 0;
/*
* The memory for this object may be dynamically allocated, and it may
* be responsible for cleaning itself up which will happen in this