X86: Move address-based decode caching in front of the predecoder.
The predecoder in x86 does a lot of work, most of which can be skipped if the decoder cache is put in front of it. Committed by: Nilay Vaish <nilay@cs.wisc.edu>
This commit is contained in:
parent
63b10907ef
commit
d1965af220
4 changed files with 245 additions and 71 deletions
|
@ -38,10 +38,15 @@
|
|||
|
||||
namespace X86ISA
|
||||
{
|
||||
void Decoder::doReset()
|
||||
|
||||
Decoder::State
|
||||
Decoder::doResetState()
|
||||
{
|
||||
origPC = basePC + offset;
|
||||
DPRINTF(Decoder, "Setting origPC to %#x\n", origPC);
|
||||
instBytes = &decodePages->lookup(origPC);
|
||||
chunkIdx = 0;
|
||||
|
||||
emi.rex = 0;
|
||||
emi.legacy = 0;
|
||||
emi.opcode.num = 0;
|
||||
|
@ -55,12 +60,17 @@ void Decoder::doReset()
|
|||
|
||||
emi.modRM = 0;
|
||||
emi.sib = 0;
|
||||
m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);
|
||||
emi.mode.mode = m5Reg.mode;
|
||||
emi.mode.submode = m5Reg.submode;
|
||||
|
||||
if (instBytes->si) {
|
||||
return FromCacheState;
|
||||
} else {
|
||||
instBytes->chunks.clear();
|
||||
return PrefixState;
|
||||
}
|
||||
}
|
||||
|
||||
void Decoder::process()
|
||||
void
|
||||
Decoder::process()
|
||||
{
|
||||
//This function drives the decoder state machine.
|
||||
|
||||
|
@ -70,15 +80,18 @@ void Decoder::process()
|
|||
assert(!outOfBytes);
|
||||
assert(!instDone);
|
||||
|
||||
if (state == ResetState)
|
||||
state = doResetState();
|
||||
if (state == FromCacheState) {
|
||||
state = doFromCacheState();
|
||||
} else {
|
||||
instBytes->chunks.push_back(fetchChunk);
|
||||
}
|
||||
|
||||
//While there's still something to do...
|
||||
while(!instDone && !outOfBytes)
|
||||
{
|
||||
while (!instDone && !outOfBytes) {
|
||||
uint8_t nextByte = getNextByte();
|
||||
switch(state)
|
||||
{
|
||||
case ResetState:
|
||||
doReset();
|
||||
state = PrefixState;
|
||||
switch (state) {
|
||||
case PrefixState:
|
||||
state = doPrefixState(nextByte);
|
||||
break;
|
||||
|
@ -105,9 +118,42 @@ void Decoder::process()
|
|||
}
|
||||
}
|
||||
|
||||
// Handle FromCacheState: check the chunk that was just fetched against the
// chunk cached for this instruction, comparing only the bytes selected by the
// cached mask for the current chunk index.
//
// Returns the next state of the decoder state machine:
//  - PrefixState    on a mismatch (the predecoder must run from scratch),
//  - ResetState     when the last cached chunk matched (instruction is done),
//  - FromCacheState when this chunk matched but more cached chunks remain.
Decoder::State
|
||||
Decoder::doFromCacheState()
|
||||
{
|
||||
DPRINTF(Decoder, "Looking at cache state.\n");
|
||||
if ((fetchChunk & instBytes->masks[chunkIdx]) !=
|
||||
instBytes->chunks[chunkIdx]) {
|
||||
DPRINTF(Decoder, "Decode cache miss.\n");
|
||||
// The cached chunks didn't match what was fetched. Fall back to the
|
||||
// predecoder.
|
||||
// Keep the chunks seen so far (the newest one replaces the stale cached
|
||||
// copy), drop anything cached beyond it, and forget the cached
|
||||
// instruction.
|
||||
instBytes->chunks[chunkIdx] = fetchChunk;
|
||||
instBytes->chunks.resize(chunkIdx + 1);
|
||||
instBytes->si = NULL;
|
||||
// Rewind to the first chunk and recompute offset/basePC from the PC
|
||||
// the instruction started at, so predecoding restarts at its first byte.
|
||||
chunkIdx = 0;
|
||||
fetchChunk = instBytes->chunks[0];
|
||||
offset = origPC % sizeof(MachInst);
|
||||
basePC = origPC - offset;
|
||||
return PrefixState;
|
||||
} else if (chunkIdx == instBytes->chunks.size() - 1) {
|
||||
// We matched the cache, so use its value.
|
||||
instDone = true;
|
||||
// Restore the offset that was recorded when this entry was populated.
|
||||
offset = instBytes->lastOffset;
|
||||
// The instruction ended exactly at the end of the chunk, so nothing
|
||||
// is left in it for the next instruction.
|
||||
if (offset == sizeof(MachInst))
|
||||
outOfBytes = true;
|
||||
return ResetState;
|
||||
} else {
|
||||
// We matched so far, but need to check more chunks.
|
||||
chunkIdx++;
|
||||
// Flag that another chunk has to be supplied before continuing.
|
||||
outOfBytes = true;
|
||||
return FromCacheState;
|
||||
}
|
||||
}
|
||||
|
||||
//Either get a prefix and record it in the ExtMachInst, or send the
|
||||
//state machine on to get the opcode(s).
|
||||
Decoder::State Decoder::doPrefixState(uint8_t nextByte)
|
||||
Decoder::State
|
||||
Decoder::doPrefixState(uint8_t nextByte)
|
||||
{
|
||||
uint8_t prefix = Prefixes[nextByte];
|
||||
State nextState = PrefixState;
|
||||
|
@ -164,7 +210,8 @@ Decoder::State Decoder::doPrefixState(uint8_t nextByte)
|
|||
|
||||
//Load all the opcodes (currently up to 2) and then figure out
|
||||
//what immediate and/or ModRM is needed.
|
||||
Decoder::State Decoder::doOpcodeState(uint8_t nextByte)
|
||||
Decoder::State
|
||||
Decoder::doOpcodeState(uint8_t nextByte)
|
||||
{
|
||||
State nextState = ErrorState;
|
||||
emi.opcode.num++;
|
||||
|
@ -194,9 +241,9 @@ Decoder::State Decoder::doOpcodeState(uint8_t nextByte)
|
|||
if (emi.rex.w)
|
||||
logOpSize = 3; // 64 bit operand size
|
||||
else if (emi.legacy.op)
|
||||
logOpSize = m5Reg.altOp;
|
||||
logOpSize = altOp;
|
||||
else
|
||||
logOpSize = m5Reg.defOp;
|
||||
logOpSize = defOp;
|
||||
|
||||
//Set the actual op size
|
||||
emi.opSize = 1 << logOpSize;
|
||||
|
@ -205,16 +252,16 @@ Decoder::State Decoder::doOpcodeState(uint8_t nextByte)
|
|||
//a fixed value at the decoder level.
|
||||
int logAddrSize;
|
||||
if(emi.legacy.addr)
|
||||
logAddrSize = m5Reg.altAddr;
|
||||
logAddrSize = altAddr;
|
||||
else
|
||||
logAddrSize = m5Reg.defAddr;
|
||||
logAddrSize = defAddr;
|
||||
|
||||
//Set the actual address size
|
||||
emi.addrSize = 1 << logAddrSize;
|
||||
|
||||
//Figure out the effective stack width. This can be overridden to
|
||||
//a fixed value at the decoder level.
|
||||
emi.stackSize = 1 << m5Reg.stack;
|
||||
emi.stackSize = 1 << stack;
|
||||
|
||||
//Figure out how big of an immediate we'll retrieve based
|
||||
//on the opcode.
|
||||
|
@ -242,13 +289,14 @@ Decoder::State Decoder::doOpcodeState(uint8_t nextByte)
|
|||
//Get the ModRM byte and determine what displacement, if any, there is.
|
||||
//Also determine whether or not to get the SIB byte, displacement, or
|
||||
//immediate next.
|
||||
Decoder::State Decoder::doModRMState(uint8_t nextByte)
|
||||
Decoder::State
|
||||
Decoder::doModRMState(uint8_t nextByte)
|
||||
{
|
||||
State nextState = ErrorState;
|
||||
ModRM modRM;
|
||||
modRM = nextByte;
|
||||
DPRINTF(Decoder, "Found modrm byte %#x.\n", nextByte);
|
||||
if (m5Reg.defOp == 1) {
|
||||
if (defOp == 1) {
|
||||
//figure out 16 bit displacement size
|
||||
if ((modRM.mod == 0 && modRM.rm == 6) || modRM.mod == 2)
|
||||
displacementSize = 2;
|
||||
|
@ -297,7 +345,8 @@ Decoder::State Decoder::doModRMState(uint8_t nextByte)
|
|||
//Get the SIB byte. We don't do anything with it at this point, other
|
||||
//than storing it in the ExtMachInst. Determine if we need to get a
|
||||
//displacement or immediate next.
|
||||
Decoder::State Decoder::doSIBState(uint8_t nextByte)
|
||||
Decoder::State
|
||||
Decoder::doSIBState(uint8_t nextByte)
|
||||
{
|
||||
State nextState = ErrorState;
|
||||
emi.sib = nextByte;
|
||||
|
@ -318,7 +367,8 @@ Decoder::State Decoder::doSIBState(uint8_t nextByte)
|
|||
|
||||
//Gather up the displacement, or at least as much of it
|
||||
//as we can get.
|
||||
Decoder::State Decoder::doDisplacementState()
|
||||
Decoder::State
|
||||
Decoder::doDisplacementState()
|
||||
{
|
||||
State nextState = ErrorState;
|
||||
|
||||
|
@ -365,7 +415,8 @@ Decoder::State Decoder::doDisplacementState()
|
|||
|
||||
//Gather up the immediate, or at least as much of it
|
||||
//as we can get
|
||||
Decoder::State Decoder::doImmediateState()
|
||||
Decoder::State
|
||||
Decoder::doImmediateState()
|
||||
{
|
||||
State nextState = ErrorState;
|
||||
|
||||
|
@ -408,24 +459,62 @@ Decoder::State Decoder::doImmediateState()
|
|||
return nextState;
|
||||
}
|
||||
|
||||
DecodeCache::InstMap Decoder::instMap;
|
||||
DecodeCache::AddrMap<StaticInstPtr> Decoder::decodePages;
|
||||
Decoder::InstBytes Decoder::dummy;
|
||||
Decoder::InstCacheMap Decoder::instCacheMap;
|
||||
|
||||
StaticInstPtr
|
||||
Decoder::decode(ExtMachInst mach_inst, Addr addr)
|
||||
{
|
||||
StaticInstPtr &si = decodePages.lookup(addr);
|
||||
if (si && (si->machInst == mach_inst))
|
||||
DecodeCache::InstMap::iterator iter = instMap->find(mach_inst);
|
||||
if (iter != instMap->end())
|
||||
return iter->second;
|
||||
|
||||
StaticInstPtr si = decodeInst(mach_inst);
|
||||
(*instMap)[mach_inst] = si;
|
||||
return si;
|
||||
}
|
||||
|
||||
StaticInstPtr
|
||||
Decoder::decode(PCState &nextPC)
|
||||
{
|
||||
if (!instDone)
|
||||
return NULL;
|
||||
instDone = false;
|
||||
updateNPC(nextPC);
|
||||
|
||||
StaticInstPtr &si = instBytes->si;
|
||||
if (si)
|
||||
return si;
|
||||
|
||||
DecodeCache::InstMap::iterator iter = instMap.find(mach_inst);
|
||||
if (iter != instMap.end()) {
|
||||
si = iter->second;
|
||||
return si;
|
||||
// We didn't match in the AddrMap, but we still populated an entry. Fix
|
||||
// up its byte masks.
|
||||
const int chunkSize = sizeof(MachInst);
|
||||
|
||||
instBytes->lastOffset = offset;
|
||||
|
||||
Addr firstBasePC = basePC - (instBytes->chunks.size() - 1) * chunkSize;
|
||||
Addr firstOffset = origPC - firstBasePC;
|
||||
Addr totalSize = instBytes->lastOffset - firstOffset +
|
||||
(instBytes->chunks.size() - 1) * chunkSize;
|
||||
int start = firstOffset;
|
||||
instBytes->masks.clear();
|
||||
|
||||
while (totalSize) {
|
||||
int end = start + totalSize;
|
||||
end = (chunkSize < end) ? chunkSize : end;
|
||||
int size = end - start;
|
||||
int idx = instBytes->masks.size();
|
||||
|
||||
MachInst maskVal = mask(size * 8) << (start * 8);
|
||||
assert(maskVal);
|
||||
|
||||
instBytes->masks.push_back(maskVal);
|
||||
instBytes->chunks[idx] &= instBytes->masks[idx];
|
||||
totalSize -= size;
|
||||
start = 0;
|
||||
}
|
||||
|
||||
si = decodeInst(mach_inst);
|
||||
instMap[mach_inst] = si;
|
||||
si = decode(emi, origPC);
|
||||
return si;
|
||||
}
|
||||
|
||||
|
|
|
@ -32,6 +32,7 @@
|
|||
#define __ARCH_X86_DECODER_HH__
|
||||
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
|
||||
#include "arch/x86/regs/misc.hh"
|
||||
#include "arch/x86/types.hh"
|
||||
|
@ -58,9 +59,24 @@ class Decoder
|
|||
static const uint8_t SizeTypeToSize[3][10];
|
||||
|
||||
protected:
|
||||
// One entry of the address based decode cache: the raw bytes of an
// instruction as they were fetched, plus the decoded result for them.
struct InstBytes
|
||||
{
|
||||
// Decoded instruction for these bytes. Cleared by doFromCacheState() on a
// mismatch; when set, decode() returns it directly.
StaticInstPtr si;
|
||||
// The fetched chunks the instruction spans, in fetch order. decode()
// ANDs each one with the matching entry of masks.
std::vector<MachInst> chunks;
|
||||
// Per-chunk masks, built in decode(), selecting the bytes of each chunk
// that belong to this instruction.
std::vector<MachInst> masks;
|
||||
// Decoder offset recorded by decode() when the entry was populated;
// doFromCacheState() restores it on a cache hit.
int lastOffset;
|
||||
|
||||
InstBytes() : lastOffset(0)
|
||||
{}
|
||||
};
|
||||
|
||||
static InstBytes dummy;
|
||||
|
||||
ThreadContext * tc;
|
||||
//The bytes to be predecoded
|
||||
MachInst fetchChunk;
|
||||
InstBytes *instBytes;
|
||||
int chunkIdx;
|
||||
//The pc of the start of fetchChunk
|
||||
Addr basePC;
|
||||
//The pc the current instruction started at
|
||||
|
@ -69,9 +85,16 @@ class Decoder
|
|||
int offset;
|
||||
//The extended machine instruction being generated
|
||||
ExtMachInst emi;
|
||||
HandyM5Reg m5Reg;
|
||||
//Predecoding state
|
||||
X86Mode mode;
|
||||
X86SubMode submode;
|
||||
uint8_t altOp;
|
||||
uint8_t defOp;
|
||||
uint8_t altAddr;
|
||||
uint8_t defAddr;
|
||||
uint8_t stack;
|
||||
|
||||
inline uint8_t getNextByte()
|
||||
uint8_t getNextByte()
|
||||
{
|
||||
return ((uint8_t *)&fetchChunk)[offset];
|
||||
}
|
||||
|
@ -99,24 +122,35 @@ class Decoder
|
|||
consumeBytes(toGet);
|
||||
}
|
||||
|
||||
inline void consumeByte()
|
||||
// Called after offset has been advanced. If the current chunk has been used
// up, move on to the next chunk already stored in instBytes->chunks, or set
// outOfBytes when no further chunk is stored.
void updateOffsetState()
|
||||
{
|
||||
assert(offset <= sizeof(MachInst));
|
||||
if (offset == sizeof(MachInst)) {
|
||||
DPRINTF(Decoder, "At the end of a chunk, idx = %d, chunks = %d.\n",
|
||||
chunkIdx, instBytes->chunks.size());
|
||||
chunkIdx++;
|
||||
if (chunkIdx == instBytes->chunks.size()) {
|
||||
// No stored chunk left to continue with.
|
||||
outOfBytes = true;
|
||||
} else {
|
||||
// Continue at the start of the next stored chunk; basePC follows
|
||||
// along by one chunk width.
|
||||
offset = 0;
|
||||
fetchChunk = instBytes->chunks[chunkIdx];
|
||||
basePC += sizeof(MachInst);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void consumeByte()
|
||||
{
|
||||
offset++;
|
||||
assert(offset <= sizeof(MachInst));
|
||||
if(offset == sizeof(MachInst))
|
||||
outOfBytes = true;
|
||||
updateOffsetState();
|
||||
}
|
||||
|
||||
inline void consumeBytes(int numBytes)
|
||||
void consumeBytes(int numBytes)
|
||||
{
|
||||
offset += numBytes;
|
||||
assert(offset <= sizeof(MachInst));
|
||||
if(offset == sizeof(MachInst))
|
||||
outOfBytes = true;
|
||||
updateOffsetState();
|
||||
}
|
||||
|
||||
void doReset();
|
||||
|
||||
//State machine state
|
||||
protected:
|
||||
//Whether or not we're out of bytes
|
||||
|
@ -133,6 +167,7 @@ class Decoder
|
|||
|
||||
enum State {
|
||||
ResetState,
|
||||
FromCacheState,
|
||||
PrefixState,
|
||||
OpcodeState,
|
||||
ModRMState,
|
||||
|
@ -146,6 +181,8 @@ class Decoder
|
|||
State state;
|
||||
|
||||
//Functions to handle each of the states
|
||||
State doResetState();
|
||||
State doFromCacheState();
|
||||
State doPrefixState(uint8_t);
|
||||
State doOpcodeState(uint8_t);
|
||||
State doModRMState(uint8_t);
|
||||
|
@ -153,6 +190,20 @@ class Decoder
|
|||
State doDisplacementState();
|
||||
State doImmediateState();
|
||||
|
||||
protected:
|
||||
/// Caching for decoded instruction objects.
|
||||
|
||||
typedef MiscReg CacheKey;
|
||||
|
||||
typedef DecodeCache::AddrMap<Decoder::InstBytes> DecodePages;
|
||||
DecodePages *decodePages;
|
||||
typedef m5::hash_map<CacheKey, DecodePages *> AddrCacheMap;
|
||||
AddrCacheMap addrCacheMap;
|
||||
|
||||
DecodeCache::InstMap *instMap;
|
||||
typedef m5::hash_map<CacheKey, DecodeCache::InstMap *> InstCacheMap;
|
||||
static InstCacheMap instCacheMap;
|
||||
|
||||
public:
|
||||
Decoder(ThreadContext * _tc) :
|
||||
tc(_tc), basePC(0), origPC(0), offset(0),
|
||||
|
@ -160,9 +211,47 @@ class Decoder
|
|||
state(ResetState)
|
||||
{
|
||||
memset(&emi, 0, sizeof(emi));
|
||||
emi.mode.mode = LongMode;
|
||||
emi.mode.submode = SixtyFourBitMode;
|
||||
m5Reg = 0;
|
||||
mode = LongMode;
|
||||
submode = SixtyFourBitMode;
|
||||
emi.mode.mode = mode;
|
||||
emi.mode.submode = submode;
|
||||
altOp = 0;
|
||||
defOp = 0;
|
||||
altAddr = 0;
|
||||
defAddr = 0;
|
||||
stack = 0;
|
||||
instBytes = &dummy;
|
||||
decodePages = NULL;
|
||||
instMap = NULL;
|
||||
}
|
||||
|
||||
// Copy the fields of the given M5 register value that the predecoder uses
// into the decoder's own members, then select the decode caches associated
// with that register value, creating them on first use.
void setM5Reg(HandyM5Reg m5Reg)
|
||||
{
|
||||
mode = (X86Mode)(uint64_t)m5Reg.mode;
|
||||
submode = (X86SubMode)(uint64_t)m5Reg.submode;
|
||||
emi.mode.mode = mode;
|
||||
emi.mode.submode = submode;
|
||||
altOp = m5Reg.altOp;
|
||||
defOp = m5Reg.defOp;
|
||||
altAddr = m5Reg.altAddr;
|
||||
defAddr = m5Reg.defAddr;
|
||||
stack = m5Reg.stack;
|
||||
|
||||
// Address based cache, keyed by the register value. addrCacheMap is a
// non-static member, so these pages belong to this decoder instance.
// NOTE(review): the map stores raw pointers from new; no matching delete
// is visible here - confirm ownership/cleanup.
AddrCacheMap::iterator amIter = addrCacheMap.find(m5Reg);
|
||||
if (amIter != addrCacheMap.end()) {
|
||||
decodePages = amIter->second;
|
||||
} else {
|
||||
decodePages = new DecodePages;
|
||||
addrCacheMap[m5Reg] = decodePages;
|
||||
}
|
||||
|
||||
// Instruction map, keyed the same way. instCacheMap is a static member,
// so these maps are shared by all Decoder objects.
InstCacheMap::iterator imIter = instCacheMap.find(m5Reg);
|
||||
if (imIter != instCacheMap.end()) {
|
||||
instMap = imIter->second;
|
||||
} else {
|
||||
instMap = new DecodeCache::InstMap;
|
||||
instCacheMap[m5Reg] = instMap;
|
||||
}
|
||||
}
|
||||
|
||||
void reset()
|
||||
|
@ -218,11 +307,6 @@ class Decoder
|
|||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
/// Caching for decoded instruction objects.
|
||||
static DecodeCache::InstMap instMap;
|
||||
static DecodeCache::AddrMap<StaticInstPtr> decodePages;
|
||||
|
||||
public:
|
||||
StaticInstPtr decodeInst(ExtMachInst mach_inst);
|
||||
|
||||
|
@ -230,16 +314,7 @@ class Decoder
|
|||
/// @param mach_inst The binary instruction to decode.
|
||||
/// @retval A pointer to the corresponding StaticInst object.
|
||||
StaticInstPtr decode(ExtMachInst mach_inst, Addr addr);
|
||||
|
||||
StaticInstPtr
|
||||
decode(X86ISA::PCState &nextPC)
|
||||
{
|
||||
if (!instDone)
|
||||
return NULL;
|
||||
instDone = false;
|
||||
updateNPC(nextPC);
|
||||
return decode(emi, origPC);
|
||||
}
|
||||
StaticInstPtr decode(X86ISA::PCState &nextPC);
|
||||
};
|
||||
|
||||
} // namespace X86ISA
|
||||
|
|
|
@ -28,6 +28,7 @@
|
|||
* Authors: Gabe Black
|
||||
*/
|
||||
|
||||
#include "arch/x86/decoder.hh"
|
||||
#include "arch/x86/isa.hh"
|
||||
#include "arch/x86/tlb.hh"
|
||||
#include "cpu/base.hh"
|
||||
|
@ -39,7 +40,8 @@ namespace X86ISA
|
|||
|
||||
void
|
||||
ISA::updateHandyM5Reg(Efer efer, CR0 cr0,
|
||||
SegAttr csAttr, SegAttr ssAttr, RFLAGS rflags)
|
||||
SegAttr csAttr, SegAttr ssAttr, RFLAGS rflags,
|
||||
ThreadContext *tc)
|
||||
{
|
||||
HandyM5Reg m5reg = 0;
|
||||
if (efer.lma) {
|
||||
|
@ -94,6 +96,8 @@ ISA::updateHandyM5Reg(Efer efer, CR0 cr0,
|
|||
}
|
||||
|
||||
regVal[MISCREG_M5_REG] = m5reg;
|
||||
if (tc)
|
||||
tc->getDecoderPtr()->setM5Reg(m5reg);
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -184,7 +188,8 @@ ISA::setMiscReg(int miscReg, MiscReg val, ThreadContext * tc)
|
|||
newCR0,
|
||||
regVal[MISCREG_CS_ATTR],
|
||||
regVal[MISCREG_SS_ATTR],
|
||||
regVal[MISCREG_RFLAGS]);
|
||||
regVal[MISCREG_RFLAGS],
|
||||
tc);
|
||||
}
|
||||
break;
|
||||
case MISCREG_CR2:
|
||||
|
@ -225,7 +230,8 @@ ISA::setMiscReg(int miscReg, MiscReg val, ThreadContext * tc)
|
|||
regVal[MISCREG_CR0],
|
||||
newCSAttr,
|
||||
regVal[MISCREG_SS_ATTR],
|
||||
regVal[MISCREG_RFLAGS]);
|
||||
regVal[MISCREG_RFLAGS],
|
||||
tc);
|
||||
}
|
||||
break;
|
||||
case MISCREG_SS_ATTR:
|
||||
|
@ -233,7 +239,8 @@ ISA::setMiscReg(int miscReg, MiscReg val, ThreadContext * tc)
|
|||
regVal[MISCREG_CR0],
|
||||
regVal[MISCREG_CS_ATTR],
|
||||
val,
|
||||
regVal[MISCREG_RFLAGS]);
|
||||
regVal[MISCREG_RFLAGS],
|
||||
tc);
|
||||
break;
|
||||
// These segments always actually use their bases, or in other words
|
||||
// their effective bases must stay equal to their actual bases.
|
||||
|
@ -340,7 +347,8 @@ ISA::setMiscReg(int miscReg, MiscReg val, ThreadContext * tc)
|
|||
regVal[MISCREG_CR0],
|
||||
regVal[MISCREG_CS_ATTR],
|
||||
regVal[MISCREG_SS_ATTR],
|
||||
regVal[MISCREG_RFLAGS]);
|
||||
regVal[MISCREG_RFLAGS],
|
||||
tc);
|
||||
return;
|
||||
default:
|
||||
break;
|
||||
|
@ -363,7 +371,8 @@ ISA::unserialize(EventManager *em, Checkpoint * cp,
|
|||
regVal[MISCREG_CR0],
|
||||
regVal[MISCREG_CS_ATTR],
|
||||
regVal[MISCREG_SS_ATTR],
|
||||
regVal[MISCREG_RFLAGS]);
|
||||
regVal[MISCREG_RFLAGS],
|
||||
NULL);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -50,7 +50,8 @@ namespace X86ISA
|
|||
protected:
|
||||
MiscReg regVal[NUM_MISCREGS];
|
||||
void updateHandyM5Reg(Efer efer, CR0 cr0,
|
||||
SegAttr csAttr, SegAttr ssAttr, RFLAGS rflags);
|
||||
SegAttr csAttr, SegAttr ssAttr, RFLAGS rflags,
|
||||
ThreadContext *tc);
|
||||
|
||||
public:
|
||||
void clear();
|
||||
|
|
Loading…
Reference in a new issue