X86: Move address based decode caching in front of the predecoder.

The predecoder in x86 does a lot of work, most of which can be skipped if the
decoder cache is put in front of it.

Committed by: Nilay Vaish <nilay@cs.wisc.edu>
This commit is contained in:
Gabe Black 2013-01-04 19:00:44 -06:00
parent 63b10907ef
commit d1965af220
4 changed files with 245 additions and 71 deletions

View file

@ -38,10 +38,15 @@
namespace X86ISA
{
void Decoder::doReset()
Decoder::State
Decoder::doResetState()
{
origPC = basePC + offset;
DPRINTF(Decoder, "Setting origPC to %#x\n", origPC);
instBytes = &decodePages->lookup(origPC);
chunkIdx = 0;
emi.rex = 0;
emi.legacy = 0;
emi.opcode.num = 0;
@ -55,12 +60,17 @@ void Decoder::doReset()
emi.modRM = 0;
emi.sib = 0;
m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);
emi.mode.mode = m5Reg.mode;
emi.mode.submode = m5Reg.submode;
if (instBytes->si) {
return FromCacheState;
} else {
instBytes->chunks.clear();
return PrefixState;
}
}
void Decoder::process()
void
Decoder::process()
{
//This function drives the decoder state machine.
@ -70,15 +80,18 @@ void Decoder::process()
assert(!outOfBytes);
assert(!instDone);
if (state == ResetState)
state = doResetState();
if (state == FromCacheState) {
state = doFromCacheState();
} else {
instBytes->chunks.push_back(fetchChunk);
}
//While there's still something to do...
while(!instDone && !outOfBytes)
{
while (!instDone && !outOfBytes) {
uint8_t nextByte = getNextByte();
switch(state)
{
case ResetState:
doReset();
state = PrefixState;
switch (state) {
case PrefixState:
state = doPrefixState(nextByte);
break;
@ -105,9 +118,42 @@ void Decoder::process()
}
}
// Check the chunk that was just fetched against the chunk cached for the
// instruction being decoded, and choose the decoder's next state:
//  - PrefixState when the bytes differ and the predecoder has to run,
//  - FromCacheState when this chunk matched but more chunks must be checked,
//  - ResetState when every cached chunk matched and the cached result is used.
Decoder::State
Decoder::doFromCacheState()
{
    DPRINTF(Decoder, "Looking at cache state.\n");

    // Only the bytes selected by this chunk's mask take part in the compare.
    const bool chunkMatches =
        (fetchChunk & instBytes->masks[chunkIdx]) ==
        instBytes->chunks[chunkIdx];

    if (!chunkMatches) {
        DPRINTF(Decoder, "Decode cache miss.\n");
        // The cached chunks didn't match what was fetched. Fall back to the
        // predecoder: record the fetched chunk in place of the stale one and
        // drop any chunks after it, forget the cached instruction, and
        // restart from the first chunk at the instruction's original PC.
        instBytes->chunks[chunkIdx] = fetchChunk;
        instBytes->chunks.resize(chunkIdx + 1);
        instBytes->si = NULL;
        chunkIdx = 0;
        fetchChunk = instBytes->chunks[0];
        offset = origPC % sizeof(MachInst);
        basePC = origPC - offset;
        return PrefixState;
    }

    if (chunkIdx != instBytes->chunks.size() - 1) {
        // Everything matched so far, but there are more cached chunks to
        // check. Ask for the next chunk before continuing.
        chunkIdx++;
        outOfBytes = true;
        return FromCacheState;
    }

    // The last cached chunk matched, so use the cached result.
    instDone = true;
    offset = instBytes->lastOffset;
    // The instruction ended exactly at the end of the chunk.
    if (offset == sizeof(MachInst))
        outOfBytes = true;
    return ResetState;
}
//Either get a prefix and record it in the ExtMachInst, or send the
//state machine on to get the opcode(s).
Decoder::State Decoder::doPrefixState(uint8_t nextByte)
Decoder::State
Decoder::doPrefixState(uint8_t nextByte)
{
uint8_t prefix = Prefixes[nextByte];
State nextState = PrefixState;
@ -164,7 +210,8 @@ Decoder::State Decoder::doPrefixState(uint8_t nextByte)
//Load all the opcodes (currently up to 2) and then figure out
//what immediate and/or ModRM is needed.
Decoder::State Decoder::doOpcodeState(uint8_t nextByte)
Decoder::State
Decoder::doOpcodeState(uint8_t nextByte)
{
State nextState = ErrorState;
emi.opcode.num++;
@ -194,9 +241,9 @@ Decoder::State Decoder::doOpcodeState(uint8_t nextByte)
if (emi.rex.w)
logOpSize = 3; // 64 bit operand size
else if (emi.legacy.op)
logOpSize = m5Reg.altOp;
logOpSize = altOp;
else
logOpSize = m5Reg.defOp;
logOpSize = defOp;
//Set the actual op size
emi.opSize = 1 << logOpSize;
@ -205,16 +252,16 @@ Decoder::State Decoder::doOpcodeState(uint8_t nextByte)
//a fixed value at the decoder level.
int logAddrSize;
if(emi.legacy.addr)
logAddrSize = m5Reg.altAddr;
logAddrSize = altAddr;
else
logAddrSize = m5Reg.defAddr;
logAddrSize = defAddr;
//Set the actual address size
emi.addrSize = 1 << logAddrSize;
//Figure out the effective stack width. This can be overridden to
//a fixed value at the decoder level.
emi.stackSize = 1 << m5Reg.stack;
emi.stackSize = 1 << stack;
//Figure out how big of an immediate we'll retrieve based
//on the opcode.
@ -242,13 +289,14 @@ Decoder::State Decoder::doOpcodeState(uint8_t nextByte)
//Get the ModRM byte and determine what displacement, if any, there is.
//Also determine whether or not to get the SIB byte, displacement, or
//immediate next.
Decoder::State Decoder::doModRMState(uint8_t nextByte)
Decoder::State
Decoder::doModRMState(uint8_t nextByte)
{
State nextState = ErrorState;
ModRM modRM;
modRM = nextByte;
DPRINTF(Decoder, "Found modrm byte %#x.\n", nextByte);
if (m5Reg.defOp == 1) {
if (defOp == 1) {
//figure out 16 bit displacement size
if ((modRM.mod == 0 && modRM.rm == 6) || modRM.mod == 2)
displacementSize = 2;
@ -297,7 +345,8 @@ Decoder::State Decoder::doModRMState(uint8_t nextByte)
//Get the SIB byte. We don't do anything with it at this point, other
//than storing it in the ExtMachInst. Determine if we need to get a
//displacement or immediate next.
Decoder::State Decoder::doSIBState(uint8_t nextByte)
Decoder::State
Decoder::doSIBState(uint8_t nextByte)
{
State nextState = ErrorState;
emi.sib = nextByte;
@ -318,7 +367,8 @@ Decoder::State Decoder::doSIBState(uint8_t nextByte)
//Gather up the displacement, or at least as much of it
//as we can get.
Decoder::State Decoder::doDisplacementState()
Decoder::State
Decoder::doDisplacementState()
{
State nextState = ErrorState;
@ -365,7 +415,8 @@ Decoder::State Decoder::doDisplacementState()
//Gather up the immediate, or at least as much of it
//as we can get
Decoder::State Decoder::doImmediateState()
Decoder::State
Decoder::doImmediateState()
{
State nextState = ErrorState;
@ -408,24 +459,62 @@ Decoder::State Decoder::doImmediateState()
return nextState;
}
DecodeCache::InstMap Decoder::instMap;
DecodeCache::AddrMap<StaticInstPtr> Decoder::decodePages;
Decoder::InstBytes Decoder::dummy;
Decoder::InstCacheMap Decoder::instCacheMap;
StaticInstPtr
Decoder::decode(ExtMachInst mach_inst, Addr addr)
{
StaticInstPtr &si = decodePages.lookup(addr);
if (si && (si->machInst == mach_inst))
DecodeCache::InstMap::iterator iter = instMap->find(mach_inst);
if (iter != instMap->end())
return iter->second;
StaticInstPtr si = decodeInst(mach_inst);
(*instMap)[mach_inst] = si;
return si;
}
StaticInstPtr
Decoder::decode(PCState &nextPC)
{
if (!instDone)
return NULL;
instDone = false;
updateNPC(nextPC);
StaticInstPtr &si = instBytes->si;
if (si)
return si;
DecodeCache::InstMap::iterator iter = instMap.find(mach_inst);
if (iter != instMap.end()) {
si = iter->second;
return si;
// We didn't match in the AddrMap, but we still populated an entry. Fix
// up its byte masks.
const int chunkSize = sizeof(MachInst);
instBytes->lastOffset = offset;
Addr firstBasePC = basePC - (instBytes->chunks.size() - 1) * chunkSize;
Addr firstOffset = origPC - firstBasePC;
Addr totalSize = instBytes->lastOffset - firstOffset +
(instBytes->chunks.size() - 1) * chunkSize;
int start = firstOffset;
instBytes->masks.clear();
while (totalSize) {
int end = start + totalSize;
end = (chunkSize < end) ? chunkSize : end;
int size = end - start;
int idx = instBytes->masks.size();
MachInst maskVal = mask(size * 8) << (start * 8);
assert(maskVal);
instBytes->masks.push_back(maskVal);
instBytes->chunks[idx] &= instBytes->masks[idx];
totalSize -= size;
start = 0;
}
si = decodeInst(mach_inst);
instMap[mach_inst] = si;
si = decode(emi, origPC);
return si;
}

View file

@ -32,6 +32,7 @@
#define __ARCH_X86_DECODER_HH__
#include <cassert>
#include <vector>
#include "arch/x86/regs/misc.hh"
#include "arch/x86/types.hh"
@ -58,9 +59,24 @@ class Decoder
static const uint8_t SizeTypeToSize[3][10];
protected:
// Per-address decode cache entry: the raw bytes an instruction was built
// from, together with the instruction object decoded from them.
struct InstBytes
{
    // Decoded instruction for these bytes; null until one has been stored.
    StaticInstPtr si;
    // The fetched chunks the instruction's bytes were read from.
    std::vector<MachInst> chunks;
    // One mask per chunk, selecting the bytes that belong to the instruction.
    std::vector<MachInst> masks;
    // Decoder offset recorded once the instruction had been fully decoded.
    int lastOffset;

    InstBytes()
        : lastOffset(0)
    {
    }
};
static InstBytes dummy;
ThreadContext * tc;
//The bytes to be predecoded
MachInst fetchChunk;
InstBytes *instBytes;
int chunkIdx;
//The pc of the start of fetchChunk
Addr basePC;
//The pc the current instruction started at
@ -69,9 +85,16 @@ class Decoder
int offset;
//The extended machine instruction being generated
ExtMachInst emi;
HandyM5Reg m5Reg;
//Predecoding state
X86Mode mode;
X86SubMode submode;
uint8_t altOp;
uint8_t defOp;
uint8_t altAddr;
uint8_t defAddr;
uint8_t stack;
inline uint8_t getNextByte()
// Return the byte of fetchChunk located at the current offset.
uint8_t getNextByte()
{
    // View the chunk as an array of bytes and index it by offset.
    const uint8_t *chunkBytes =
        reinterpret_cast<const uint8_t *>(&fetchChunk);
    return chunkBytes[offset];
}
@ -99,24 +122,35 @@ class Decoder
consumeBytes(toGet);
}
inline void consumeByte()
// Called after offset has been advanced. When the offset has reached the end
// of the current chunk, either step to the next recorded chunk or flag that
// the decoder has run out of bytes.
void updateOffsetState()
{
    assert(offset <= sizeof(MachInst));

    // Still inside the current chunk: nothing to update.
    if (offset != sizeof(MachInst))
        return;

    DPRINTF(Decoder, "At the end of a chunk, idx = %d, chunks = %d.\n",
            chunkIdx, instBytes->chunks.size());
    chunkIdx++;

    // No further chunk has been recorded, so more bytes are needed.
    if (chunkIdx == instBytes->chunks.size()) {
        outOfBytes = true;
        return;
    }

    // Continue at the start of the next recorded chunk.
    offset = 0;
    fetchChunk = instBytes->chunks[chunkIdx];
    basePC += sizeof(MachInst);
}
void consumeByte()
{
offset++;
assert(offset <= sizeof(MachInst));
if(offset == sizeof(MachInst))
outOfBytes = true;
updateOffsetState();
}
inline void consumeBytes(int numBytes)
void consumeBytes(int numBytes)
{
offset += numBytes;
assert(offset <= sizeof(MachInst));
if(offset == sizeof(MachInst))
outOfBytes = true;
updateOffsetState();
}
void doReset();
//State machine state
protected:
//Whether or not we're out of bytes
@ -133,6 +167,7 @@ class Decoder
enum State {
ResetState,
FromCacheState,
PrefixState,
OpcodeState,
ModRMState,
@ -146,6 +181,8 @@ class Decoder
State state;
//Functions to handle each of the states
State doResetState();
State doFromCacheState();
State doPrefixState(uint8_t);
State doOpcodeState(uint8_t);
State doModRMState(uint8_t);
@ -153,6 +190,20 @@ class Decoder
State doDisplacementState();
State doImmediateState();
protected:
/// Caching for decoded instruction objects.
typedef MiscReg CacheKey;
typedef DecodeCache::AddrMap<Decoder::InstBytes> DecodePages;
DecodePages *decodePages;
typedef m5::hash_map<CacheKey, DecodePages *> AddrCacheMap;
AddrCacheMap addrCacheMap;
DecodeCache::InstMap *instMap;
typedef m5::hash_map<CacheKey, DecodeCache::InstMap *> InstCacheMap;
static InstCacheMap instCacheMap;
public:
Decoder(ThreadContext * _tc) :
tc(_tc), basePC(0), origPC(0), offset(0),
@ -160,9 +211,47 @@ class Decoder
state(ResetState)
{
memset(&emi, 0, sizeof(emi));
emi.mode.mode = LongMode;
emi.mode.submode = SixtyFourBitMode;
m5Reg = 0;
mode = LongMode;
submode = SixtyFourBitMode;
emi.mode.mode = mode;
emi.mode.submode = submode;
altOp = 0;
defOp = 0;
altAddr = 0;
defAddr = 0;
stack = 0;
instBytes = &dummy;
decodePages = NULL;
instMap = NULL;
}
// Load the decoder's mode and size settings from a new M5 register value and
// switch to the decode caches associated with that value, creating them the
// first time the value is seen.
void setM5Reg(HandyM5Reg m5Reg)
{
    // Operating mode, mirrored into the instruction being built.
    mode = (X86Mode)(uint64_t)m5Reg.mode;
    submode = (X86SubMode)(uint64_t)m5Reg.submode;
    emi.mode.mode = mode;
    emi.mode.submode = submode;

    // Default and alternate operand/address sizes, and the stack size.
    altOp = m5Reg.altOp;
    defOp = m5Reg.defOp;
    altAddr = m5Reg.altAddr;
    defAddr = m5Reg.defAddr;
    stack = m5Reg.stack;

    // Address-indexed cache for this register value (addrCacheMap).
    AddrCacheMap::iterator pagesPos = addrCacheMap.find(m5Reg);
    if (pagesPos == addrCacheMap.end()) {
        decodePages = new DecodePages;
        addrCacheMap[m5Reg] = decodePages;
    } else {
        decodePages = pagesPos->second;
    }

    // Instruction-indexed cache for this register value (instCacheMap).
    InstCacheMap::iterator instPos = instCacheMap.find(m5Reg);
    if (instPos == instCacheMap.end()) {
        instMap = new DecodeCache::InstMap;
        instCacheMap[m5Reg] = instMap;
    } else {
        instMap = instPos->second;
    }
}
void reset()
@ -218,11 +307,6 @@ class Decoder
}
}
protected:
/// Caching for decoded instruction objects.
static DecodeCache::InstMap instMap;
static DecodeCache::AddrMap<StaticInstPtr> decodePages;
public:
StaticInstPtr decodeInst(ExtMachInst mach_inst);
@ -230,16 +314,7 @@ class Decoder
/// @param mach_inst The binary instruction to decode.
/// @retval A pointer to the corresponding StaticInst object.
StaticInstPtr decode(ExtMachInst mach_inst, Addr addr);
StaticInstPtr
decode(X86ISA::PCState &nextPC)
{
if (!instDone)
return NULL;
instDone = false;
updateNPC(nextPC);
return decode(emi, origPC);
}
StaticInstPtr decode(X86ISA::PCState &nextPC);
};
} // namespace X86ISA

View file

@ -28,6 +28,7 @@
* Authors: Gabe Black
*/
#include "arch/x86/decoder.hh"
#include "arch/x86/isa.hh"
#include "arch/x86/tlb.hh"
#include "cpu/base.hh"
@ -39,7 +40,8 @@ namespace X86ISA
void
ISA::updateHandyM5Reg(Efer efer, CR0 cr0,
SegAttr csAttr, SegAttr ssAttr, RFLAGS rflags)
SegAttr csAttr, SegAttr ssAttr, RFLAGS rflags,
ThreadContext *tc)
{
HandyM5Reg m5reg = 0;
if (efer.lma) {
@ -94,6 +96,8 @@ ISA::updateHandyM5Reg(Efer efer, CR0 cr0,
}
regVal[MISCREG_M5_REG] = m5reg;
if (tc)
tc->getDecoderPtr()->setM5Reg(m5reg);
}
void
@ -184,7 +188,8 @@ ISA::setMiscReg(int miscReg, MiscReg val, ThreadContext * tc)
newCR0,
regVal[MISCREG_CS_ATTR],
regVal[MISCREG_SS_ATTR],
regVal[MISCREG_RFLAGS]);
regVal[MISCREG_RFLAGS],
tc);
}
break;
case MISCREG_CR2:
@ -225,7 +230,8 @@ ISA::setMiscReg(int miscReg, MiscReg val, ThreadContext * tc)
regVal[MISCREG_CR0],
newCSAttr,
regVal[MISCREG_SS_ATTR],
regVal[MISCREG_RFLAGS]);
regVal[MISCREG_RFLAGS],
tc);
}
break;
case MISCREG_SS_ATTR:
@ -233,7 +239,8 @@ ISA::setMiscReg(int miscReg, MiscReg val, ThreadContext * tc)
regVal[MISCREG_CR0],
regVal[MISCREG_CS_ATTR],
val,
regVal[MISCREG_RFLAGS]);
regVal[MISCREG_RFLAGS],
tc);
break;
// These segments always actually use their bases, or in other words
// their effective bases must stay equal to their actual bases.
@ -340,7 +347,8 @@ ISA::setMiscReg(int miscReg, MiscReg val, ThreadContext * tc)
regVal[MISCREG_CR0],
regVal[MISCREG_CS_ATTR],
regVal[MISCREG_SS_ATTR],
regVal[MISCREG_RFLAGS]);
regVal[MISCREG_RFLAGS],
tc);
return;
default:
break;
@ -363,7 +371,8 @@ ISA::unserialize(EventManager *em, Checkpoint * cp,
regVal[MISCREG_CR0],
regVal[MISCREG_CS_ATTR],
regVal[MISCREG_SS_ATTR],
regVal[MISCREG_RFLAGS]);
regVal[MISCREG_RFLAGS],
NULL);
}
}

View file

@ -50,7 +50,8 @@ namespace X86ISA
protected:
MiscReg regVal[NUM_MISCREGS];
void updateHandyM5Reg(Efer efer, CR0 cr0,
SegAttr csAttr, SegAttr ssAttr, RFLAGS rflags);
SegAttr csAttr, SegAttr ssAttr, RFLAGS rflags,
ThreadContext *tc);
public:
void clear();