x86 isa: This patch attempts an implementation of mwait.

Mwait works as follows:
1. A cpu monitors an address of interest (monitor instruction)
2. A cpu calls mwait - this loads the cache line into that cpu's cache.
3. The cpu goes to sleep.
4. When another processor requests write permission for the line, it is
   evicted from the sleeping cpu's cache. This eviction is forwarded to the
   sleeping cpu, which then wakes up.

Committed by: Nilay Vaish <nilay@cs.wisc.edu>
This commit is contained in:
Marc Orr 2014-11-06 05:42:22 -06:00
parent 3947f88d0f
commit bf80734b2c
26 changed files with 381 additions and 16 deletions

View file

@ -34,6 +34,7 @@ import m5
from m5.objects import *
from m5.defines import buildEnv
from Ruby import create_topology
from Ruby import send_evicts
#
# Note: the L1 Cache latency is only used by the sequencer on fast path hits
@ -101,7 +102,7 @@ def create_system(options, full_system, system, dma_ports, ruby_system):
l0_cntrl = L0Cache_Controller(version = i*num_cpus_per_cluster + j,
Icache = l0i_cache, Dcache = l0d_cache,
send_evictions = (options.cpu_type == "detailed"),
send_evictions = send_evicts(options),
clk_domain=system.cpu[i].clk_domain,
ruby_system = ruby_system)

View file

@ -32,6 +32,7 @@ import m5
from m5.objects import *
from m5.defines import buildEnv
from Ruby import create_topology
from Ruby import send_evicts
#
# Note: the L1 Cache latency is only used by the sequencer on fast path hits
@ -91,8 +92,7 @@ def create_system(options, full_system, system, dma_ports, ruby_system):
L1Icache = l1i_cache,
L1Dcache = l1d_cache,
l2_select_num_bits = l2_bits,
send_evictions = (
options.cpu_type == "detailed"),
send_evictions = send_evicts(options),
prefetcher = prefetcher,
ruby_system = ruby_system,
clk_domain=system.cpu[i].clk_domain,

View file

@ -32,6 +32,7 @@ import m5
from m5.objects import *
from m5.defines import buildEnv
from Ruby import create_topology
from Ruby import send_evicts
#
# Note: the cache latency is only used by the sequencer on fast path hits
@ -79,8 +80,7 @@ def create_system(options, full_system, system, dma_ports, ruby_system):
#
l1_cntrl = L1Cache_Controller(version = i,
cacheMemory = cache,
send_evictions = (
options.cpu_type == "detailed"),
send_evictions = send_evicts(options),
transitions_per_cycle = options.ports,
clk_domain=system.cpu[i].clk_domain,
ruby_system = ruby_system)

View file

@ -32,6 +32,7 @@ import m5
from m5.objects import *
from m5.defines import buildEnv
from Ruby import create_topology
from Ruby import send_evicts
#
# Note: the L1 Cache latency is only used by the sequencer on fast path hits
@ -89,8 +90,7 @@ def create_system(options, full_system, system, dma_ports, ruby_system):
L1Icache = l1i_cache,
L1Dcache = l1d_cache,
l2_select_num_bits = l2_bits,
send_evictions = (
options.cpu_type == "detailed"),
send_evictions = send_evicts(options),
transitions_per_cycle = options.ports,
clk_domain=system.cpu[i].clk_domain,
ruby_system = ruby_system)

View file

@ -32,6 +32,7 @@ import m5
from m5.objects import *
from m5.defines import buildEnv
from Ruby import create_topology
from Ruby import send_evicts
#
# Note: the L1 Cache latency is only used by the sequencer on fast path hits
@ -109,8 +110,7 @@ def create_system(options, full_system, system, dma_ports, ruby_system):
not options.disable_dyn_timeouts,
no_mig_atomic = not \
options.allow_atomic_migration,
send_evictions = (
options.cpu_type == "detailed"),
send_evictions = send_evicts(options),
transitions_per_cycle = options.ports,
clk_domain=system.cpu[i].clk_domain,
ruby_system = ruby_system)

View file

@ -32,6 +32,7 @@ import m5
from m5.objects import *
from m5.defines import buildEnv
from Ruby import create_topology
from Ruby import send_evicts
#
# Note: the L1 Cache latency is only used by the sequencer on fast path hits
@ -102,8 +103,7 @@ def create_system(options, full_system, system, dma_ports, ruby_system):
L2cache = l2_cache,
no_mig_atomic = not \
options.allow_atomic_migration,
send_evictions = (
options.cpu_type == "detailed"),
send_evictions = send_evicts(options),
transitions_per_cycle = options.ports,
clk_domain=system.cpu[i].clk_domain,
ruby_system = ruby_system)

View file

@ -233,6 +233,14 @@ def create_system(options, full_system, system, piobus = None, dma_ports = []):
ruby.num_of_sequencers = len(cpu_sequencers)
ruby.random_seed = options.random_seed
def send_evicts(options):
    """Tell the cache controllers whether to forward evictions to the CPU.

    Two scenarios currently need evictions forwarded:
      1. The O3 ("detailed") model must keep its LSQ coherent with the caches.
      2. The x86 mwait instruction is built on top of coherence invalidations.
    """
    wants_lsq_coherence = options.cpu_type == "detailed"
    return wants_lsq_coherence or buildEnv['TARGET_ISA'] == 'x86'
# Create a backing copy of physical memory in case required
if options.access_backing_store:
ruby.phys_mem = SimpleMemory(range=AddrRange(options.mem_size),

View file

@ -71,8 +71,20 @@
}
0x1: decode MODRM_MOD {
0x3: decode MODRM_RM {
0x0: monitor();
0x1: mwait();
0x0: MonitorInst::monitor({{
xc->armMonitor(Rax);
}});
0x1: MwaitInst::mwait({{
uint64_t m = 0; //mem
unsigned s = 0x8; //size
unsigned f = 0; //flags
readMemAtomic(xc, traceData,
xc->getAddrMonitor()->vAddr,
m, s, f);
xc->mwaitAtomic(xc->tcBase());
MicroHalt hltObj(machInst, mnemonic, 0x0);
hltObj.execute(xc, traceData);
}});
default: Inst::UD2();
}
default: sidt_Ms();

View file

@ -45,6 +45,9 @@
//Include a format to generate a CPUID instruction.
##include "cpuid.isa"
//Include a format to generate monitor/mwait instructions.
##include "monitor_mwait.isa"
//Include the "unknown" format
##include "unknown.isa"

View file

@ -0,0 +1,131 @@
// Copyright (c) AMD
// All rights reserved.
//
// Authors: Marc Orr
// Monitor Instruction
//
// Static-instruction base class named by the MonitorInst format. It adds no
// state of its own: the constructor forwards its arguments to
// X86ISA::X86StaticInst, and only the disassembly printer is customized.
output header {{
class MonitorInst : public X86ISA::X86StaticInst
{
public:
// NOTE(review): foldOBit is not referenced in the visible part of this
// file; presumably shared x86 register helpers read it -- confirm.
static const RegIndex foldOBit = 0;
/// Constructor
MonitorInst(const char *_mnemonic, ExtMachInst _machInst,
OpClass __opClass) :
X86ISA::X86StaticInst(_mnemonic, _machInst, __opClass)
{ }
/// Render the instruction as text; defined in the decoder output block.
std::string generateDisassembly(Addr pc,
const SymbolTable *symtab) const;
};
}};
// Disassembly for MonitorInst: prints the mnemonic, a single space, and then
// the first source register using the instruction's operand size. Neither
// the PC nor the symbol table argument is used.
output decoder {{
std::string MonitorInst::generateDisassembly(Addr PC,
const SymbolTable *symtab) const
{
std::stringstream response;
printMnemonic(response, mnemonic);
ccprintf(response, " ");
printReg(response, _srcRegIdx[0], machInst.opSize);
return response.str();
}
}};
// Format wrapper for monitor: builds the instruction parameters with
// 'MonitorInst' as the base class and emits the Basic declare, constructor,
// decode and execute templates for it.
def format MonitorInst(code, *opt_flags) {{
iop = InstObjParams(name, Name, 'MonitorInst', code, opt_flags)
header_output = BasicDeclare.subst(iop)
decoder_output = BasicConstructor.subst(iop)
decode_block = BasicDecode.subst(iop)
exec_output = BasicExecute.subst(iop)
}};
// Mwait instruction
// Declarations for execute() methods.
// Prototypes for the three entry points an mwait instruction class exposes:
// execute(), initiateAcc() and completeAcc(). All three return a Fault.
def template MwaitExecDeclare {{
Fault execute(%(CPU_exec_context)s *, Trace::InstRecord *) const;
Fault initiateAcc(%(CPU_exec_context)s *, Trace::InstRecord *) const;
Fault completeAcc(PacketPtr, %(CPU_exec_context)s *,
Trace::InstRecord *) const;
}};
// Class declaration template for mwait instructions: a constructor taking
// the machine instruction, followed by the execute()/initiateAcc()/
// completeAcc() prototypes substituted in from MwaitExecDeclare.
def template MwaitDeclare {{
class %(class_name)s : public %(base_class)s
{
public:
// Constructor.
%(class_name)s(ExtMachInst machInst);
%(MwaitExecDeclare)s
};
}};
// initiateAcc() for mwait: issues an 8-byte read (flags 0) of the monitored
// virtual address through readMemTiming.
// NOTE(review): whatever readMemTiming returns is discarded and NoFault is
// returned unconditionally -- confirm a failed access cannot be lost here.
def template MwaitInitiateAcc {{
Fault %(class_name)s::initiateAcc(CPU_EXEC_CONTEXT * xc,
Trace::InstRecord * traceData) const
{
uint64_t m = 0; //mem
unsigned s = 0x8; //size
unsigned f = 0; //flags
readMemTiming(xc, traceData, xc->getAddrMonitor()->vAddr, m, s, f);
return NoFault;
}
}};
// completeAcc() for mwait: hands the packet (presumably the response to the
// read issued by initiateAcc() -- confirm) to xc->mwait(). A MicroHalt is
// executed only when mwait() returns true; otherwise the halt is skipped.
// NoFault is always returned.
def template MwaitCompleteAcc {{
Fault %(class_name)s::completeAcc(PacketPtr pkt, CPU_EXEC_CONTEXT *xc,
Trace::InstRecord *traceData) const
{
MicroHalt hltObj(machInst, mnemonic, 0x0);
if(xc->mwait(pkt)) {
hltObj.execute(xc, traceData);
}
return NoFault;
}
}};
// Static-instruction base class named by the MwaitInst format. Besides
// forwarding its arguments to X86ISA::X86StaticInst, the constructor marks
// the instruction as a memory reference and as a load.
output header {{
class MwaitInst : public X86ISA::X86StaticInst
{
public:
// NOTE(review): foldOBit is not referenced in the visible part of this
// file; presumably shared x86 register helpers read it -- confirm.
static const RegIndex foldOBit = 0;
/// Constructor
MwaitInst(const char *_mnemonic, ExtMachInst _machInst,
OpClass __opClass) :
X86ISA::X86StaticInst(_mnemonic, _machInst, __opClass)
{
// mwait performs a read of the monitored address, so flag it as a load.
flags[IsMemRef] = 1;
flags[IsLoad] = 1;
}
/// Render the instruction as text; defined in the decoder output block.
std::string generateDisassembly(Addr pc,
const SymbolTable *symtab) const;
};
}};
// Disassembly for MwaitInst: prints the mnemonic, a single space, and then
// the first source register using the instruction's operand size. Neither
// the PC nor the symbol table argument is used.
output decoder {{
std::string MwaitInst::generateDisassembly(Addr PC,
const SymbolTable *symtab) const
{
std::stringstream response;
printMnemonic(response, mnemonic);
ccprintf(response, " ");
printReg(response, _srcRegIdx[0], machInst.opSize);
return response.str();
}
}};
// Format wrapper for mwait: builds the instruction parameters with
// 'MwaitInst' as the base class, declares the class with MwaitDeclare, and
// appends the initiateAcc()/completeAcc() definitions after the
// BasicExecute output.
def format MwaitInst(code, *opt_flags) {{
iop = InstObjParams(name, Name, 'MwaitInst', code, opt_flags)
header_output = MwaitDeclare.subst(iop)
decoder_output = BasicConstructor.subst(iop)
decode_block = BasicDecode.subst(iop)
exec_output = BasicExecute.subst(iop)
exec_output += MwaitInitiateAcc.subst(iop)
exec_output += MwaitCompleteAcc.subst(iop)
}};

View file

@ -102,6 +102,7 @@ DebugFlag('IntrControl')
DebugFlag('O3PipeView')
DebugFlag('PCEvent')
DebugFlag('Quiesce')
DebugFlag('Mwait')
CompoundFlag('ExecAll', [ 'ExecEnable', 'ExecCPSeq', 'ExecEffAddr',
'ExecFaulting', 'ExecFetchSeq', 'ExecOpClass', 'ExecRegDelta',

View file

@ -55,12 +55,14 @@
#include "base/misc.hh"
#include "base/output.hh"
#include "base/trace.hh"
#include "cpu/base.hh"
#include "cpu/checker/cpu.hh"
#include "cpu/base.hh"
#include "cpu/cpuevent.hh"
#include "cpu/profile.hh"
#include "cpu/thread_context.hh"
#include "debug/Mwait.hh"
#include "debug/SyscallVerbose.hh"
#include "mem/page_table.hh"
#include "params/BaseCPU.hh"
#include "sim/full_system.hh"
#include "sim/process.hh"
@ -123,7 +125,8 @@ BaseCPU::BaseCPU(Params *p, bool is_checker)
_taskId(ContextSwitchTaskId::Unknown), _pid(Request::invldPid),
_switchedOut(p->switched_out), _cacheLineSize(p->system->cacheLineSize()),
interrupts(p->interrupts), profileEvent(NULL),
numThreads(p->numThreads), system(p->system)
numThreads(p->numThreads), system(p->system),
addressMonitor()
{
// if Python did not provide a valid ID, do it here
if (_cpuId == -1 ) {
@ -260,6 +263,63 @@ BaseCPU::~BaseCPU()
delete[] comInstEventQueue;
}
/**
 * Arm this CPU's address monitor on a virtual address.
 *
 * @param address Virtual address to monitor.
 */
void
BaseCPU::armMonitor(Addr address)
{
    // The physical address is unknown at this point; mwait()/mwaitAtomic()
    // fill it in later, so reset it here.
    addressMonitor.pAddr = 0x0;
    addressMonitor.vAddr = address;
    addressMonitor.armed = true;

    DPRINTF(Mwait,"Armed monitor (vAddr=0x%lx)\n", address);
}
/**
 * Timing-mode mwait: record the cache line being waited on.
 *
 * @param pkt Packet carrying the physical address of the monitored location.
 * @return true if the CPU should go on to wait; false if a wakeup had
 *         already been recorded (the pending wakeup is consumed).
 */
bool
BaseCPU::mwait(PacketPtr pkt)
{
    // A wakeup arrived before we got here: consume it and do not wait.
    if (addressMonitor.gotWakeup) {
        addressMonitor.gotWakeup = false;
        return false;
    }

    const int line_bytes = cacheLineSize();
    const uint64_t line_mask = ~((uint64_t)(line_bytes - 1));

    assert(pkt->req->hasPaddr());
    // Track the whole cache line, not just the requested bytes.
    addressMonitor.pAddr = pkt->getAddr() & line_mask;
    addressMonitor.waiting = true;

    DPRINTF(Mwait,"mwait called (vAddr=0x%lx, line's paddr=0x%lx)\n",
            addressMonitor.vAddr, addressMonitor.pAddr);
    return true;
}
/**
 * Atomic-mode mwait: translate the monitored virtual address and record the
 * physical cache line being waited on.
 *
 * @param tc  Thread context used for the translation.
 * @param dtb Data TLB that performs the translation.
 */
void
BaseCPU::mwaitAtomic(ThreadContext *tc, TheISA::TLB *dtb)
{
    const Addr vaddr = addressMonitor.vAddr;
    const int line_bytes = cacheLineSize();
    const uint64_t line_mask = ~((uint64_t)(line_bytes - 1));

    // Keep the request inside a single cache line: if its last byte would
    // land in a later line, stop at that line's base address.
    int req_bytes = line_bytes;
    const Addr next_line = roundDown(vaddr + req_bytes - 1, line_bytes);
    if (next_line > vaddr)
        req_bytes = next_line - vaddr;

    Request req;
    req.setVirt(0, vaddr, req_bytes, 0x0, dataMasterId(), tc->instAddr());

    // translate to physical address
    Fault fault = dtb->translateAtomic(&req, tc, BaseTLB::Read);
    assert(fault == NoFault);

    addressMonitor.pAddr = req.getPaddr() & line_mask;
    addressMonitor.waiting = true;

    DPRINTF(Mwait,"mwait called (vAddr=0x%lx, line's paddr=0x%lx)\n",
            addressMonitor.vAddr, addressMonitor.pAddr);
}
void
BaseCPU::init()
{
@ -618,6 +678,25 @@ BaseCPU::scheduleInstStop(ThreadID tid, Counter insts, const char *cause)
comInstEventQueue[tid]->schedule(event, now + insts);
}
/**
 * Construct a disarmed monitor with no pending wait or wakeup.
 *
 * Every member is initialized, in declaration order. The addresses and the
 * value start at zero so that code reading the monitor before it has been
 * armed (for example an mwait with no preceding monitor) sees a defined
 * value rather than an indeterminate one.
 */
AddressMonitor::AddressMonitor()
    : armed(false), vAddr(0), pAddr(0), val(0),
      waiting(false), gotWakeup(false)
{
}
/**
 * Check a snooped packet against the monitored line.
 *
 * @param pkt Snooped packet; its request must carry a physical address.
 * @return true, after clearing the waiting flag, when the monitor is armed
 *         and waiting and the packet address equals the monitored physical
 *         address; false otherwise.
 */
bool
AddressMonitor::doMonitor(PacketPtr pkt)
{
    assert(pkt->req->hasPaddr());

    // The address is only compared while a wait is actually in progress.
    const bool hit = armed && waiting && pAddr == pkt->getAddr();
    if (!hit)
        return false;

    DPRINTF(Mwait,"pAddr=0x%lx invalidated: waking up core\n",
            pkt->getAddr());
    waiting = false;
    return true;
}
void
BaseCPU::scheduleLoadStop(ThreadID tid, Counter loads, const char *cause)
{

View file

@ -64,11 +64,26 @@
#include "sim/insttracer.hh"
#include "sim/probe/pmu.hh"
#include "sim/system.hh"
#include "debug/Mwait.hh"
class BaseCPU;
struct BaseCPUParams;
class CheckerCPU;
class ThreadContext;
/**
 * Per-CPU state backing the x86 monitor/mwait instructions.
 */
struct AddressMonitor
{
/** Start disarmed, not waiting, and with no pending wakeup. */
AddressMonitor();
/**
 * Check a snooped packet against the monitored line.
 * @return true (after clearing waiting) when the monitor is armed and
 *         waiting and the packet address equals pAddr; false otherwise.
 */
bool doMonitor(PacketPtr pkt);
// Set to true by BaseCPU::armMonitor().
bool armed;
// Virtual address handed to BaseCPU::armMonitor().
Addr vAddr;
// Cache-line-aligned physical address recorded by BaseCPU::mwait() and
// BaseCPU::mwaitAtomic(); reset to 0 by armMonitor().
Addr pAddr;
// NOTE(review): not read or written by the visible code -- confirm purpose.
uint64_t val;
bool waiting; // 0=normal, 1=mwaiting
// Set by BaseSimpleCPU::wakeup(); tested and cleared by BaseCPU::mwait().
bool gotWakeup;
};
class CPUProgressEvent : public Event
{
protected:
@ -536,6 +551,16 @@ class BaseCPU : public MemObject
Stats::Scalar numCycles;
Stats::Scalar numWorkItemsStarted;
Stats::Scalar numWorkItemsCompleted;
private:
// Monitor/mwait state for this CPU.
AddressMonitor addressMonitor;
public:
// Arm the monitor on a virtual address.
void armMonitor(Addr address);
// Timing-mode mwait: records the monitored line from pkt. Returns false
// when a wakeup was already pending, true otherwise.
bool mwait(PacketPtr pkt);
// Atomic-mode mwait: translates the monitored address through dtb and
// records the resulting line.
void mwaitAtomic(ThreadContext *tc, TheISA::TLB *dtb);
// Direct access to the monitor state (used by the snoop handlers).
AddressMonitor *getCpuAddrMonitor() { return &addressMonitor; }
// NOTE(review): no definition of atomicNotify is visible here -- confirm it
// is implemented elsewhere or drop the declaration.
void atomicNotify(Addr address);
};
#endif // THE_ISA == NULL_ISA

View file

@ -853,6 +853,14 @@ class BaseDynInst : public ExecContext, public RefCounted
/** Sets the number of consecutive store conditional failures. */
void setStCondFailures(unsigned int sc_failures)
{ thread->storeCondFailures = sc_failures; }
public:
// monitor/mwait functions
// Each of these simply forwards to the owning CPU; mwaitAtomic() supplies
// the CPU's data TLB for the address translation.
void armMonitor(Addr address) { cpu->armMonitor(address); }
bool mwait(PacketPtr pkt) { return cpu->mwait(pkt); }
void mwaitAtomic(ThreadContext *tc)
{ return cpu->mwaitAtomic(tc, cpu->dtb); }
AddressMonitor *getAddrMonitor() { return cpu->getCpuAddrMonitor(); }
};
template<class Impl>

View file

@ -349,6 +349,13 @@ class CheckerCPU : public BaseCPU, public ExecContext
this->dtb->demapPage(vaddr, asn);
}
// monitor/mwait functions
// Each of these forwards to the BaseCPU implementation; mwaitAtomic()
// supplies the checker thread's data TLB for the address translation.
virtual void armMonitor(Addr address) { BaseCPU::armMonitor(address); }
bool mwait(PacketPtr pkt) { return BaseCPU::mwait(pkt); }
void mwaitAtomic(ThreadContext *tc)
{ return BaseCPU::mwaitAtomic(tc, thread->dtb); }
AddressMonitor *getAddrMonitor() { return BaseCPU::getCpuAddrMonitor(); }
void demapInstPage(Addr vaddr, uint64_t asn)
{
this->itb->demapPage(vaddr, asn);

View file

@ -47,6 +47,7 @@
#include "arch/registers.hh"
#include "base/types.hh"
#include "config/the_isa.hh"
#include "cpu/base.hh"
#include "cpu/static_inst_fwd.hh"
#include "cpu/translation.hh"
@ -243,6 +244,10 @@ class ExecContext {
* Invalidate a page in the DTLB <i>and</i> ITLB.
*/
virtual void demapPage(Addr vaddr, uint64_t asn) = 0;
/** Arm the address monitor on the given virtual address (x86 monitor). */
virtual void armMonitor(Addr address) = 0;
/** Timing-mode mwait hook, called with the packet for the monitored read. */
virtual bool mwait(PacketPtr pkt) = 0;
/** Atomic-mode mwait hook. */
virtual void mwaitAtomic(ThreadContext *tc) = 0;
/** Access the address-monitor state used by monitor/mwait. */
virtual AddressMonitor *getAddrMonitor() = 0;
/** @} */

View file

@ -602,3 +602,25 @@ InOrderDynInst::dump(std::string &outstring)
outstring = s.str();
}
/** Forward the monitor request to the owning CPU. */
void
InOrderDynInst::armMonitor(Addr address) {
cpu->armMonitor(address);
}
/** Forward the timing-mode mwait to the owning CPU and return its result. */
bool
InOrderDynInst::mwait(PacketPtr pkt) {
return cpu->mwait(pkt);
}
/**
 * Forward the atomic-mode mwait to the owning CPU, passing the data TLB
 * obtained from cpu->getDTBPtr() for the address translation.
 */
void
InOrderDynInst::mwaitAtomic(ThreadContext *tc)
{
return cpu->mwaitAtomic(tc, cpu->getDTBPtr());
}
/** Return the owning CPU's address-monitor state. */
AddressMonitor *
InOrderDynInst::getAddrMonitor()
{
return cpu->getCpuAddrMonitor();
}

View file

@ -1077,6 +1077,13 @@ class InOrderDynInst : public ExecContext, public RefCounted
void demapPage(Addr vaddr, uint64_t asn) {
panic("demapPage unimplemented");
}
public:
// monitor/mwait functions
// Declarations only; the definitions forward to the owning CPU.
void armMonitor(Addr address);
bool mwait(PacketPtr pkt);
void mwaitAtomic(ThreadContext *tc);
AddressMonitor *getAddrMonitor();
};

View file

@ -340,6 +340,15 @@ class ExecContext : public ::ExecContext
- TheISA::Misc_Reg_Base, val);
}
}
public:
// monitor/mwait functions
// Each of these forwards to the CPU returned by getCpuPtr(); mwaitAtomic()
// supplies this thread's data TLB for the address translation.
void armMonitor(Addr address) { getCpuPtr()->armMonitor(address); }
bool mwait(PacketPtr pkt) { return getCpuPtr()->mwait(pkt); }
void mwaitAtomic(ThreadContext *tc)
{ return getCpuPtr()->mwaitAtomic(tc, thread.dtb); }
AddressMonitor *getAddrMonitor()
{ return getCpuPtr()->getCpuAddrMonitor(); }
};
}

View file

@ -117,6 +117,10 @@ template <class Impl>
void
FullO3CPU<Impl>::DcachePort::recvTimingSnoopReq(PacketPtr pkt)
{
    // X86 ISA: a snooped invalidation may hit the line a monitor/mwait pair
    // is waiting on; if it does, wake the core before the LSQ sees the snoop.
    const bool wake_core = cpu->getCpuAddrMonitor()->doMonitor(pkt);
    if (wake_core)
        cpu->wakeup();

    lsq->recvTimingSnoopReq(pkt);
}

View file

@ -162,11 +162,13 @@ class FullO3CPU : public BaseO3CPU
/** Pointer to LSQ. */
LSQ<Impl> *lsq;
FullO3CPU<Impl> *cpu;
public:
/** Default constructor. */
DcachePort(LSQ<Impl> *_lsq, FullO3CPU<Impl>* _cpu)
: MasterPort(_cpu->name() + ".dcache_port", _cpu), lsq(_lsq)
: MasterPort(_cpu->name() + ".dcache_port", _cpu), lsq(_lsq),
cpu(_cpu)
{ }
protected:

View file

@ -272,6 +272,12 @@ AtomicSimpleCPU::AtomicCPUDPort::recvAtomicSnoop(PacketPtr pkt)
DPRINTF(SimpleCPU, "received snoop pkt for addr:%#x %s\n", pkt->getAddr(),
pkt->cmdString());
// X86 ISA: Snooping an invalidation for monitor/mwait
AtomicSimpleCPU *cpu = (AtomicSimpleCPU *)(&owner);
if(cpu->getAddrMonitor()->doMonitor(pkt)) {
cpu->wakeup();
}
// if snoop invalidates, release any associated locks
if (pkt->isInvalidate()) {
DPRINTF(SimpleCPU, "received invalidation for addr:%#x\n",
@ -288,6 +294,12 @@ AtomicSimpleCPU::AtomicCPUDPort::recvFunctionalSnoop(PacketPtr pkt)
DPRINTF(SimpleCPU, "received snoop pkt for addr:%#x %s\n", pkt->getAddr(),
pkt->cmdString());
// X86 ISA: Snooping an invalidation for monitor/mwait
AtomicSimpleCPU *cpu = (AtomicSimpleCPU *)(&owner);
if(cpu->getAddrMonitor()->doMonitor(pkt)) {
cpu->wakeup();
}
// if snoop invalidates, release any associated locks
if (pkt->isInvalidate()) {
DPRINTF(SimpleCPU, "received invalidation for addr:%#x\n",

View file

@ -347,6 +347,8 @@ BaseSimpleCPU::dbg_vtophys(Addr addr)
void
BaseSimpleCPU::wakeup()
{
getAddrMonitor()->gotWakeup = true;
if (thread->status() != ThreadContext::Suspended)
return;

View file

@ -462,6 +462,14 @@ class BaseSimpleCPU : public BaseCPU, public ExecContext
private:
TheISA::PCState pred_pc;
public:
// monitor/mwait functions
// Each of these forwards to the BaseCPU implementation; mwaitAtomic()
// supplies this CPU's thread data TLB for the address translation.
void armMonitor(Addr address) { BaseCPU::armMonitor(address); }
bool mwait(PacketPtr pkt) { return BaseCPU::mwait(pkt); }
void mwaitAtomic(ThreadContext *tc)
{ return BaseCPU::mwaitAtomic(tc, thread->dtb); }
AddressMonitor *getAddrMonitor() { return BaseCPU::getCpuAddrMonitor(); }
};
#endif // __CPU_SIMPLE_BASE_HH__

View file

@ -58,6 +58,8 @@
#include "sim/full_system.hh"
#include "sim/system.hh"
#include "debug/Mwait.hh"
using namespace std;
using namespace TheISA;
@ -818,9 +820,21 @@ TimingSimpleCPU::updateCycleCounts()
void
TimingSimpleCPU::DcachePort::recvTimingSnoopReq(PacketPtr pkt)
{
    // X86 ISA: a snooped invalidation may hit the line a monitor/mwait pair
    // is waiting on; if it does, wake the CPU.
    const bool wake_core = cpu->getAddrMonitor()->doMonitor(pkt);
    if (wake_core)
        cpu->wakeup();

    TheISA::handleLockedSnoop(cpu->thread, pkt, cacheBlockMask);
}
void
TimingSimpleCPU::DcachePort::recvFunctionalSnoop(PacketPtr pkt)
{
    // X86 ISA: a snooped invalidation may hit the line a monitor/mwait pair
    // is waiting on; if it does, wake the CPU.
    const bool wake_core = cpu->getAddrMonitor()->doMonitor(pkt);
    if (wake_core)
        cpu->wakeup();
}
bool
TimingSimpleCPU::DcachePort::recvTimingResp(PacketPtr pkt)

View file

@ -228,11 +228,16 @@ class TimingSimpleCPU : public BaseSimpleCPU
* a wakeup event on a cpu that is monitoring an address
*/
virtual void recvTimingSnoopReq(PacketPtr pkt);
virtual void recvFunctionalSnoop(PacketPtr pkt);
virtual bool recvTimingResp(PacketPtr pkt);
virtual void recvRetry();
virtual bool isSnooping() const {
return true;
}
struct DTickEvent : public TickEvent
{
DTickEvent(TimingSimpleCPU *_cpu)