From fafa83ed32933fe250d34dfca23fba348429b176 Mon Sep 17 00:00:00 2001 From: Mitch Hayenga Date: Wed, 30 Sep 2015 11:14:19 -0500 Subject: [PATCH] cpu: Add per-thread monitors Adds per-thread address monitors to support FullSystem SMT. --- src/cpu/base.cc | 47 +++++++++++++++++++------------- src/cpu/base.hh | 15 ++++++----- src/cpu/base_dyn_inst.hh | 9 ++++--- src/cpu/checker/cpu.hh | 8 +++--- src/cpu/minor/exec_context.hh | 8 +++--- src/cpu/minor/fetch1.cc | 3 ++- src/cpu/minor/lsq.cc | 3 ++- src/cpu/o3/cpu.cc | 7 ++--- src/cpu/simple/atomic.cc | 49 +++++++++++++++++++++++++++------- src/cpu/simple/atomic.hh | 3 +++ src/cpu/simple/base.cc | 3 +-- src/cpu/simple/exec_context.hh | 8 +++--- src/cpu/simple/timing.cc | 28 ++++++++++++++----- src/cpu/simple/timing.hh | 1 + 14 files changed, 128 insertions(+), 64 deletions(-) diff --git a/src/cpu/base.cc b/src/cpu/base.cc index 77ac5f2bb..3b0809d09 100644 --- a/src/cpu/base.cc +++ b/src/cpu/base.cc @@ -133,7 +133,7 @@ BaseCPU::BaseCPU(Params *p, bool is_checker) numThreads(p->numThreads), system(p->system), functionTraceStream(nullptr), currentFunctionStart(0), currentFunctionEnd(0), functionEntryTick(0), - addressMonitor() + addressMonitor(p->numThreads) { // if Python did not provide a valid ID, do it here if (_cpuId == -1 ) { @@ -271,39 +271,48 @@ BaseCPU::~BaseCPU() } void -BaseCPU::armMonitor(Addr address) +BaseCPU::armMonitor(ThreadID tid, Addr address) { - addressMonitor.armed = true; - addressMonitor.vAddr = address; - addressMonitor.pAddr = 0x0; - DPRINTF(Mwait,"Armed monitor (vAddr=0x%lx)\n", address); + assert(tid < numThreads); + AddressMonitor &monitor = addressMonitor[tid]; + + monitor.armed = true; + monitor.vAddr = address; + monitor.pAddr = 0x0; + DPRINTF(Mwait,"[tid:%d] Armed monitor (vAddr=0x%lx)\n", tid, address); } bool -BaseCPU::mwait(PacketPtr pkt) +BaseCPU::mwait(ThreadID tid, PacketPtr pkt) { - if(addressMonitor.gotWakeup == false) { + assert(tid < numThreads); + AddressMonitor &monitor = 
addressMonitor[tid]; + + if(monitor.gotWakeup == false) { int block_size = cacheLineSize(); uint64_t mask = ~((uint64_t)(block_size - 1)); assert(pkt->req->hasPaddr()); - addressMonitor.pAddr = pkt->getAddr() & mask; - addressMonitor.waiting = true; + monitor.pAddr = pkt->getAddr() & mask; + monitor.waiting = true; - DPRINTF(Mwait,"mwait called (vAddr=0x%lx, line's paddr=0x%lx)\n", - addressMonitor.vAddr, addressMonitor.pAddr); + DPRINTF(Mwait,"[tid:%d] mwait called (vAddr=0x%lx, " + "line's paddr=0x%lx)\n", tid, monitor.vAddr, monitor.pAddr); return true; } else { - addressMonitor.gotWakeup = false; + monitor.gotWakeup = false; return false; } } void -BaseCPU::mwaitAtomic(ThreadContext *tc, TheISA::TLB *dtb) +BaseCPU::mwaitAtomic(ThreadID tid, ThreadContext *tc, TheISA::TLB *dtb) { + assert(tid < numThreads); + AddressMonitor &monitor = addressMonitor[tid]; + Request req; - Addr addr = addressMonitor.vAddr; + Addr addr = monitor.vAddr; int block_size = cacheLineSize(); uint64_t mask = ~((uint64_t)(block_size - 1)); int size = block_size; @@ -320,11 +329,11 @@ BaseCPU::mwaitAtomic(ThreadContext *tc, TheISA::TLB *dtb) Fault fault = dtb->translateAtomic(&req, tc, BaseTLB::Read); assert(fault == NoFault); - addressMonitor.pAddr = req.getPaddr() & mask; - addressMonitor.waiting = true; + monitor.pAddr = req.getPaddr() & mask; + monitor.waiting = true; - DPRINTF(Mwait,"mwait called (vAddr=0x%lx, line's paddr=0x%lx)\n", - addressMonitor.vAddr, addressMonitor.pAddr); + DPRINTF(Mwait,"[tid:%d] mwait called (vAddr=0x%lx, line's paddr=0x%lx)\n", + tid, monitor.vAddr, monitor.pAddr); } void diff --git a/src/cpu/base.hh b/src/cpu/base.hh index 3a10841e0..0286ac45b 100644 --- a/src/cpu/base.hh +++ b/src/cpu/base.hh @@ -559,14 +559,17 @@ class BaseCPU : public MemObject Stats::Scalar numWorkItemsCompleted; private: - AddressMonitor addressMonitor; + std::vector<AddressMonitor> addressMonitor; public: - void armMonitor(Addr address); - bool mwait(PacketPtr pkt); - void mwaitAtomic(ThreadContext 
*tc, TheISA::TLB *dtb); - AddressMonitor *getCpuAddrMonitor() { return &addressMonitor; } - void atomicNotify(Addr address); + void armMonitor(ThreadID tid, Addr address); + bool mwait(ThreadID tid, PacketPtr pkt); + void mwaitAtomic(ThreadID tid, ThreadContext *tc, TheISA::TLB *dtb); + AddressMonitor *getCpuAddrMonitor(ThreadID tid) + { + assert(tid < numThreads); + return &addressMonitor[tid]; + } }; #endif // THE_ISA == NULL_ISA diff --git a/src/cpu/base_dyn_inst.hh b/src/cpu/base_dyn_inst.hh index c2ef253a7..77117b892 100644 --- a/src/cpu/base_dyn_inst.hh +++ b/src/cpu/base_dyn_inst.hh @@ -863,11 +863,12 @@ class BaseDynInst : public ExecContext, public RefCounted public: // monitor/mwait funtions - void armMonitor(Addr address) { cpu->armMonitor(address); } - bool mwait(PacketPtr pkt) { return cpu->mwait(pkt); } + void armMonitor(Addr address) { cpu->armMonitor(threadNumber, address); } + bool mwait(PacketPtr pkt) { return cpu->mwait(threadNumber, pkt); } void mwaitAtomic(ThreadContext *tc) - { return cpu->mwaitAtomic(tc, cpu->dtb); } - AddressMonitor *getAddrMonitor() { return cpu->getCpuAddrMonitor(); } + { return cpu->mwaitAtomic(threadNumber, tc, cpu->dtb); } + AddressMonitor *getAddrMonitor() + { return cpu->getCpuAddrMonitor(threadNumber); } }; template diff --git a/src/cpu/checker/cpu.hh b/src/cpu/checker/cpu.hh index a363b6d0f..69f47894b 100644 --- a/src/cpu/checker/cpu.hh +++ b/src/cpu/checker/cpu.hh @@ -350,11 +350,11 @@ class CheckerCPU : public BaseCPU, public ExecContext } // monitor/mwait funtions - virtual void armMonitor(Addr address) { BaseCPU::armMonitor(address); } - bool mwait(PacketPtr pkt) { return BaseCPU::mwait(pkt); } + virtual void armMonitor(Addr address) { BaseCPU::armMonitor(0, address); } + bool mwait(PacketPtr pkt) { return BaseCPU::mwait(0, pkt); } void mwaitAtomic(ThreadContext *tc) - { return BaseCPU::mwaitAtomic(tc, thread->dtb); } - AddressMonitor *getAddrMonitor() { return BaseCPU::getCpuAddrMonitor(); } + { return 
BaseCPU::mwaitAtomic(0, tc, thread->dtb); } + AddressMonitor *getAddrMonitor() { return BaseCPU::getCpuAddrMonitor(0); } void demapInstPage(Addr vaddr, uint64_t asn) { diff --git a/src/cpu/minor/exec_context.hh b/src/cpu/minor/exec_context.hh index 3e4ea5ea9..625d2b877 100644 --- a/src/cpu/minor/exec_context.hh +++ b/src/cpu/minor/exec_context.hh @@ -343,12 +343,12 @@ class ExecContext : public ::ExecContext public: // monitor/mwait funtions - void armMonitor(Addr address) { getCpuPtr()->armMonitor(address); } - bool mwait(PacketPtr pkt) { return getCpuPtr()->mwait(pkt); } + void armMonitor(Addr address) { getCpuPtr()->armMonitor(0, address); } + bool mwait(PacketPtr pkt) { return getCpuPtr()->mwait(0, pkt); } void mwaitAtomic(ThreadContext *tc) - { return getCpuPtr()->mwaitAtomic(tc, thread.dtb); } + { return getCpuPtr()->mwaitAtomic(0, tc, thread.dtb); } AddressMonitor *getAddrMonitor() - { return getCpuPtr()->getCpuAddrMonitor(); } + { return getCpuPtr()->getCpuAddrMonitor(0); } }; } diff --git a/src/cpu/minor/fetch1.cc b/src/cpu/minor/fetch1.cc index 81fc99d37..84aaf02f5 100644 --- a/src/cpu/minor/fetch1.cc +++ b/src/cpu/minor/fetch1.cc @@ -135,7 +135,8 @@ Fetch1::fetchLine() "%s addr: 0x%x pc: %s line_offset: %d request_size: %d\n", request_id, aligned_pc, pc, line_offset, request_size); - request->request.setThreadContext(cpu.cpuId(), /* thread id */ 0); + request->request.setThreadContext(cpu.threads[0]->getTC()->contextId(), + /* thread id */ 0); request->request.setVirt(0 /* asid */, aligned_pc, request_size, Request::INST_FETCH, cpu.instMasterId(), /* I've no idea why we need the PC, but give it */ diff --git a/src/cpu/minor/lsq.cc b/src/cpu/minor/lsq.cc index 376e8a0ff..e644951f8 100644 --- a/src/cpu/minor/lsq.cc +++ b/src/cpu/minor/lsq.cc @@ -1501,7 +1501,8 @@ LSQ::pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data, if (inst->traceData) inst->traceData->setMem(addr, size, flags); - request->request.setThreadContext(cpu.cpuId(), /* thread id */ 
0); + int cid = cpu.threads[inst->id.threadId]->getTC()->contextId(); + request->request.setThreadContext(cid, /* thread id */ 0); request->request.setVirt(0 /* asid */, addr, size, flags, cpu.dataMasterId(), /* I've no idea why we need the PC, but give it */ diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc index 026907a94..4ab004817 100644 --- a/src/cpu/o3/cpu.cc +++ b/src/cpu/o3/cpu.cc @@ -118,9 +118,10 @@ template void FullO3CPU::DcachePort::recvTimingSnoopReq(PacketPtr pkt) { - // X86 ISA: Snooping an invalidation for monitor/mwait - if(cpu->getCpuAddrMonitor()->doMonitor(pkt)) { - cpu->wakeup(); + for (ThreadID tid = 0; tid < cpu->numThreads; tid++) { + if (cpu->getCpuAddrMonitor(tid)->doMonitor(pkt)) { + cpu->wakeup(); + } } lsq->recvTimingSnoopReq(pkt); } diff --git a/src/cpu/simple/atomic.cc b/src/cpu/simple/atomic.cc index 6690c1da6..2d9da2587 100644 --- a/src/cpu/simple/atomic.cc +++ b/src/cpu/simple/atomic.cc @@ -86,9 +86,10 @@ AtomicSimpleCPU::init() { BaseSimpleCPU::init(); - ifetch_req.setThreadContext(_cpuId, 0); - data_read_req.setThreadContext(_cpuId, 0); - data_write_req.setThreadContext(_cpuId, 0); + int cid = threadContexts[0]->contextId(); + ifetch_req.setThreadContext(cid, 0); + data_read_req.setThreadContext(cid, 0); + data_write_req.setThreadContext(cid, 0); } AtomicSimpleCPU::AtomicSimpleCPU(AtomicSimpleCPUParams *p) @@ -130,6 +131,24 @@ AtomicSimpleCPU::drain() } } +void +AtomicSimpleCPU::threadSnoop(PacketPtr pkt, ThreadID sender) +{ + DPRINTF(SimpleCPU, "received snoop pkt for addr:%#x %s\n", pkt->getAddr(), + pkt->cmdString()); + + for (ThreadID tid = 0; tid < numThreads; tid++) { + if (tid != sender) { + if(getCpuAddrMonitor(tid)->doMonitor(pkt)) { + wakeup(); + } + + TheISA::handleLockedSnoop(threadInfo[tid]->thread, + pkt, dcachePort.cacheBlockMask); + } + } +} + void AtomicSimpleCPU::drainResume() { @@ -265,8 +284,11 @@ AtomicSimpleCPU::AtomicCPUDPort::recvAtomicSnoop(PacketPtr pkt) // X86 ISA: Snooping an invalidation for 
monitor/mwait AtomicSimpleCPU *cpu = (AtomicSimpleCPU *)(&owner); - if(cpu->getCpuAddrMonitor()->doMonitor(pkt)) { - cpu->wakeup(); + + for (ThreadID tid = 0; tid < cpu->numThreads; tid++) { + if (cpu->getCpuAddrMonitor(tid)->doMonitor(pkt)) { + cpu->wakeup(); + } } // if snoop invalidates, release any associated locks @@ -289,8 +311,10 @@ AtomicSimpleCPU::AtomicCPUDPort::recvFunctionalSnoop(PacketPtr pkt) // X86 ISA: Snooping an invalidation for monitor/mwait AtomicSimpleCPU *cpu = (AtomicSimpleCPU *)(&owner); - if(cpu->getCpuAddrMonitor()->doMonitor(pkt)) { - cpu->wakeup(); + for (ThreadID tid = 0; tid < cpu->numThreads; tid++) { + if(cpu->getCpuAddrMonitor(tid)->doMonitor(pkt)) { + cpu->wakeup(); + } } // if snoop invalidates, release any associated locks @@ -460,6 +484,9 @@ AtomicSimpleCPU::writeMem(uint8_t *data, unsigned size, system->getPhysMem().access(&pkt); else dcache_latency += dcachePort.sendAtomic(&pkt); + + // Notify other threads on this CPU of write + threadSnoop(&pkt, curThread); } dcache_access = true; assert(!pkt.isError()); @@ -516,9 +543,11 @@ AtomicSimpleCPU::tick() // Set memroy request ids to current thread if (numThreads > 1) { - ifetch_req.setThreadContext(_cpuId, curThread); - data_read_req.setThreadContext(_cpuId, curThread); - data_write_req.setThreadContext(_cpuId, curThread); + ContextID cid = threadContexts[curThread]->contextId(); + + ifetch_req.setThreadContext(cid, curThread); + data_read_req.setThreadContext(cid, curThread); + data_write_req.setThreadContext(cid, curThread); } SimpleExecContext& t_info = *threadInfo[curThread]; diff --git a/src/cpu/simple/atomic.hh b/src/cpu/simple/atomic.hh index 76ee9f897..2bea12ab2 100644 --- a/src/cpu/simple/atomic.hh +++ b/src/cpu/simple/atomic.hh @@ -186,6 +186,9 @@ class AtomicSimpleCPU : public BaseSimpleCPU /** Return a reference to the instruction port. */ virtual MasterPort &getInstPort() { return icachePort; } + /** Perform snoop for other cpu-local thread contexts. 
*/ + void threadSnoop(PacketPtr pkt, ThreadID sender); + public: DrainState drain() M5_ATTR_OVERRIDE; diff --git a/src/cpu/simple/base.cc b/src/cpu/simple/base.cc index 673cadd77..6e8845bf7 100644 --- a/src/cpu/simple/base.cc +++ b/src/cpu/simple/base.cc @@ -418,9 +418,8 @@ BaseSimpleCPU::dbg_vtophys(Addr addr) void BaseSimpleCPU::wakeup() { - getCpuAddrMonitor()->gotWakeup = true; - for (ThreadID tid = 0; tid < numThreads; tid++) { + getCpuAddrMonitor(tid)->gotWakeup = true; if (threadInfo[tid]->thread->status() == ThreadContext::Suspended) { DPRINTF(Quiesce,"Suspended Processor awoke\n"); threadInfo[tid]->thread->activate(); diff --git a/src/cpu/simple/exec_context.hh b/src/cpu/simple/exec_context.hh index f474cc358..591cf8227 100644 --- a/src/cpu/simple/exec_context.hh +++ b/src/cpu/simple/exec_context.hh @@ -376,22 +376,22 @@ class SimpleExecContext : public ExecContext { void armMonitor(Addr address) M5_ATTR_OVERRIDE { - cpu->armMonitor(address); + cpu->armMonitor(thread->threadId(), address); } bool mwait(PacketPtr pkt) M5_ATTR_OVERRIDE { - return cpu->mwait(pkt); + return cpu->mwait(thread->threadId(), pkt); } void mwaitAtomic(ThreadContext *tc) M5_ATTR_OVERRIDE { - cpu->mwaitAtomic(tc, thread->dtb); + cpu->mwaitAtomic(thread->threadId(), tc, thread->dtb); } AddressMonitor *getAddrMonitor() M5_ATTR_OVERRIDE { - return cpu->getCpuAddrMonitor(); + return cpu->getCpuAddrMonitor(thread->threadId()); } #if THE_ISA == MIPS_ISA diff --git a/src/cpu/simple/timing.cc b/src/cpu/simple/timing.cc index 487da36ea..f3241f7e5 100644 --- a/src/cpu/simple/timing.cc +++ b/src/cpu/simple/timing.cc @@ -302,6 +302,7 @@ TimingSimpleCPU::sendData(RequestPtr req, uint8_t *data, uint64_t *res, if (do_access) { dcache_pkt = pkt; handleWritePacket(); + threadSnoop(pkt, curThread); } else { _status = DcacheWaitResponse; completeDataAccess(pkt); @@ -538,6 +539,19 @@ TimingSimpleCPU::writeMem(uint8_t *data, unsigned size, return NoFault; } +void +TimingSimpleCPU::threadSnoop(PacketPtr 
pkt, ThreadID sender) +{ + for (ThreadID tid = 0; tid < numThreads; tid++) { + if (tid != sender) { + if(getCpuAddrMonitor(tid)->doMonitor(pkt)) { + wakeup(); + } + TheISA::handleLockedSnoop(threadInfo[tid]->thread, pkt, + dcachePort.cacheBlockMask); + } + } +} void TimingSimpleCPU::finishTranslation(WholeTranslationState *state) @@ -849,9 +863,10 @@ TimingSimpleCPU::updateCycleCounts() void TimingSimpleCPU::DcachePort::recvTimingSnoopReq(PacketPtr pkt) { - // X86 ISA: Snooping an invalidation for monitor/mwait - if(cpu->getCpuAddrMonitor()->doMonitor(pkt)) { - cpu->wakeup(); + for (ThreadID tid = 0; tid < cpu->numThreads; tid++) { + if (cpu->getCpuAddrMonitor(tid)->doMonitor(pkt)) { + cpu->wakeup(); + } } for (auto &t_info : cpu->threadInfo) { @@ -862,9 +877,10 @@ TimingSimpleCPU::DcachePort::recvTimingSnoopReq(PacketPtr pkt) void TimingSimpleCPU::DcachePort::recvFunctionalSnoop(PacketPtr pkt) { - // X86 ISA: Snooping an invalidation for monitor/mwait - if(cpu->getCpuAddrMonitor()->doMonitor(pkt)) { - cpu->wakeup(); + for (ThreadID tid = 0; tid < cpu->numThreads; tid++) { + if(cpu->getCpuAddrMonitor(tid)->doMonitor(pkt)) { + cpu->wakeup(); + } } } diff --git a/src/cpu/simple/timing.hh b/src/cpu/simple/timing.hh index d409ac5d2..f1cc09e42 100644 --- a/src/cpu/simple/timing.hh +++ b/src/cpu/simple/timing.hh @@ -132,6 +132,7 @@ class TimingSimpleCPU : public BaseSimpleCPU }; FetchTranslation fetchTranslation; + void threadSnoop(PacketPtr pkt, ThreadID sender); void sendData(RequestPtr req, uint8_t *data, uint64_t *res, bool read); void sendSplitData(RequestPtr req1, RequestPtr req2, RequestPtr req, uint8_t *data, bool read);