MOESI_hammer: break down miss latency stalled cycles

This patch tracks the number of cycles a transaction is delayed at different
points of the request-forward-response loop.
This commit is contained in:
Brad Beckmann 2010-08-20 11:46:14 -07:00
parent 8b28848321
commit f57053473a
8 changed files with 245 additions and 7 deletions

View file

@ -136,6 +136,9 @@ machine(L1Cache, "AMD Hammer-like protocol")
int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for";
bool Sharers, desc="On a GetS, did we find any other sharers in the system";
MachineID LastResponder, desc="last machine to send a response for this request";
Time InitialRequestTime, default="0", desc="time the initial requests was sent from the L1Cache";
Time ForwardRequestTime, default="0", desc="time the dir forwarded the request";
Time FirstResponseTime, default="0", desc="the time the first response was received";
}
external_type(TBETable) {
@ -424,6 +427,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
out_msg.Requestor := machineID;
out_msg.Destination.add(map_Address_to_Directory(address));
out_msg.MessageSize := MessageSizeType:Request_Control;
out_msg.InitialRequestTime := get_time();
TBEs[address].NumPendingMsgs := machineCount(MachineType:L1Cache); // One from each other cache (n-1) plus the memory (+1)
}
}
@ -435,6 +439,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
out_msg.Requestor := machineID;
out_msg.Destination.add(map_Address_to_Directory(address));
out_msg.MessageSize := MessageSizeType:Request_Control;
out_msg.InitialRequestTime := get_time();
TBEs[address].NumPendingMsgs := machineCount(MachineType:L1Cache); // One from each other cache (n-1) plus the memory (+1)
}
}
@ -454,6 +459,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
out_msg.Acks := 2;
}
out_msg.MessageSize := MessageSizeType:Response_Data;
out_msg.InitialRequestTime := in_msg.InitialRequestTime;
out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
}
}
}
@ -483,6 +490,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
out_msg.Acks := 2;
}
out_msg.MessageSize := MessageSizeType:Response_Data;
out_msg.InitialRequestTime := in_msg.InitialRequestTime;
out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
}
}
}
@ -503,6 +512,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
out_msg.Acks := 2;
}
out_msg.MessageSize := MessageSizeType:Response_Data;
out_msg.InitialRequestTime := in_msg.InitialRequestTime;
out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
}
}
}
@ -517,6 +528,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
out_msg.Acks := 1;
assert(in_msg.DirectedProbe == false);
out_msg.MessageSize := MessageSizeType:Response_Control;
out_msg.InitialRequestTime := in_msg.InitialRequestTime;
out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
}
}
}
@ -531,6 +544,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
out_msg.Acks := 1;
assert(in_msg.DirectedProbe == false);
out_msg.MessageSize := MessageSizeType:Response_Control;
out_msg.InitialRequestTime := in_msg.InitialRequestTime;
out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
}
}
}
@ -580,7 +595,10 @@ machine(L1Cache, "AMD Hammer-like protocol")
sequencer.readCallback(address,
getNondirectHitMachType(in_msg.Address, in_msg.Sender),
getCacheEntry(address).DataBlk);
getCacheEntry(address).DataBlk,
TBEs[address].InitialRequestTime,
TBEs[address].ForwardRequestTime,
TBEs[address].FirstResponseTime);
}
}
@ -605,7 +623,10 @@ machine(L1Cache, "AMD Hammer-like protocol")
sequencer.writeCallback(address,
getNondirectHitMachType(address, in_msg.Sender),
getCacheEntry(address).DataBlk);
getCacheEntry(address).DataBlk,
TBEs[address].InitialRequestTime,
TBEs[address].ForwardRequestTime,
TBEs[address].FirstResponseTime);
}
getCacheEntry(address).Dirty := true;
@ -617,7 +638,10 @@ machine(L1Cache, "AMD Hammer-like protocol")
sequencer.writeCallback(address,
getNondirectHitMachType(address,
TBEs[address].LastResponder),
getCacheEntry(address).DataBlk);
getCacheEntry(address).DataBlk,
TBEs[address].InitialRequestTime,
TBEs[address].ForwardRequestTime,
TBEs[address].FirstResponseTime);
getCacheEntry(address).Dirty := true;
}
@ -649,6 +673,21 @@ machine(L1Cache, "AMD Hammer-like protocol")
TBEs[address].NumPendingMsgs := TBEs[address].NumPendingMsgs - in_msg.Acks;
DEBUG_EXPR(TBEs[address].NumPendingMsgs);
TBEs[address].LastResponder := in_msg.Sender;
if (TBEs[address].InitialRequestTime != zero_time() && in_msg.InitialRequestTime != zero_time()) {
assert(TBEs[address].InitialRequestTime == in_msg.InitialRequestTime);
}
if (in_msg.InitialRequestTime != zero_time()) {
TBEs[address].InitialRequestTime := in_msg.InitialRequestTime;
}
if (TBEs[address].ForwardRequestTime != zero_time() && in_msg.ForwardRequestTime != zero_time()) {
assert(TBEs[address].ForwardRequestTime == in_msg.ForwardRequestTime);
}
if (in_msg.ForwardRequestTime != zero_time()) {
TBEs[address].ForwardRequestTime := in_msg.ForwardRequestTime;
}
if (TBEs[address].FirstResponseTime == zero_time()) {
TBEs[address].FirstResponseTime := get_time();
}
}
}
@ -700,6 +739,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
out_msg.Acks := 2;
}
out_msg.MessageSize := MessageSizeType:Response_Data;
out_msg.InitialRequestTime := in_msg.InitialRequestTime;
out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
}
}
}

View file

@ -645,6 +645,8 @@ machine(Directory, "AMD Hammer-like protocol")
out_msg.Destination.broadcast(MachineType:L1Cache); // Send to all L1 caches
out_msg.Destination.remove(in_msg.Requestor); // Don't include the original requestor
out_msg.MessageSize := MessageSizeType:Broadcast_Control;
out_msg.InitialRequestTime := in_msg.InitialRequestTime;
out_msg.ForwardRequestTime := get_time();
}
}
}
@ -685,6 +687,8 @@ machine(Directory, "AMD Hammer-like protocol")
out_msg.Destination.broadcast(MachineType:L1Cache); // Send to all L1 caches
out_msg.Destination.remove(in_msg.Requestor); // Don't include the original requestor
out_msg.MessageSize := MessageSizeType:Broadcast_Control;
out_msg.InitialRequestTime := in_msg.InitialRequestTime;
out_msg.ForwardRequestTime := get_time();
}
}
}
@ -701,6 +705,8 @@ machine(Directory, "AMD Hammer-like protocol")
out_msg.Destination.add(getPfEntry(address).Owner);
out_msg.MessageSize := MessageSizeType:Request_Control;
out_msg.DirectedProbe := true;
out_msg.InitialRequestTime := in_msg.InitialRequestTime;
out_msg.ForwardRequestTime := get_time();
}
}
} else {
@ -712,6 +718,8 @@ machine(Directory, "AMD Hammer-like protocol")
out_msg.Destination.broadcast(MachineType:L1Cache); // Send to all L1 caches
out_msg.Destination.remove(in_msg.Requestor); // Don't include the original requestor
out_msg.MessageSize := MessageSizeType:Broadcast_Control;
out_msg.InitialRequestTime := in_msg.InitialRequestTime;
out_msg.ForwardRequestTime := get_time();
}
}
}

View file

@ -78,6 +78,8 @@ structure(RequestMsg, desc="...", interface="NetworkMessage") {
NetDest Destination, desc="Multicast destination mask";
MessageSizeType MessageSize, desc="size category of the message";
bool DirectedProbe, default="false", desc="probe filter directed probe";
Time InitialRequestTime, default="0", desc="time the initial requests was sent from the L1Cache";
Time ForwardRequestTime, default="0", desc="time the dir forwarded the request";
}
// ResponseMsg (and also unblock requests)
@ -90,6 +92,8 @@ structure(ResponseMsg, desc="...", interface="NetworkMessage") {
bool Dirty, desc="Is the data dirty (different than memory)?";
int Acks, desc="How many messages this counts as";
MessageSizeType MessageSize, desc="size category of the message";
Time InitialRequestTime, default="0", desc="time the initial requests was sent from the L1Cache";
Time ForwardRequestTime, default="0", desc="time the dir forwarded the request";
}
enumeration(DMARequestType, desc="...", default="DMARequestType_NULL") {

View file

@ -101,8 +101,10 @@ external_type(NetDest, non_obj="yes") {
external_type(Sequencer) {
void readCallback(Address, DataBlock);
void readCallback(Address, GenericMachineType, DataBlock);
void readCallback(Address, GenericMachineType, DataBlock, Time, Time, Time);
void writeCallback(Address, DataBlock);
void writeCallback(Address, GenericMachineType, DataBlock);
void writeCallback(Address, GenericMachineType, DataBlock, Time, Time, Time);
void checkCoherence(Address);
void profileNack(Address, int, int, uint64);
}

View file

@ -286,6 +286,35 @@ Profiler::printStats(ostream& out, bool short_stats)
}
}
out << "miss_latency_wCC_issue_to_initial_request: "
<< m_wCCIssueToInitialRequestHistogram << endl;
out << "miss_latency_wCC_initial_forward_request: "
<< m_wCCInitialRequestToForwardRequestHistogram << endl;
out << "miss_latency_wCC_forward_to_first_response: "
<< m_wCCForwardRequestToFirstResponseHistogram << endl;
out << "miss_latency_wCC_first_response_to_completion: "
<< m_wCCFirstResponseToCompleteHistogram << endl;
out << "imcomplete_wCC_Times: " << m_wCCIncompleteTimes << endl;
out << "miss_latency_dir_issue_to_initial_request: "
<< m_dirIssueToInitialRequestHistogram << endl;
out << "miss_latency_dir_initial_forward_request: "
<< m_dirInitialRequestToForwardRequestHistogram << endl;
out << "miss_latency_dir_forward_to_first_response: "
<< m_dirForwardRequestToFirstResponseHistogram << endl;
out << "miss_latency_dir_first_response_to_completion: "
<< m_dirFirstResponseToCompleteHistogram << endl;
out << "imcomplete_dir_Times: " << m_dirIncompleteTimes << endl;
for (int i = 0; i < m_missMachLatencyHistograms.size(); i++) {
for (int j = 0; j < m_missMachLatencyHistograms[i].size(); j++) {
if (m_missMachLatencyHistograms[i][j].size() > 0) {
out << "miss_latency_" << RubyRequestType(i)
<< "_" << GenericMachineType(j) << ": "
<< m_missMachLatencyHistograms[i][j] << endl;
}
}
}
out << endl;
out << "All Non-Zero Cycle SW Prefetch Requests" << endl;
@ -454,7 +483,24 @@ Profiler::clearStats()
for (int i = 0; i < m_machLatencyHistograms.size(); i++) {
m_machLatencyHistograms[i].clear(200);
}
m_missMachLatencyHistograms.resize(RubyRequestType_NUM);
for (int i = 0; i < m_missLatencyHistograms.size(); i++) {
m_missMachLatencyHistograms[i].resize(GenericMachineType_NUM+1);
for (int j = 0; j < m_missMachLatencyHistograms[i].size(); j++) {
m_missMachLatencyHistograms[i][j].clear(200);
}
}
m_allMissLatencyHistogram.clear(200);
m_wCCIssueToInitialRequestHistogram.clear(200);
m_wCCInitialRequestToForwardRequestHistogram.clear(200);
m_wCCForwardRequestToFirstResponseHistogram.clear(200);
m_wCCFirstResponseToCompleteHistogram.clear(200);
m_wCCIncompleteTimes = 0;
m_dirIssueToInitialRequestHistogram.clear(200);
m_dirInitialRequestToForwardRequestHistogram.clear(200);
m_dirForwardRequestToFirstResponseHistogram.clear(200);
m_dirFirstResponseToCompleteHistogram.clear(200);
m_dirIncompleteTimes = 0;
m_SWPrefetchLatencyHistograms.resize(CacheRequestType_NUM);
for (int i = 0; i < m_SWPrefetchLatencyHistograms.size(); i++) {
@ -581,6 +627,59 @@ Profiler::missLatency(Time cycles,
m_allMissLatencyHistogram.add(cycles);
m_missLatencyHistograms[type].add(cycles);
m_machLatencyHistograms[respondingMach].add(cycles);
m_missMachLatencyHistograms[type][respondingMach].add(cycles);
}
void
Profiler::missLatencyWcc(Time issuedTime,
Time initialRequestTime,
Time forwardRequestTime,
Time firstResponseTime,
Time completionTime)
{
if ((issuedTime <= initialRequestTime) &&
(initialRequestTime <= forwardRequestTime) &&
(forwardRequestTime <= firstResponseTime) &&
(firstResponseTime <= completionTime)) {
m_wCCIssueToInitialRequestHistogram.add(initialRequestTime - issuedTime);
m_wCCInitialRequestToForwardRequestHistogram.add(forwardRequestTime -
initialRequestTime);
m_wCCForwardRequestToFirstResponseHistogram.add(firstResponseTime -
forwardRequestTime);
m_wCCFirstResponseToCompleteHistogram.add(completionTime -
firstResponseTime);
} else {
m_wCCIncompleteTimes++;
}
}
void
Profiler::missLatencyDir(Time issuedTime,
Time initialRequestTime,
Time forwardRequestTime,
Time firstResponseTime,
Time completionTime)
{
if ((issuedTime <= initialRequestTime) &&
(initialRequestTime <= forwardRequestTime) &&
(forwardRequestTime <= firstResponseTime) &&
(firstResponseTime <= completionTime)) {
m_dirIssueToInitialRequestHistogram.add(initialRequestTime - issuedTime);
m_dirInitialRequestToForwardRequestHistogram.add(forwardRequestTime -
initialRequestTime);
m_dirForwardRequestToFirstResponseHistogram.add(firstResponseTime -
forwardRequestTime);
m_dirFirstResponseToCompleteHistogram.add(completionTime -
firstResponseTime);
} else {
m_dirIncompleteTimes++;
}
}
// non-zero cycle prefetch request

View file

@ -138,6 +138,18 @@ class Profiler : public SimObject, public Consumer
RubyRequestType type,
const GenericMachineType respondingMach);
void missLatencyWcc(Time issuedTime,
Time initialRequestTime,
Time forwardRequestTime,
Time firstResponseTime,
Time completionTime);
void missLatencyDir(Time issuedTime,
Time initialRequestTime,
Time forwardRequestTime,
Time firstResponseTime,
Time completionTime);
void swPrefetchLatency(Time t,
CacheRequestType type,
const GenericMachineType respondingMach);
@ -200,6 +212,18 @@ class Profiler : public SimObject, public Consumer
std::vector<Histogram> m_missLatencyHistograms;
std::vector<Histogram> m_machLatencyHistograms;
std::vector< std::vector<Histogram> > m_missMachLatencyHistograms;
Histogram m_wCCIssueToInitialRequestHistogram;
Histogram m_wCCInitialRequestToForwardRequestHistogram;
Histogram m_wCCForwardRequestToFirstResponseHistogram;
Histogram m_wCCFirstResponseToCompleteHistogram;
int64 m_wCCIncompleteTimes;
Histogram m_dirIssueToInitialRequestHistogram;
Histogram m_dirInitialRequestToForwardRequestHistogram;
Histogram m_dirForwardRequestToFirstResponseHistogram;
Histogram m_dirFirstResponseToCompleteHistogram;
int64 m_dirIncompleteTimes;
Histogram m_allMissLatencyHistogram;
Histogram m_allSWPrefetchLatencyHistogram;

View file

@ -356,6 +356,17 @@ void
Sequencer::writeCallback(const Address& address,
GenericMachineType mach,
DataBlock& data)
{
writeCallback(address, mach, data, 0, 0, 0);
}
void
Sequencer::writeCallback(const Address& address,
GenericMachineType mach,
DataBlock& data,
Time initialRequestTime,
Time forwardRequestTime,
Time firstResponseTime)
{
assert(address == line_address(address));
assert(m_writeRequestTable.count(line_address(address)));
@ -385,7 +396,8 @@ Sequencer::writeCallback(const Address& address,
m_controller->unblock(address);
}
hitCallback(request, mach, data, success);
hitCallback(request, mach, data, success,
initialRequestTime, forwardRequestTime, firstResponseTime);
}
void
@ -398,6 +410,17 @@ void
Sequencer::readCallback(const Address& address,
GenericMachineType mach,
DataBlock& data)
{
readCallback(address, mach, data, 0, 0, 0);
}
void
Sequencer::readCallback(const Address& address,
GenericMachineType mach,
DataBlock& data,
Time initialRequestTime,
Time forwardRequestTime,
Time firstResponseTime)
{
assert(address == line_address(address));
assert(m_readRequestTable.count(line_address(address)));
@ -413,14 +436,18 @@ Sequencer::readCallback(const Address& address,
(request->ruby_request.type == RubyRequestType_RMW_Read) ||
(request->ruby_request.type == RubyRequestType_IFETCH));
hitCallback(request, mach, data, true);
hitCallback(request, mach, data, true,
initialRequestTime, forwardRequestTime, firstResponseTime);
}
void
Sequencer::hitCallback(SequencerRequest* srequest,
GenericMachineType mach,
DataBlock& data,
bool success)
bool success,
Time initialRequestTime,
Time forwardRequestTime,
Time firstResponseTime)
{
const RubyRequest & ruby_request = srequest->ruby_request;
Address request_address(ruby_request.paddr);
@ -445,6 +472,22 @@ Sequencer::hitCallback(SequencerRequest* srequest,
if (miss_latency != 0) {
g_system_ptr->getProfiler()->missLatency(miss_latency, type, mach);
if (mach == GenericMachineType_L1Cache_wCC) {
g_system_ptr->getProfiler()->missLatencyWcc(issued_time,
initialRequestTime,
forwardRequestTime,
firstResponseTime,
g_eventQueue_ptr->getTime());
}
if (mach == GenericMachineType_Directory) {
g_system_ptr->getProfiler()->missLatencyDir(issued_time,
initialRequestTime,
forwardRequestTime,
firstResponseTime,
g_eventQueue_ptr->getTime());
}
if (Debug::getProtocolTrace()) {
if (success) {
g_system_ptr->getProfiler()->

View file

@ -80,12 +80,26 @@ class Sequencer : public RubyPort, public Consumer
GenericMachineType mach,
DataBlock& data);
void writeCallback(const Address& address,
GenericMachineType mach,
DataBlock& data,
Time initialRequestTime,
Time forwardRequestTime,
Time firstResponseTime);
void readCallback(const Address& address, DataBlock& data);
void readCallback(const Address& address,
GenericMachineType mach,
DataBlock& data);
void readCallback(const Address& address,
GenericMachineType mach,
DataBlock& data,
Time initialRequestTime,
Time forwardRequestTime,
Time firstResponseTime);
RequestStatus makeRequest(const RubyRequest & request);
RequestStatus getRequestStatus(const RubyRequest& request);
bool empty() const;
@ -106,7 +120,10 @@ class Sequencer : public RubyPort, public Consumer
void hitCallback(SequencerRequest* request,
GenericMachineType mach,
DataBlock& data,
bool success);
bool success,
Time initialRequestTime,
Time forwardRequestTime,
Time firstResponseTime);
bool insertRequest(SequencerRequest* request);