From 8b28848321f301e6b13cab55e539f86a0e6c71ca Mon Sep 17 00:00:00 2001 From: Brad Beckmann Date: Fri, 20 Aug 2010 11:46:14 -0700 Subject: [PATCH] ruby: added probe filter support to hammer --- configs/ruby/MOESI_CMP_token.py | 8 +- configs/ruby/MOESI_hammer.py | 41 +- src/mem/protocol/MOESI_hammer-cache.sm | 91 +++- src/mem/protocol/MOESI_hammer-dir.sm | 557 ++++++++++++++++++++++--- src/mem/protocol/MOESI_hammer-msg.sm | 7 +- src/mem/ruby/system/Cache.py | 1 + src/mem/ruby/system/CacheMemory.cc | 5 +- src/mem/ruby/system/CacheMemory.hh | 1 + 8 files changed, 613 insertions(+), 98 deletions(-) diff --git a/configs/ruby/MOESI_CMP_token.py b/configs/ruby/MOESI_CMP_token.py index ef110d682..ba61c727a 100644 --- a/configs/ruby/MOESI_CMP_token.py +++ b/configs/ruby/MOESI_CMP_token.py @@ -81,6 +81,7 @@ def create_system(options, system, piobus, dma_devices): # Must create the individual controllers before the network to ensure the # controller constructors are called before the network constructor # + l2_bits = int(math.log(options.num_l2caches, 2)) for i in xrange(options.num_cpus): # @@ -104,9 +105,7 @@ def create_system(options, system, piobus, dma_devices): sequencer = cpu_seq, L1IcacheMemory = l1i_cache, L1DcacheMemory = l1d_cache, - l2_select_num_bits = \ - math.log(options.num_l2caches, - 2), + l2_select_num_bits = l2_bits, N_tokens = n_tokens, retry_threshold = \ options.l1_retries, @@ -129,7 +128,8 @@ def create_system(options, system, piobus, dma_devices): # First create the Ruby objects associated with this cpu # l2_cache = L2Cache(size = options.l2_size, - assoc = options.l2_assoc) + assoc = options.l2_assoc, + start_index_bit = l2_bits) l2_cntrl = L2Cache_Controller(version = i, L2cacheMemory = l2_cache, diff --git a/configs/ruby/MOESI_hammer.py b/configs/ruby/MOESI_hammer.py index 02d958b09..00908ae8b 100644 --- a/configs/ruby/MOESI_hammer.py +++ b/configs/ruby/MOESI_hammer.py @@ -27,6 +27,7 @@ # # Authors: Brad Beckmann +import math import m5 from m5.objects import * from m5.defines import buildEnv @@ -43,10 +44,18 @@ class L1Cache(RubyCache): class L2Cache(RubyCache): latency = 10 +# +# Probe filter is a cache, latency is not used +# +class ProbeFilter(RubyCache): + latency = 1 + def define_options(parser): parser.add_option("--allow-atomic-migration", action="store_true", help="allow migratory sharing for atomic only accessed blocks") - + parser.add_option("--pf-on", action="store_true", + help="Hammer: enable Probe Filter") + def create_system(options, system, piobus, dma_devices): if buildEnv['PROTOCOL'] != 'MOESI_hammer': @@ -107,6 +116,29 @@ def create_system(options, system, piobus, dma_devices): long(system.physmem.range.first) + 1 mem_module_size = phys_mem_size / options.num_dirs + # + # determine size and index bits for probe filter + # By default, the probe filter size is configured to be twice the + # size of the L2 cache. + # + pf_size = MemorySize(options.l2_size) + pf_size.value = pf_size.value * 2 + dir_bits = int(math.log(options.num_dirs, 2)) + pf_bits = int(math.log(pf_size.value, 2)) + if options.numa_high_bit: + if options.numa_high_bit > 0: + # if numa high bit explicitly set, make sure it does not overlap + # with the probe filter index + assert(options.numa_high_bit - dir_bits > pf_bits) + + # set the probe filter start bit to just above the block offset + pf_start_bit = 6 + else: + if dir_bits > 0: + pf_start_bit = dir_bits + 5 + else: + pf_start_bit = 6 + for i in xrange(options.num_dirs): # # Create the Ruby objects associated with the directory controller @@ -117,6 +149,8 @@ def create_system(options, system, piobus, dma_devices): dir_size = MemorySize('0B') dir_size.value = mem_module_size + pf = ProbeFilter(size = pf_size, assoc = 4) + dir_cntrl = Directory_Controller(version = i, directory = \ RubyDirectoryMemory( \ @@ -125,7 +159,10 @@ def create_system(options, system, piobus, dma_devices): use_map = options.use_map, map_levels = \ options.map_levels), - memBuffer = mem_cntrl) + probeFilter = pf, + memBuffer = mem_cntrl, + probe_filter_enabled = \ + options.pf_on) exec("system.dir_cntrl%d = dir_cntrl" % i) dir_cntrl_nodes.append(dir_cntrl) diff --git a/src/mem/protocol/MOESI_hammer-cache.sm b/src/mem/protocol/MOESI_hammer-cache.sm index 7b49c075c..06ce69624 100644 --- a/src/mem/protocol/MOESI_hammer-cache.sm +++ b/src/mem/protocol/MOESI_hammer-cache.sm @@ -96,6 +96,7 @@ machine(L1Cache, "AMD Hammer-like protocol") Other_GETX, desc="A GetX from another processor"; Other_GETS, desc="A GetS from another processor"; Other_GETS_No_Mig, desc="A GetS from another processor"; + Invalidate, desc="Invalidate block"; // Responses Ack, desc="Received an ack message"; @@ -292,6 +293,8 @@ machine(L1Cache, "AMD Hammer-like protocol") } else { trigger(Event:Other_GETS, in_msg.Address); } + } else if (in_msg.Type == CoherenceRequestType:INV) { + trigger(Event:Invalidate, in_msg.Address); } else if (in_msg.Type == CoherenceRequestType:WB_ACK) { trigger(Event:Writeback_Ack, in_msg.Address); } else if (in_msg.Type == CoherenceRequestType:WB_NACK) { @@ -445,7 +448,11 @@ machine(L1Cache, "AMD Hammer-like protocol") out_msg.Destination.add(in_msg.Requestor); out_msg.DataBlk := getCacheEntry(address).DataBlk; out_msg.Dirty := getCacheEntry(address).Dirty; - out_msg.Acks := 2; + if (in_msg.DirectedProbe) { + out_msg.Acks := machineCount(MachineType:L1Cache); + } else { + out_msg.Acks := 2; + } out_msg.MessageSize := MessageSizeType:Response_Data; } } @@ -470,7 +477,11 @@ machine(L1Cache, "AMD Hammer-like protocol") out_msg.Destination.add(in_msg.Requestor); out_msg.DataBlk := getCacheEntry(address).DataBlk; out_msg.Dirty := getCacheEntry(address).Dirty; - out_msg.Acks := 2; + if (in_msg.DirectedProbe) { + out_msg.Acks := machineCount(MachineType:L1Cache); + } else { + out_msg.Acks := 2; + } out_msg.MessageSize := MessageSizeType:Response_Data; } } @@ -484,8 +495,13 @@ machine(L1Cache, "AMD Hammer-like protocol") out_msg.Sender := machineID; out_msg.Destination.add(in_msg.Requestor); out_msg.DataBlk := getCacheEntry(address).DataBlk; + DEBUG_EXPR(out_msg.DataBlk); out_msg.Dirty := getCacheEntry(address).Dirty; - out_msg.Acks := 2; + if (in_msg.DirectedProbe) { + out_msg.Acks := machineCount(MachineType:L1Cache); + } else { + out_msg.Acks := 2; + } out_msg.MessageSize := MessageSizeType:Response_Data; } } @@ -499,6 +515,7 @@ machine(L1Cache, "AMD Hammer-like protocol") out_msg.Sender := machineID; out_msg.Destination.add(in_msg.Requestor); out_msg.Acks := 1; + assert(in_msg.DirectedProbe == false); out_msg.MessageSize := MessageSizeType:Response_Control; } } @@ -512,6 +529,7 @@ machine(L1Cache, "AMD Hammer-like protocol") out_msg.Sender := machineID; out_msg.Destination.add(in_msg.Requestor); out_msg.Acks := 1; + assert(in_msg.DirectedProbe == false); out_msg.MessageSize := MessageSizeType:Response_Control; } } @@ -527,6 +545,26 @@ machine(L1Cache, "AMD Hammer-like protocol") } } + action(gm_sendUnblockM, "gm", desc="Send unblock to memory and indicate M/O/E state") { + enqueue(unblockNetwork_out, ResponseMsg, latency=cache_response_latency) { + out_msg.Address := address; + out_msg.Type := CoherenceResponseType:UNBLOCKM; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + } + } + + action(gs_sendUnblockS, "gs", desc="Send unblock to memory and indicate S state") { + enqueue(unblockNetwork_out, ResponseMsg, latency=cache_response_latency) { + out_msg.Address := address; + out_msg.Type := CoherenceResponseType:UNBLOCKS; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + } + } + action(h_load_hit, "h", desc="Notify sequencer the load completed.") { DEBUG_EXPR(getCacheEntry(address).DataBlk); @@ -653,9 +691,14 @@ machine(L1Cache, "AMD Hammer-like protocol") out_msg.Type := CoherenceResponseType:DATA; out_msg.Sender := machineID; out_msg.Destination.add(in_msg.Requestor); + DEBUG_EXPR(out_msg.Destination); out_msg.DataBlk := TBEs[address].DataBlk; out_msg.Dirty := TBEs[address].Dirty; - out_msg.Acks := 2; + if (in_msg.DirectedProbe) { + out_msg.Acks := machineCount(MachineType:L1Cache); + } else { + out_msg.Acks := 2; + } out_msg.MessageSize := MessageSizeType:Response_Data; } } @@ -719,9 +762,11 @@ machine(L1Cache, "AMD Hammer-like protocol") action(v_writeDataToCacheVerify, "v", desc="Write data to cache, assert it was same as before") { peek(responseToCache_in, ResponseMsg) { + DEBUG_EXPR(getCacheEntry(address).DataBlk); + DEBUG_EXPR(in_msg.DataBlk); assert(getCacheEntry(address).DataBlk == in_msg.DataBlk); getCacheEntry(address).DataBlk := in_msg.DataBlk; - getCacheEntry(address).Dirty := in_msg.Dirty; + getCacheEntry(address).Dirty := in_msg.Dirty || getCacheEntry(address).Dirty; } } @@ -813,7 +858,7 @@ machine(L1Cache, "AMD Hammer-like protocol") zz_recycleMandatoryQueue; } - transition({IT, ST, OT, MT, MMT}, {Other_GETX, Other_GETS, Other_GETS_No_Mig}) { + transition({IT, ST, OT, MT, MMT}, {Other_GETX, Other_GETS, Other_GETS_No_Mig, Invalidate}) { // stall } @@ -963,7 +1008,7 @@ machine(L1Cache, "AMD Hammer-like protocol") rr_deallocateL2CacheBlock; } - transition(I, {Other_GETX, Other_GETS, Other_GETS_No_Mig}) { + transition(I, {Other_GETX, Other_GETS, Other_GETS_No_Mig, Invalidate}) { f_sendAck; l_popForwardQueue; } @@ -985,7 +1030,7 @@ machine(L1Cache, "AMD Hammer-like protocol") rr_deallocateL2CacheBlock; } - transition(S, Other_GETX, I) { + transition(S, {Other_GETX, Invalidate}, I) { f_sendAck; l_popForwardQueue; } @@ -1015,7 +1060,7 @@ machine(L1Cache, "AMD Hammer-like protocol") rr_deallocateL2CacheBlock; } - transition(O, Other_GETX, I) { + transition(O, {Other_GETX, Invalidate}, I) { e_sendData; l_popForwardQueue; } @@ -1042,7 +1087,7 @@ machine(L1Cache, "AMD Hammer-like protocol") rr_deallocateL2CacheBlock; } - transition(MM, Other_GETX, I) { + transition(MM, {Other_GETX, Invalidate}, I) { c_sendExclusiveData; l_popForwardQueue; } @@ -1074,7 +1119,7 @@ machine(L1Cache, "AMD Hammer-like protocol") rr_deallocateL2CacheBlock; } - transition(M, Other_GETX, I) { + transition(M, {Other_GETX, Invalidate}, I) { c_sendExclusiveData; l_popForwardQueue; } @@ -1086,7 +1131,7 @@ machine(L1Cache, "AMD Hammer-like protocol") // Transitions from IM - transition(IM, {Other_GETX, Other_GETS, Other_GETS_No_Mig}) { + transition(IM, {Other_GETX, Other_GETS, Other_GETS_No_Mig, Invalidate}) { f_sendAck; l_popForwardQueue; } @@ -1118,7 +1163,7 @@ machine(L1Cache, "AMD Hammer-like protocol") l_popForwardQueue; } - transition(SM, Other_GETX, IM) { + transition(SM, {Other_GETX, Invalidate}, IM) { f_sendAck; l_popForwardQueue; } @@ -1145,14 +1190,14 @@ machine(L1Cache, "AMD Hammer-like protocol") transition(ISM, All_acks_no_sharers, MM) { sxt_trig_ext_store_hit; - g_sendUnblock; + gm_sendUnblockM; s_deallocateTBE; j_popTriggerQueue; } // Transitions from OM - transition(OM, Other_GETX, IM) { + transition(OM, {Other_GETX, Invalidate}, IM) { e_sendData; pp_incrementNumberOfMessagesByOne; l_popForwardQueue; @@ -1171,14 +1216,14 @@ machine(L1Cache, "AMD Hammer-like protocol") transition(OM, {All_acks, All_acks_no_sharers}, MM) { sxt_trig_ext_store_hit; - g_sendUnblock; + gm_sendUnblockM; s_deallocateTBE; j_popTriggerQueue; } // Transitions from IS - transition(IS, {Other_GETX, Other_GETS, Other_GETS_No_Mig}) { + transition(IS, {Other_GETX, Other_GETS, Other_GETS_No_Mig, Invalidate}) { f_sendAck; l_popForwardQueue; } @@ -1237,14 +1282,14 @@ machine(L1Cache, "AMD Hammer-like protocol") } transition(SS, All_acks, S) { - g_sendUnblock; + gs_sendUnblockS; s_deallocateTBE; j_popTriggerQueue; } transition(SS, All_acks_no_sharers, S) { // Note: The directory might still be the owner, so that is why we go to S - g_sendUnblock; + gs_sendUnblockS; s_deallocateTBE; j_popTriggerQueue; } @@ -1263,7 +1308,7 @@ machine(L1Cache, "AMD Hammer-like protocol") } transition(MM_W, All_acks_no_sharers, MM) { - g_sendUnblock; + gm_sendUnblockM; s_deallocateTBE; j_popTriggerQueue; } @@ -1282,14 +1327,14 @@ machine(L1Cache, "AMD Hammer-like protocol") } transition(M_W, All_acks_no_sharers, M) { - g_sendUnblock; + gm_sendUnblockM; s_deallocateTBE; j_popTriggerQueue; } // Transitions from OI/MI - transition({OI, MI}, Other_GETX, II) { + transition({OI, MI}, {Other_GETX, Invalidate}, II) { q_sendDataFromTBEToCache; l_popForwardQueue; } @@ -1312,7 +1357,7 @@ machine(L1Cache, "AMD Hammer-like protocol") } // Transitions from II - transition(II, {Other_GETS, Other_GETS_No_Mig, Other_GETX}, II) { + transition(II, {Other_GETS, Other_GETS_No_Mig, Other_GETX, Invalidate}, II) { f_sendAck; l_popForwardQueue; } diff --git a/src/mem/protocol/MOESI_hammer-dir.sm b/src/mem/protocol/MOESI_hammer-dir.sm index d967c813c..ae282ba3a 100644 --- a/src/mem/protocol/MOESI_hammer-dir.sm +++ b/src/mem/protocol/MOESI_hammer-dir.sm @@ -35,8 +35,10 @@ machine(Directory, "AMD Hammer-like protocol") : DirectoryMemory * directory, + CacheMemory * probeFilter, MemoryControl * memBuffer, - int memory_controller_latency = 2 + int memory_controller_latency = 2, + bool probe_filter_enabled = false { MessageBuffer forwardFromDir, network="To", virtual_network="3", ordered="false"; @@ -56,9 +58,16 @@ machine(Directory, "AMD Hammer-like protocol") // STATES enumeration(State, desc="Directory states", default="Directory_State_E") { // Base states - NO, desc="Not Owner"; - O, desc="Owner"; - E, desc="Exclusive Owner (we can provide the data in exclusive)"; + NX, desc="Not Owner, probe filter entry exists, block in O at Owner"; + NO, desc="Not Owner, probe filter entry exists, block in E/M at Owner"; + S, desc="Data clean, probe filter entry exists pointing to the current owner"; + O, desc="Data clean, probe filter entry exists"; + E, desc="Exclusive Owner, no probe filter entry"; + + O_R, desc="Was data Owner, replacing probe filter entry"; + S_R, desc="Was Not Owner or Sharer, replacing probe filter entry"; + NO_R, desc="Was Not Owner or Sharer, replacing probe filter entry"; + NO_B, "NO^B", desc="Not Owner, Blocked"; O_B, "O^B", desc="Owner, Blocked"; NO_B_W, desc="Not Owner, Blocked, waiting for Dram"; @@ -83,11 +92,16 @@ machine(Directory, "AMD Hammer-like protocol") GETS, desc="A GETS arrives"; PUT, desc="A PUT arrives"; Unblock, desc="An unblock message arrives"; + UnblockS, desc="An unblock message arrives"; + UnblockM, desc="An unblock message arrives"; Writeback_Clean, desc="The final part of a PutX (no data)"; Writeback_Dirty, desc="The final part of a PutX (data)"; Writeback_Exclusive_Clean, desc="The final part of a PutX (no data, exclusive)"; Writeback_Exclusive_Dirty, desc="The final part of a PutX (data, exclusive)"; + // Probe filter + Pf_Replacement, desc="probe filter replacement"; + // DMA requests DMA_READ, desc="A DMA Read memory request"; DMA_WRITE, desc="A DMA Write memory request"; @@ -100,10 +114,12 @@ machine(Directory, "AMD Hammer-like protocol") Ack, desc="Received an ack message"; Shared_Ack, desc="Received an ack message, responder has a shared copy"; Shared_Data, desc="Received a data message, responder has a shared copy"; + Data, desc="Received a data message, responder had a owner or exclusive copy, they gave it to us"; Exclusive_Data, desc="Received a data message, responder had an exclusive copy, they gave it to us"; // Triggers - All_acks_and_data, desc="Received all required data and message acks"; + All_acks_and_shared_data, desc="Received shared data and message acks"; + All_acks_and_owner_data, desc="Received shared data and message acks"; All_acks_and_data_no_sharers, desc="Received all acks and no other processor has a shared copy"; } @@ -115,18 +131,27 @@ machine(Directory, "AMD Hammer-like protocol") DataBlock DataBlk, desc="data for the block"; } + // ProbeFilterEntry + structure(PfEntry, desc="...", interface="AbstractCacheEntry") { + State PfState, desc="Directory state"; + MachineID Owner, desc="Owner node"; + DataBlock DataBlk, desc="data for the block"; + } + // TBE entries for DMA requests structure(TBE, desc="TBE entries for outstanding DMA requests") { Address PhysicalAddress, desc="physical address"; State TBEState, desc="Transient State"; CoherenceResponseType ResponseType, desc="The type for the subsequent response message"; + int Acks, default="0", desc="The number of acks that the waiting response represents"; DataBlock DmaDataBlk, desc="DMA Data to be written. Partial blocks need to merged with system memory"; DataBlock DataBlk, desc="The current view of system memory"; int Len, desc="..."; MachineID DmaRequestor, desc="DMA requestor"; int NumPendingMsgs, desc="Number of pending acks/messages"; - bool CacheDirty, desc="Indicates whether a cache has responded with dirty data"; - bool Sharers, desc="Indicates whether a cache has indicated it is currently a sharer"; + bool CacheDirty, default="false", desc="Indicates whether a cache has responded with dirty data"; + bool Sharers, default="false", desc="Indicates whether a cache has indicated it is currently a sharer"; + bool Owned, default="false", desc="Indicates whether a cache has indicated it is currently a sharer"; } external_type(TBETable) { @@ -144,10 +169,21 @@ machine(Directory, "AMD Hammer-like protocol") return static_cast(Entry, directory[addr]); } + PfEntry getPfEntry(Address addr), return_by_ref="yes" { + return static_cast(PfEntry, probeFilter[addr]); + } + State getState(Address addr) { if (TBEs.isPresent(addr)) { return TBEs[addr].TBEState; } else { + if (probe_filter_enabled) { + if (probeFilter.isTagPresent(addr)) { + assert(getPfEntry(addr).PfState == getDirectoryEntry(addr).DirectoryState); + } else { + assert(getDirectoryEntry(addr).DirectoryState == State:E); + } + } return getDirectoryEntry(addr).DirectoryState; } } @@ -156,9 +192,31 @@ machine(Directory, "AMD Hammer-like protocol") if (TBEs.isPresent(addr)) { TBEs[addr].TBEState := state; } + if (probe_filter_enabled) { + if (probeFilter.isTagPresent(addr)) { + getPfEntry(addr).PfState := state; + } + if (state == State:NX || state == State:NO || state == State:S || state == State:O) { + assert(probeFilter.isTagPresent(addr)); + } + } + if (state == State:E || state == State:NX || state == State:NO || state == State:S || + state == State:O) { + assert(TBEs.isPresent(addr) == false); + } getDirectoryEntry(addr).DirectoryState := state; } + Event cache_request_to_event(CoherenceRequestType type) { + if (type == CoherenceRequestType:GETS) { + return Event:GETS; + } else if (type == CoherenceRequestType:GETX) { + return Event:GETX; + } else { + error("Invalid CoherenceRequestType"); + } + } + MessageBuffer triggerQueue, ordered="true"; // ** OUT_PORTS ** @@ -180,7 +238,9 @@ machine(Directory, "AMD Hammer-like protocol") if (triggerQueue_in.isReady()) { peek(triggerQueue_in, TriggerMsg) { if (in_msg.Type == TriggerType:ALL_ACKS) { - trigger(Event:All_acks_and_data, in_msg.Address); + trigger(Event:All_acks_and_owner_data, in_msg.Address); + } else if (in_msg.Type == TriggerType:ALL_ACKS_OWNER_EXISTS) { + trigger(Event:All_acks_and_shared_data, in_msg.Address); } else if (in_msg.Type == TriggerType:ALL_ACKS_NO_SHARERS) { trigger(Event:All_acks_and_data_no_sharers, in_msg.Address); } else { @@ -195,6 +255,10 @@ machine(Directory, "AMD Hammer-like protocol") peek(unblockNetwork_in, ResponseMsg) { if (in_msg.Type == CoherenceResponseType:UNBLOCK) { trigger(Event:Unblock, in_msg.Address); + } else if (in_msg.Type == CoherenceResponseType:UNBLOCKS) { + trigger(Event:UnblockS, in_msg.Address); + } else if (in_msg.Type == CoherenceResponseType:UNBLOCKM) { + trigger(Event:UnblockM, in_msg.Address); } else if (in_msg.Type == CoherenceResponseType:WB_CLEAN) { trigger(Event:Writeback_Clean, in_msg.Address); } else if (in_msg.Type == CoherenceResponseType:WB_DIRTY) { @@ -220,8 +284,9 @@ machine(Directory, "AMD Hammer-like protocol") trigger(Event:Shared_Ack, in_msg.Address); } else if (in_msg.Type == CoherenceResponseType:DATA_SHARED) { trigger(Event:Shared_Data, in_msg.Address); - } else if (in_msg.Type == CoherenceResponseType:DATA_EXCLUSIVE || - in_msg.Type == CoherenceResponseType:DATA) { + } else if (in_msg.Type == CoherenceResponseType:DATA) { + trigger(Event:Data, in_msg.Address); + } else if (in_msg.Type == CoherenceResponseType:DATA_EXCLUSIVE) { trigger(Event:Exclusive_Data, in_msg.Address); } else { error("Unexpected message"); @@ -247,14 +312,22 @@ machine(Directory, "AMD Hammer-like protocol") in_port(requestQueue_in, RequestMsg, requestToDir) { if (requestQueue_in.isReady()) { peek(requestQueue_in, RequestMsg) { - if (in_msg.Type == CoherenceRequestType:GETS) { - trigger(Event:GETS, in_msg.Address); - } else if (in_msg.Type == CoherenceRequestType:GETX) { - trigger(Event:GETX, in_msg.Address); - } else if (in_msg.Type == CoherenceRequestType:PUT) { + if (in_msg.Type == CoherenceRequestType:PUT) { trigger(Event:PUT, in_msg.Address); } else { - error("Invalid message"); + if (probe_filter_enabled) { + if (probeFilter.isTagPresent(in_msg.Address)) { + trigger(cache_request_to_event(in_msg.Type), in_msg.Address); + } else { + if (probeFilter.cacheAvail(in_msg.Address)) { + trigger(cache_request_to_event(in_msg.Type), in_msg.Address); + } else { + trigger(Event:Pf_Replacement, probeFilter.cacheProbe(in_msg.Address)); + } + } + } else { + trigger(cache_request_to_event(in_msg.Type), in_msg.Address); + } } } } @@ -278,6 +351,31 @@ machine(Directory, "AMD Hammer-like protocol") // Actions + action(r_setMRU, "\rr", desc="manually set the MRU bit for pf entry" ) { + if (probe_filter_enabled) { + assert(probeFilter.isTagPresent(address)); + probeFilter.setMRU(address); + } + } + + action(auno_assertUnblockerNotOwner, "auno", desc="assert unblocker not owner") { + if (probe_filter_enabled) { + assert(probeFilter.isTagPresent(address)); + peek(unblockNetwork_in, ResponseMsg) { + assert(getPfEntry(address).Owner != in_msg.Sender); + } + } + } + + action(uo_updateOwnerIfPf, "uo", desc="update owner") { + if (probe_filter_enabled) { + assert(probeFilter.isTagPresent(address)); + peek(unblockNetwork_in, ResponseMsg) { + getPfEntry(address).Owner := in_msg.Sender; + } + } + } + action(a_sendWriteBackAck, "a", desc="Send writeback ack to requestor") { peek(requestQueue_in, RequestMsg) { enqueue(forwardNetwork_out, RequestMsg, latency=memory_controller_latency) { @@ -302,6 +400,27 @@ machine(Directory, "AMD Hammer-like protocol") } } + action(pfa_probeFilterAllocate, "pfa", desc="Allocate ProbeFilterEntry") { + if (probe_filter_enabled) { + peek(requestQueue_in, RequestMsg) { + probeFilter.allocate(address, new PfEntry); + getPfEntry(in_msg.Address).Owner := in_msg.Requestor; + } + } + } + + action(pfd_probeFilterDeallocate, "pfd", desc="Deallocate ProbeFilterEntry") { + if (probe_filter_enabled) { + probeFilter.deallocate(address); + } + } + + action(ppfd_possibleProbeFilterDeallocate, "ppfd", desc="Deallocate ProbeFilterEntry") { + if (probe_filter_enabled && probeFilter.isTagPresent(address)) { + probeFilter.deallocate(address); + } + } + action(v_allocateTBE, "v", desc="Allocate TBE") { peek(requestQueue_in, RequestMsg) { TBEs.allocate(address); @@ -330,10 +449,30 @@ machine(Directory, "AMD Hammer-like protocol") } } + action(pa_setPendingMsgsToAll, "pa", desc="set pending msgs to all") { + TBEs[address].NumPendingMsgs := machineCount(MachineType:L1Cache); + } + + action(po_setPendingMsgsToOne, "po", desc="set pending msgs to one") { + TBEs[address].NumPendingMsgs := 1; + } + action(w_deallocateTBE, "w", desc="Deallocate TBE") { TBEs.deallocate(address); } + action(sa_setAcksToOne, "sa", desc="Forwarded request, set the ack amount to one") { + TBEs[address].Acks := 1; + } + + action(saa_setAcksToAllIfPF, "saa", desc="Non-forwarded request, set the ack amount to all") { + if (probe_filter_enabled) { + TBEs[address].Acks := machineCount(MachineType:L1Cache); + } else { + TBEs[address].Acks := 1; + } + } + action(m_decrementNumberOfMessages, "m", desc="Decrement the number of messages for which we're waiting") { peek(responseToDir_in, ResponseMsg) { assert(in_msg.Acks > 0); @@ -357,7 +496,11 @@ machine(Directory, "AMD Hammer-like protocol") enqueue(triggerQueue_out, TriggerMsg) { out_msg.Address := address; if (TBEs[address].Sharers) { - out_msg.Type := TriggerType:ALL_ACKS; + if (TBEs[address].Owned) { + out_msg.Type := TriggerType:ALL_ACKS_OWNER_EXISTS; + } else { + out_msg.Type := TriggerType:ALL_ACKS; + } } else { out_msg.Type := TriggerType:ALL_ACKS_NO_SHARERS; } @@ -365,6 +508,22 @@ machine(Directory, "AMD Hammer-like protocol") } } + action(spa_setPendingAcksToZeroIfPF, "spa", desc="if probe filter, no need to wait for acks") { + if (probe_filter_enabled) { + TBEs[address].NumPendingMsgs := 0; + } + } + + action(sc_signalCompletionIfPF, "sc", desc="indicate that we should skip waiting for cpu acks") { + if (TBEs[address].NumPendingMsgs == 0) { + assert(probe_filter_enabled); + enqueue(triggerQueue_out, TriggerMsg) { + out_msg.Address := address; + out_msg.Type := TriggerType:ALL_ACKS_NO_SHARERS; + } + } + } + action(d_sendData, "d", desc="Send data to requestor") { peek(memQueue_in, MemoryMsg) { enqueue(responseNetwork_out, ResponseMsg, latency="1") { @@ -373,8 +532,11 @@ machine(Directory, "AMD Hammer-like protocol") out_msg.Sender := machineID; out_msg.Destination.add(in_msg.OriginalRequestorMachId); out_msg.DataBlk := in_msg.DataBlk; + DEBUG_EXPR(out_msg.DataBlk); out_msg.Dirty := false; // By definition, the block is now clean - out_msg.Acks := 1; + out_msg.Acks := TBEs[address].Acks; + DEBUG_EXPR(out_msg.Acks); + assert(out_msg.Acks > 0); out_msg.MessageSize := MessageSizeType:Response_Data; } } @@ -440,6 +602,11 @@ machine(Directory, "AMD Hammer-like protocol") TBEs[address].Sharers := true; } + action(so_setOwnerBit, "so", desc="We saw other sharers") { + TBEs[address].Sharers := true; + TBEs[address].Owned := true; + } + action(qf_queueMemoryFetchRequest, "qf", desc="Queue off-chip fetch request") { peek(requestQueue_in, RequestMsg) { enqueue(memQueue_out, MemoryMsg, latency="1") { @@ -468,7 +635,47 @@ machine(Directory, "AMD Hammer-like protocol") } } - action(f_forwardRequest, "f", desc="Forward requests") { + action(fn_forwardRequestIfNecessary, "fn", desc="Forward requests if necessary") { + if ((machineCount(MachineType:L1Cache) > 1) && (TBEs[address].Acks <= 1)) { + peek(requestQueue_in, RequestMsg) { + enqueue(forwardNetwork_out, RequestMsg, latency=memory_controller_latency) { + out_msg.Address := address; + out_msg.Type := in_msg.Type; + out_msg.Requestor := in_msg.Requestor; + out_msg.Destination.broadcast(MachineType:L1Cache); // Send to all L1 caches + out_msg.Destination.remove(in_msg.Requestor); // Don't include the original requestor + out_msg.MessageSize := MessageSizeType:Broadcast_Control; + } + } + } + } + + action(ia_invalidateAllRequest, "ia", desc="invalidate all copies") { + if (machineCount(MachineType:L1Cache) > 1) { + enqueue(forwardNetwork_out, RequestMsg, latency=memory_controller_latency) { + out_msg.Address := address; + out_msg.Type := CoherenceRequestType:INV; + out_msg.Requestor := machineID; + out_msg.Destination.broadcast(MachineType:L1Cache); // Send to all L1 caches + out_msg.MessageSize := MessageSizeType:Broadcast_Control; + } + } + } + + action(io_invalidateOwnerRequest, "io", desc="invalidate all copies") { + if (machineCount(MachineType:L1Cache) > 1) { + enqueue(forwardNetwork_out, RequestMsg, latency=memory_controller_latency) { + out_msg.Address := address; + out_msg.Type := CoherenceRequestType:INV; + out_msg.Requestor := machineID; + out_msg.Destination.add(getPfEntry(address).Owner); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.DirectedProbe := true; + } + } + } + + action(fb_forwardRequestBcast, "fb", desc="Forward requests to all nodes") { if (machineCount(MachineType:L1Cache) > 1) { peek(requestQueue_in, RequestMsg) { enqueue(forwardNetwork_out, RequestMsg, latency=memory_controller_latency) { @@ -483,34 +690,65 @@ machine(Directory, "AMD Hammer-like protocol") } } + action(fc_forwardRequestConditionalOwner, "fc", desc="Forward request to one or more nodes") { + assert(machineCount(MachineType:L1Cache) > 1); + if (probe_filter_enabled) { + peek(requestQueue_in, RequestMsg) { + enqueue(forwardNetwork_out, RequestMsg, latency=memory_controller_latency) { + out_msg.Address := address; + out_msg.Type := in_msg.Type; + out_msg.Requestor := in_msg.Requestor; + out_msg.Destination.add(getPfEntry(address).Owner); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.DirectedProbe := true; + } + } + } else { + peek(requestQueue_in, RequestMsg) { + enqueue(forwardNetwork_out, RequestMsg, latency=memory_controller_latency) { + out_msg.Address := address; + out_msg.Type := in_msg.Type; + out_msg.Requestor := in_msg.Requestor; + out_msg.Destination.broadcast(MachineType:L1Cache); // Send to all L1 caches + out_msg.Destination.remove(in_msg.Requestor); // Don't include the original requestor + out_msg.MessageSize := MessageSizeType:Broadcast_Control; + } + } + } + } + action(f_forwardWriteFromDma, "fw", desc="Forward requests") { - peek(dmaRequestQueue_in, DMARequestMsg) { - enqueue(forwardNetwork_out, RequestMsg, latency=memory_controller_latency) { - out_msg.Address := address; - out_msg.Type := CoherenceRequestType:GETX; - // - // Send to all L1 caches, since the requestor is the memory controller - // itself - // - out_msg.Requestor := machineID; - out_msg.Destination.broadcast(MachineType:L1Cache); - out_msg.MessageSize := MessageSizeType:Broadcast_Control; + if (TBEs[address].NumPendingMsgs > 0) { + peek(dmaRequestQueue_in, DMARequestMsg) { + enqueue(forwardNetwork_out, RequestMsg, latency=memory_controller_latency) { + out_msg.Address := address; + out_msg.Type := CoherenceRequestType:GETX; + // + // Send to all L1 caches, since the requestor is the memory controller + // itself + // + out_msg.Requestor := machineID; + out_msg.Destination.broadcast(MachineType:L1Cache); + out_msg.MessageSize := MessageSizeType:Broadcast_Control; + } } } } action(f_forwardReadFromDma, "fr", desc="Forward requests") { - peek(dmaRequestQueue_in, DMARequestMsg) { - enqueue(forwardNetwork_out, RequestMsg, latency=memory_controller_latency) { - out_msg.Address := address; - out_msg.Type := CoherenceRequestType:GETS; - // - // Send to all L1 caches, since the requestor is the memory controller - // itself - // - out_msg.Requestor := machineID; - out_msg.Destination.broadcast(MachineType:L1Cache); - out_msg.MessageSize := MessageSizeType:Broadcast_Control; + if (TBEs[address].NumPendingMsgs > 0) { + peek(dmaRequestQueue_in, DMARequestMsg) { + enqueue(forwardNetwork_out, RequestMsg, latency=memory_controller_latency) { + out_msg.Address := address; + out_msg.Type := CoherenceRequestType:GETS; + // + // Send to all L1 caches, since the requestor is the memory controller + // itself + // + out_msg.Requestor := machineID; + out_msg.Destination.broadcast(MachineType:L1Cache); + out_msg.MessageSize := MessageSizeType:Broadcast_Control; + } } } } @@ -554,6 +792,14 @@ machine(Directory, "AMD Hammer-like protocol") } } + action(wr_writeResponseDataToMemory, "wr", desc="Write response data to memory") { + peek(responseToDir_in, ResponseMsg) { + getDirectoryEntry(address).DataBlk := in_msg.DataBlk; + DEBUG_EXPR(in_msg.Address); + DEBUG_EXPR(in_msg.DataBlk); + } + } + action(l_writeDataToMemory, "l", desc="Write PUTX/PUTO data to memory") { peek(unblockNetwork_in, ResponseMsg) { assert(in_msg.Dirty); @@ -565,14 +811,31 @@ machine(Directory, "AMD Hammer-like protocol") } action(dwt_writeDmaDataFromTBE, "dwt", desc="DMA Write data to memory from TBE") { + DEBUG_EXPR(getDirectoryEntry(address).DataBlk); getDirectoryEntry(address).DataBlk := TBEs[address].DataBlk; + DEBUG_EXPR(getDirectoryEntry(address).DataBlk); getDirectoryEntry(address).DataBlk.copyPartial(TBEs[address].DmaDataBlk, addressOffset(TBEs[address].PhysicalAddress), TBEs[address].Len); + DEBUG_EXPR(getDirectoryEntry(address).DataBlk); + } + + action(wdt_writeDataFromTBE, "wdt", desc="DMA Write data to memory from TBE") { + DEBUG_EXPR(getDirectoryEntry(address).DataBlk); + getDirectoryEntry(address).DataBlk := TBEs[address].DataBlk; + DEBUG_EXPR(getDirectoryEntry(address).DataBlk); } action(a_assertCacheData, "ac", desc="Assert that a cache provided the data") { assert(TBEs[address].CacheDirty); } + action(ano_assertNotOwner, "ano", desc="Assert that request is not current owner") { + if (probe_filter_enabled) { + peek(requestQueue_in, RequestMsg) { + assert(getPfEntry(address).Owner != in_msg.Requestor); + } + } + } + action(l_queueMemoryWBRequest, "lq", desc="Write PUTX data to memory") { peek(unblockNetwork_in, ResponseMsg) { enqueue(memQueue_out, MemoryMsg, latency="1") { @@ -616,75 +879,152 @@ machine(Directory, "AMD Hammer-like protocol") // Transitions out of E state transition(E, GETX, NO_B_W) { + pfa_probeFilterAllocate; v_allocateTBE; rx_recordExclusiveInTBE; + saa_setAcksToAllIfPF; qf_queueMemoryFetchRequest; - f_forwardRequest; + fn_forwardRequestIfNecessary; i_popIncomingRequestQueue; } transition(E, GETS, NO_B_W) { + pfa_probeFilterAllocate; v_allocateTBE; rx_recordExclusiveInTBE; + saa_setAcksToAllIfPF; qf_queueMemoryFetchRequest; - f_forwardRequest; + fn_forwardRequestIfNecessary; i_popIncomingRequestQueue; } transition(E, DMA_READ, NO_DR_B_W) { vd_allocateDmaRequestInTBE; qd_queueMemoryRequestFromDmaRead; + spa_setPendingAcksToZeroIfPF; f_forwardReadFromDma; p_popDmaRequestQueue; } + transition(E, DMA_WRITE, NO_DW_B_W) { + vd_allocateDmaRequestInTBE; + spa_setPendingAcksToZeroIfPF; + sc_signalCompletionIfPF; + f_forwardWriteFromDma; + p_popDmaRequestQueue; + } + // Transitions out of O state transition(O, GETX, NO_B_W) { + r_setMRU; v_allocateTBE; r_recordDataInTBE; + sa_setAcksToOne; qf_queueMemoryFetchRequest; - f_forwardRequest; + fb_forwardRequestBcast; i_popIncomingRequestQueue; } + // This transition is dumb, if a shared copy exists on-chip, then that should + // provide data, not slow off-chip dram. The problem is that the current + // caches don't provide data in S state transition(O, GETS, O_B_W) { + r_setMRU; v_allocateTBE; r_recordDataInTBE; + saa_setAcksToAllIfPF; qf_queueMemoryFetchRequest; - f_forwardRequest; + fn_forwardRequestIfNecessary; i_popIncomingRequestQueue; } transition(O, DMA_READ, O_DR_B_W) { vd_allocateDmaRequestInTBE; + spa_setPendingAcksToZeroIfPF; qd_queueMemoryRequestFromDmaRead; f_forwardReadFromDma; p_popDmaRequestQueue; } - transition({E, O, NO}, DMA_WRITE, NO_DW_B_W) { + transition(O, Pf_Replacement, O_R) { + v_allocateTBE; + pa_setPendingMsgsToAll; + ia_invalidateAllRequest; + pfd_probeFilterDeallocate; + } + + transition(S, Pf_Replacement, S_R) { + v_allocateTBE; + pa_setPendingMsgsToAll; + ia_invalidateAllRequest; + pfd_probeFilterDeallocate; + } + + transition(NO, Pf_Replacement, NO_R) { + v_allocateTBE; + po_setPendingMsgsToOne; + io_invalidateOwnerRequest; + pfd_probeFilterDeallocate; + } + + transition(NX, Pf_Replacement, NO_R) { + v_allocateTBE; + pa_setPendingMsgsToAll; + ia_invalidateAllRequest; + pfd_probeFilterDeallocate; + } + + transition({O, S, NO, NX}, DMA_WRITE, NO_DW_B_W) { vd_allocateDmaRequestInTBE; f_forwardWriteFromDma; p_popDmaRequestQueue; } + // Transitions out of NO state + transition(NX, GETX, NO_B) { + r_setMRU; + fb_forwardRequestBcast; + i_popIncomingRequestQueue; + } + // Transitions out of NO state transition(NO, GETX, NO_B) { - f_forwardRequest; + r_setMRU; + ano_assertNotOwner; + fc_forwardRequestConditionalOwner; i_popIncomingRequestQueue; } - transition(NO, GETS, NO_B) { - f_forwardRequest; + transition(S, GETX, NO_B) { + r_setMRU; + fb_forwardRequestBcast; i_popIncomingRequestQueue; } - transition(NO, PUT, WB) { + transition(S, GETS, NO_B) { + r_setMRU; + ano_assertNotOwner; + fb_forwardRequestBcast; + i_popIncomingRequestQueue; + } + + transition({NX, NO}, GETS, NO_B) { + r_setMRU; + ano_assertNotOwner; + fc_forwardRequestConditionalOwner; + i_popIncomingRequestQueue; + } + + transition({NO, NX, S}, PUT, WB) { + // + // note that the PUT requestor may not be the current owner if an invalidate + // raced with PUT + // a_sendWriteBackAck; i_popIncomingRequestQueue; } - transition(NO, DMA_READ, NO_DR_B_D) { + transition({NO, NX, S}, DMA_READ, NO_DR_B_D) { vd_allocateDmaRequestInTBE; f_forwardReadFromDma; p_popDmaRequestQueue; @@ -699,23 +1039,28 @@ machine(Directory, "AMD Hammer-like protocol") // Blocked transient states transition({NO_B, O_B, NO_DR_B_W, NO_DW_B_W, NO_B_W, NO_DR_B_D, NO_DR_B, O_DR_B, O_B_W, O_DR_B_W, NO_DW_W, - NO_W, O_W, WB, WB_E_W, WB_O_W}, - {GETS, GETX, PUT}) { + NO_W, O_W, WB, WB_E_W, WB_O_W, O_R, S_R, NO_R}, + {GETS, GETX, PUT, Pf_Replacement}) { zz_recycleRequest; } transition({NO_B, O_B, NO_DR_B_W, NO_DW_B_W, NO_B_W, NO_DR_B_D, NO_DR_B, O_DR_B, O_B_W, O_DR_B_W, NO_DW_W, - NO_W, O_W, WB, WB_E_W, WB_O_W}, + NO_W, O_W, WB, WB_E_W, WB_O_W, O_R, S_R, NO_R}, {DMA_READ, DMA_WRITE}) { y_recycleDmaRequestQueue; } - transition(NO_B, Unblock, NO) { + transition(NO_B, UnblockS, NX) { j_popIncomingUnblockQueue; } - transition(O_B, Unblock, O) { + transition(NO_B, UnblockM, NO) { + uo_updateOwnerIfPf; + j_popIncomingUnblockQueue; + } + + transition(O_B, UnblockS, O) { j_popIncomingUnblockQueue; } @@ -744,7 +1089,32 @@ machine(Directory, "AMD Hammer-like protocol") n_popResponseQueue; } - transition(NO_DR_B_W, Ack) { + transition({O_R, S_R, NO_R}, Ack) { + m_decrementNumberOfMessages; + o_checkForCompletion; + n_popResponseQueue; + } + + transition(S_R, Data) { + wr_writeResponseDataToMemory; + m_decrementNumberOfMessages; + o_checkForCompletion; + n_popResponseQueue; + } + + transition(NO_R, {Data, Exclusive_Data}) { + wr_writeResponseDataToMemory; + m_decrementNumberOfMessages; + o_checkForCompletion; + n_popResponseQueue; + } + + transition({O_R, S_R, NO_R}, All_acks_and_data_no_sharers, E) { + w_deallocateTBE; + g_popTriggerQueue; + } + + transition({NO_DR_B_W, O_DR_B_W}, Ack) { m_decrementNumberOfMessages; n_popResponseQueue; } @@ -755,6 +1125,19 @@ machine(Directory, "AMD Hammer-like protocol") n_popResponseQueue; } + transition(O_DR_B, Shared_Ack) { + m_decrementNumberOfMessages; + so_setOwnerBit; + o_checkForCompletion; + n_popResponseQueue; + } + + transition(O_DR_B_W, Shared_Ack) { + m_decrementNumberOfMessages; + so_setOwnerBit; + n_popResponseQueue; + } + transition({NO_DR_B, NO_DR_B_D}, Shared_Ack) { m_decrementNumberOfMessages; r_setSharerBit; @@ -765,7 +1148,7 @@ machine(Directory, "AMD Hammer-like protocol") transition(NO_DR_B_W, Shared_Data) { r_recordCacheData; m_decrementNumberOfMessages; - r_setSharerBit; + so_setOwnerBit; o_checkForCompletion; n_popResponseQueue; } @@ -773,51 +1156,85 @@ machine(Directory, "AMD Hammer-like protocol") transition({NO_DR_B, NO_DR_B_D}, Shared_Data) { r_recordCacheData; m_decrementNumberOfMessages; - r_setSharerBit; + so_setOwnerBit; o_checkForCompletion; n_popResponseQueue; } - transition(NO_DR_B_W, Exclusive_Data) { + transition(NO_DR_B_W, {Exclusive_Data, Data}) { r_recordCacheData; m_decrementNumberOfMessages; n_popResponseQueue; } - transition({NO_DR_B, NO_DR_B_D, NO_DW_B_W}, Exclusive_Data) { + transition({NO_DR_B, NO_DR_B_D, NO_DW_B_W}, {Exclusive_Data, Data}) { r_recordCacheData; m_decrementNumberOfMessages; o_checkForCompletion; n_popResponseQueue; } - transition(NO_DR_B, All_acks_and_data, O) { + transition(NO_DR_B, All_acks_and_owner_data, O) { // // Note that the DMA consistency model allows us to send the DMA device // a response as soon as we receive valid data and prior to receiving // all acks. However, to simplify the protocol we wait for all acks. // dt_sendDmaDataFromTbe; + wdt_writeDataFromTBE; w_deallocateTBE; g_popTriggerQueue; } - transition(NO_DR_B_D, All_acks_and_data, O) { + transition(NO_DR_B, All_acks_and_shared_data, S) { // // Note that the DMA consistency model allows us to send the DMA device // a response as soon as we receive valid data and prior to receiving // all acks. However, to simplify the protocol we wait for all acks. // dt_sendDmaDataFromTbe; + wdt_writeDataFromTBE; w_deallocateTBE; g_popTriggerQueue; } - transition(O_DR_B, All_acks_and_data_no_sharers, O) { + transition(NO_DR_B_D, All_acks_and_owner_data, O) { + // + // Note that the DMA consistency model allows us to send the DMA device + // a response as soon as we receive valid data and prior to receiving + // all acks. However, to simplify the protocol we wait for all acks. + // + dt_sendDmaDataFromTbe; + wdt_writeDataFromTBE; w_deallocateTBE; g_popTriggerQueue; } + transition(NO_DR_B_D, All_acks_and_shared_data, S) { + // + // Note that the DMA consistency model allows us to send the DMA device + // a response as soon as we receive valid data and prior to receiving + // all acks. However, to simplify the protocol we wait for all acks. + // + dt_sendDmaDataFromTbe; + wdt_writeDataFromTBE; + w_deallocateTBE; + g_popTriggerQueue; + } + + transition(O_DR_B, All_acks_and_owner_data, O) { + wdt_writeDataFromTBE; + w_deallocateTBE; + g_popTriggerQueue; + } + + transition(O_DR_B, All_acks_and_data_no_sharers, E) { + wdt_writeDataFromTBE; + w_deallocateTBE; + pfd_probeFilterDeallocate; + g_popTriggerQueue; + } + transition(NO_DR_B, All_acks_and_data_no_sharers, E) { // // Note that the DMA consistency model allows us to send the DMA device @@ -825,7 +1242,9 @@ machine(Directory, "AMD Hammer-like protocol") // all acks. However, to simplify the protocol we wait for all acks. // dt_sendDmaDataFromTbe; + wdt_writeDataFromTBE; w_deallocateTBE; + ppfd_possibleProbeFilterDeallocate; g_popTriggerQueue; } @@ -837,7 +1256,9 @@ machine(Directory, "AMD Hammer-like protocol") // all acks. However, to simplify the protocol we wait for all acks. // dt_sendDmaDataFromTbe; + wdt_writeDataFromTBE; w_deallocateTBE; + ppfd_possibleProbeFilterDeallocate; g_popTriggerQueue; } @@ -850,6 +1271,7 @@ machine(Directory, "AMD Hammer-like protocol") transition(NO_DW_W, Memory_Ack, E) { da_sendDmaAck; w_deallocateTBE; + ppfd_possibleProbeFilterDeallocate; l_popMemQueue; } @@ -859,11 +1281,11 @@ machine(Directory, "AMD Hammer-like protocol") l_popMemQueue; } - transition(NO_B_W, Unblock, NO_W) { + transition(NO_B_W, {UnblockM, UnblockS}, NO_W) { j_popIncomingUnblockQueue; } - transition(O_B_W, Unblock, O_W) { + transition(O_B_W, UnblockS, O_W) { j_popIncomingUnblockQueue; } @@ -891,6 +1313,7 @@ machine(Directory, "AMD Hammer-like protocol") } transition(WB_E_W, Memory_Ack, E) { + pfd_probeFilterDeallocate; l_popMemQueue; } @@ -905,10 +1328,12 @@ machine(Directory, "AMD Hammer-like protocol") transition(WB, Writeback_Exclusive_Clean, E) { ll_checkIncomingWriteback; + pfd_probeFilterDeallocate; j_popIncomingUnblockQueue; } transition(WB, Unblock, NO) { + auno_assertUnblockerNotOwner; j_popIncomingUnblockQueue; } } diff --git a/src/mem/protocol/MOESI_hammer-msg.sm b/src/mem/protocol/MOESI_hammer-msg.sm index 4856178a1..f414d599d 100644 --- a/src/mem/protocol/MOESI_hammer-msg.sm +++ b/src/mem/protocol/MOESI_hammer-msg.sm @@ -36,6 +36,7 @@ enumeration(CoherenceRequestType, desc="...") { PUT, desc="Put Ownership"; WB_ACK, desc="Writeback ack"; WB_NACK, desc="Writeback neg. ack"; + INV, desc="Invalidate"; } // CoherenceResponseType @@ -49,7 +50,9 @@ enumeration(CoherenceResponseType, desc="...") { WB_DIRTY, desc="Dirty writeback"; WB_EXCLUSIVE_CLEAN, desc="Clean writeback of exclusive data"; WB_EXCLUSIVE_DIRTY, desc="Dirty writeback of exclusive data"; - UNBLOCK, desc="Unblock"; + UNBLOCK, desc="Unblock for writeback"; + UNBLOCKS, desc="Unblock now in S"; + UNBLOCKM, desc="Unblock now in M/O/E"; NULL, desc="Null value"; } @@ -57,6 +60,7 @@ enumeration(CoherenceResponseType, desc="...") { enumeration(TriggerType, desc="...") { L2_to_L1, desc="L2 to L1 transfer"; ALL_ACKS, desc="See corresponding event"; + ALL_ACKS_OWNER_EXISTS,desc="See corresponding event"; ALL_ACKS_NO_SHARERS, desc="See corresponding event"; } @@ -73,6 +77,7 @@ structure(RequestMsg, desc="...", interface="NetworkMessage") { MachineID Requestor, desc="Node who initiated the request"; NetDest Destination, desc="Multicast destination mask"; MessageSizeType MessageSize, desc="size category of the message"; + bool DirectedProbe, default="false", desc="probe filter directed probe"; } // ResponseMsg (and also unblock requests) diff --git a/src/mem/ruby/system/Cache.py b/src/mem/ruby/system/Cache.py index 06952afd1..ab3ec4b29 100644 --- a/src/mem/ruby/system/Cache.py +++ b/src/mem/ruby/system/Cache.py @@ -38,3 +38,4 @@ class RubyCache(SimObject): latency = Param.Int(""); assoc = Param.Int(""); replacement_policy = Param.String("PSEUDO_LRU", ""); + start_index_bit = Param.Int(6, "index start, default 6 for 64-byte line"); diff --git a/src/mem/ruby/system/CacheMemory.cc b/src/mem/ruby/system/CacheMemory.cc index 604113238..59f97e5fe 100644 --- a/src/mem/ruby/system/CacheMemory.cc +++ b/src/mem/ruby/system/CacheMemory.cc @@ -53,6 +53,7 @@ CacheMemory::CacheMemory(const Params *p) m_cache_assoc = p->assoc; m_policy = p->replacement_policy; m_profiler_ptr = new CacheProfiler(name()); + m_start_index_bit = p->start_index_bit; } void @@ -127,8 +128,8 @@ Index CacheMemory::addressToCacheSet(const Address& address) const { assert(address == line_address(address)); - return address.bitSelect(RubySystem::getBlockSizeBits(), - RubySystem::getBlockSizeBits() + m_cache_num_set_bits - 1); + return address.bitSelect(m_start_index_bit, + m_start_index_bit + m_cache_num_set_bits - 1); } // Given a cache index: returns the index of the tag in a set. diff --git a/src/mem/ruby/system/CacheMemory.hh b/src/mem/ruby/system/CacheMemory.hh index c1d49f784..3ef951821 100644 --- a/src/mem/ruby/system/CacheMemory.hh +++ b/src/mem/ruby/system/CacheMemory.hh @@ -169,6 +169,7 @@ class CacheMemory : public SimObject int m_cache_num_sets; int m_cache_num_set_bits; int m_cache_assoc; + int m_start_index_bit; }; #endif // __MEM_RUBY_SYSTEM_CACHEMEMORY_HH__