ruby: more flexible ruby tester support

This patch allows the ruby random tester to use ruby ports that may support only instruction requests or only data requests. This patch is similar to a previous changeset (8932:1b2c17565ac8) that was unfortunately broken by subsequent changesets. This current patch implements the support in a more straightforward way. Since retries are now tested when running the ruby random tester, this patch splits up the retry and drain check behavior so that RubyPort children, such as the GPUCoalescer, can perform those operations correctly without having to duplicate code. Finally, the patch also includes better DPRINTFs for debugging the tester.
2015-07-20 09:15:18 -05:00 · 2015-07-20 09:15:18 -05:00 · 173a786921
commit 173a786921
parent 4e6241007c
16 changed files with 322 additions and 179 deletions
--- a/configs/example/ruby_random_test.py
+++ b/configs/example/ruby_random_test.py
@ -125,10 +125,15 @@ for ruby_port in system.ruby._cpu_ports:
    #
    # Tie the ruby tester ports to the ruby cpu read and write ports
    #
-    if ruby_port.support_data_reqs:
-         tester.cpuDataPort = ruby_port.slave
-    if ruby_port.support_inst_reqs:
-         tester.cpuInstPort = ruby_port.slave
+    if ruby_port.support_data_reqs and ruby_port.support_inst_reqs:
+        tester.cpuInstDataPort = ruby_port.slave
+    elif ruby_port.support_data_reqs:
+        tester.cpuDataPort = ruby_port.slave
+    elif ruby_port.support_inst_reqs:
+        tester.cpuInstPort = ruby_port.slave
+
+    # Do not automatically retry stalled Ruby requests
+    ruby_port.no_retry_on_stall = True

    #
    # Tell each sequencer this is the ruby tester so that it
--- a/configs/ruby/MESI_Three_Level.py
+++ b/configs/ruby/MESI_Three_Level.py
@ -1,5 +1,5 @@
 # Copyright (c) 2006-2007 The Regents of The University of Michigan
-# Copyright (c) 2009 Advanced Micro Devices, Inc.
+# Copyright (c) 2009,2015 Advanced Micro Devices, Inc.
 # Copyright (c) 2013 Mark D. Hill and David A. Wood
 # All rights reserved.
 #
@ -44,22 +44,24 @@ class L1Cache(RubyCache): pass
 class L2Cache(RubyCache): pass

 def define_options(parser):
-    parser.add_option("--num-clusters", type="int", default=1,
-            help="number of clusters in a design in which there are shared\
+    parser.add_option("--num-clusters", type = "int", default = 1,
+            help = "number of clusters in a design in which there are shared\
            caches private to clusters")
    return

 def create_system(options, full_system, system, dma_ports, ruby_system):

    if buildEnv['PROTOCOL'] != 'MESI_Three_Level':
-        fatal("This script requires the MESI_Three_Level protocol to be built.")
+        fatal("This script requires the MESI_Three_Level protocol to be\
+               built.")

    cpu_sequencers = []

    #
    # The ruby network creation expects the list of nodes in the system to be
-    # consistent with the NetDest list.  Therefore the l1 controller nodes must be
-    # listed before the directory nodes and directory nodes before dma nodes, etc.
+    # consistent with the NetDest list.  Therefore the l1 controller nodes
+    # must be listed before the directory nodes and directory nodes before
+    # dma nodes, etc.
    #
    l0_cntrl_nodes = []
    l1_cntrl_nodes = []
@ -94,30 +96,45 @@ def create_system(options, full_system, system, dma_ports, ruby_system):
                start_index_bit = block_size_bits,
                replacement_policy = LRUReplacementPolicy())

-            l0_cntrl = L0Cache_Controller(version = i*num_cpus_per_cluster + j,
-                          Icache = l0i_cache, Dcache = l0d_cache,
-                          send_evictions = send_evicts(options),
-                          clk_domain=system.cpu[i].clk_domain,
-                          ruby_system = ruby_system)
+            # the ruby random tester reuses num_cpus to specify the
+            # number of cpu ports connected to the tester object, which
+            # is stored in system.cpu. because there is only ever one
+            # tester object, num_cpus is not necessarily equal to the
+            # size of system.cpu; therefore if len(system.cpu) == 1
+            # we use system.cpu[0] to set the clk_domain, thereby ensuring
+            # we don't index off the end of the cpu list.
+            if len(system.cpu) == 1:
+                clk_domain = system.cpu[0].clk_domain
+            else:
+                clk_domain = system.cpu[i].clk_domain
+
+            l0_cntrl = L0Cache_Controller(
+                   version = i * num_cpus_per_cluster + j, Icache = l0i_cache,
+                   Dcache = l0d_cache, send_evictions = send_evicts(options),
+                   clk_domain = clk_domain, ruby_system = ruby_system)

            cpu_seq = RubySequencer(version = i * num_cpus_per_cluster + j,
-                        icache = l0i_cache,
-                        clk_domain=system.cpu[i].clk_domain,
-                        dcache = l0d_cache, ruby_system = ruby_system)
+                                    icache = l0i_cache,
+                                    clk_domain = clk_domain,
+                                    dcache = l0d_cache,
+                                    ruby_system = ruby_system)

            l0_cntrl.sequencer = cpu_seq

-            l1_cache = L1Cache(size = options.l1d_size, assoc = options.l1d_assoc,
-                            start_index_bit = block_size_bits, is_icache = False)
+            l1_cache = L1Cache(size = options.l1d_size,
+                               assoc = options.l1d_assoc,
+                               start_index_bit = block_size_bits,
+                               is_icache = False)

-            l1_cntrl = L1Cache_Controller(version = i*num_cpus_per_cluster+j,
-                          cache = l1_cache, l2_select_num_bits = l2_bits,
-                          cluster_id = i, ruby_system = ruby_system)
+            l1_cntrl = L1Cache_Controller(
+                    version = i * num_cpus_per_cluster + j,
+                    cache = l1_cache, l2_select_num_bits = l2_bits,
+                    cluster_id = i, ruby_system = ruby_system)

-            exec("ruby_system.l0_cntrl%d = l0_cntrl" % (
-                        i*num_cpus_per_cluster+j))
-            exec("ruby_system.l1_cntrl%d = l1_cntrl" % (
-                        i*num_cpus_per_cluster+j))
+            exec("ruby_system.l0_cntrl%d = l0_cntrl"
+                 % ( i * num_cpus_per_cluster + j))
+            exec("ruby_system.l1_cntrl%d = l1_cntrl"
+                 % ( i * num_cpus_per_cluster + j))

            #
            # Add controllers and sequencers to the appropriate lists
@ -155,11 +172,11 @@ def create_system(options, full_system, system, dma_ports, ruby_system):
            l2_cntrl = L2Cache_Controller(
                        version = i * num_l2caches_per_cluster + j,
                        L2cache = l2_cache, cluster_id = i,
-                        transitions_per_cycle=options.ports,
+                        transitions_per_cycle = options.ports,
                        ruby_system = ruby_system)

-            exec("ruby_system.l2_cntrl%d = l2_cntrl" % (
-                        i * num_l2caches_per_cluster + j))
+            exec("ruby_system.l2_cntrl%d = l2_cntrl"
+                 % (i * num_l2caches_per_cluster + j))
            l2_cntrl_nodes.append(l2_cntrl)

            # Connect the L2 controllers and the network
@ -185,8 +202,7 @@ def create_system(options, full_system, system, dma_ports, ruby_system):
    # the ruby system
    # clk_divider value is a fix to pass regression.
    ruby_system.memctrl_clk_domain = DerivedClockDomain(
-                                          clk_domain=ruby_system.clk_domain,
-                                          clk_divider=3)
+            clk_domain = ruby_system.clk_domain, clk_divider = 3)

    for i in xrange(options.num_dirs):
        #
@ -196,10 +212,9 @@ def create_system(options, full_system, system, dma_ports, ruby_system):
        dir_size.value = mem_module_size

        dir_cntrl = Directory_Controller(version = i,
-                                         directory = RubyDirectoryMemory(
-                                             version = i, size = dir_size),
-                                         transitions_per_cycle = options.ports,
-                                         ruby_system = ruby_system)
+                directory = RubyDirectoryMemory(version = i, size = dir_size),
+                transitions_per_cycle = options.ports,
+                ruby_system = ruby_system)

        exec("ruby_system.dir_cntrl%d = dir_cntrl" % i)
        dir_cntrl_nodes.append(dir_cntrl)
@ -217,8 +232,7 @@ def create_system(options, full_system, system, dma_ports, ruby_system):
        #
        # Create the Ruby objects associated with the dma controller
        #
-        dma_seq = DMASequencer(version = i,
-                               ruby_system = ruby_system)
+        dma_seq = DMASequencer(version = i, ruby_system = ruby_system)

        dma_cntrl = DMA_Controller(version = i,
                                   dma_sequencer = dma_seq,
--- a/configs/ruby/MESI_Two_Level.py
+++ b/configs/ruby/MESI_Two_Level.py
@ -82,23 +82,33 @@ def create_system(options, full_system, system, dma_ports, ruby_system):

        prefetcher = RubyPrefetcher.Prefetcher()

-        l1_cntrl = L1Cache_Controller(version = i,
-                                      L1Icache = l1i_cache,
+        # the ruby random tester reuses num_cpus to specify the
+        # number of cpu ports connected to the tester object, which
+        # is stored in system.cpu. because there is only ever one
+        # tester object, num_cpus is not necessarily equal to the
+        # size of system.cpu; therefore if len(system.cpu) == 1
+        # we use system.cpu[0] to set the clk_domain, thereby ensuring
+        # we don't index off the end of the cpu list.
+        if len(system.cpu) == 1:
+            clk_domain = system.cpu[0].clk_domain
+        else:
+            clk_domain = system.cpu[i].clk_domain
+
+        l1_cntrl = L1Cache_Controller(version = i, L1Icache = l1i_cache,
                                      L1Dcache = l1d_cache,
                                      l2_select_num_bits = l2_bits,
                                      send_evictions = send_evicts(options),
                                      prefetcher = prefetcher,
                                      ruby_system = ruby_system,
-                                      clk_domain=system.cpu[i].clk_domain,
-                                      transitions_per_cycle=options.ports,
+                                      clk_domain = clk_domain,
+                                      transitions_per_cycle = options.ports,
                                      enable_prefetch = False)

-        cpu_seq = RubySequencer(version = i,
-                                icache = l1i_cache,
-                                dcache = l1d_cache,
-                                clk_domain=system.cpu[i].clk_domain,
+        cpu_seq = RubySequencer(version = i, icache = l1i_cache,
+                                dcache = l1d_cache, clk_domain = clk_domain,
                                ruby_system = ruby_system)

+
        l1_cntrl.sequencer = cpu_seq
        exec("ruby_system.l1_cntrl%d = l1_cntrl" % i)

@ -135,7 +145,7 @@ def create_system(options, full_system, system, dma_ports, ruby_system):

        l2_cntrl = L2Cache_Controller(version = i,
                                      L2cache = l2_cache,
-                                      transitions_per_cycle=options.ports,
+                                      transitions_per_cycle = options.ports,
                                      ruby_system = ruby_system)

        exec("ruby_system.l2_cntrl%d = l2_cntrl" % i)
@ -166,18 +176,17 @@ def create_system(options, full_system, system, dma_ports, ruby_system):
    # the ruby system
    # clk_divider value is a fix to pass regression.
    ruby_system.memctrl_clk_domain = DerivedClockDomain(
-                                          clk_domain=ruby_system.clk_domain,
-                                          clk_divider=3)
+                                          clk_domain = ruby_system.clk_domain,
+                                          clk_divider = 3)

    for i in xrange(options.num_dirs):
        dir_size = MemorySize('0B')
        dir_size.value = mem_module_size

        dir_cntrl = Directory_Controller(version = i,
-                                         directory = RubyDirectoryMemory(
-                                             version = i, size = dir_size),
-                                         transitions_per_cycle = options.ports,
-                                         ruby_system = ruby_system)
+                directory = RubyDirectoryMemory(version = i, size = dir_size),
+                transitions_per_cycle = options.ports,
+                ruby_system = ruby_system)

        exec("ruby_system.dir_cntrl%d = dir_cntrl" % i)
        dir_cntrl_nodes.append(dir_cntrl)
@ -194,12 +203,10 @@ def create_system(options, full_system, system, dma_ports, ruby_system):

    for i, dma_port in enumerate(dma_ports):
        # Create the Ruby objects associated with the dma controller
-        dma_seq = DMASequencer(version = i,
-                               ruby_system = ruby_system,
+        dma_seq = DMASequencer(version = i, ruby_system = ruby_system,
                               slave = dma_port)

-        dma_cntrl = DMA_Controller(version = i,
-                                   dma_sequencer = dma_seq,
+        dma_cntrl = DMA_Controller(version = i, dma_sequencer = dma_seq,
                                   transitions_per_cycle = options.ports,
                                   ruby_system = ruby_system)

@ -220,7 +227,8 @@ def create_system(options, full_system, system, dma_ports, ruby_system):

    # Create the io controller and the sequencer
    if full_system:
-        io_seq = DMASequencer(version=len(dma_ports), ruby_system=ruby_system)
+        io_seq = DMASequencer(version = len(dma_ports),
+                              ruby_system = ruby_system)
        ruby_system._io_port = io_seq
        io_controller = DMA_Controller(version = len(dma_ports),
                                       dma_sequencer = io_seq,
--- a/configs/ruby/MI_example.py
+++ b/configs/ruby/MI_example.py
@ -74,21 +74,28 @@ def create_system(options, full_system, system, dma_ports, ruby_system):
                        assoc = options.l1d_assoc,
                        start_index_bit = block_size_bits)

-        #
-        # Only one unified L1 cache exists.  Can cache instructions and data.
-        #
-        l1_cntrl = L1Cache_Controller(version = i,
-                                      cacheMemory = cache,
-                                      send_evictions = send_evicts(options),
-                                      transitions_per_cycle = options.ports,
-                                      clk_domain=system.cpu[i].clk_domain,
-                                      ruby_system = ruby_system)

-        cpu_seq = RubySequencer(version = i,
-                                icache = cache,
-                                dcache = cache,
-                                clk_domain=system.cpu[i].clk_domain,
-                                ruby_system = ruby_system)
+        # the ruby random tester reuses num_cpus to specify the
+        # number of cpu ports connected to the tester object, which
+        # is stored in system.cpu. because there is only ever one
+        # tester object, num_cpus is not necessarily equal to the
+        # size of system.cpu; therefore if len(system.cpu) == 1
+        # we use system.cpu[0] to set the clk_domain, thereby ensuring
+        # we don't index off the end of the cpu list.
+        if len(system.cpu) == 1:
+            clk_domain = system.cpu[0].clk_domain
+        else:
+            clk_domain = system.cpu[i].clk_domain
+
+        # Only one unified L1 cache exists. Can cache instructions and data.
+        l1_cntrl = L1Cache_Controller(version=i, cacheMemory=cache,
+                                      send_evictions=send_evicts(options),
+                                      transitions_per_cycle=options.ports,
+                                      clk_domain=clk_domain,
+                                      ruby_system=ruby_system)
+
+        cpu_seq = RubySequencer(version=i, icache=cache, dcache=cache,
+                                clk_domain=clk_domain, ruby_system=ruby_system)

        l1_cntrl.sequencer = cpu_seq
        exec("ruby_system.l1_cntrl%d = l1_cntrl" % i)
--- a/configs/ruby/MOESI_CMP_directory.py
+++ b/configs/ruby/MOESI_CMP_directory.py
@ -80,20 +80,29 @@ def create_system(options, full_system, system, dma_ports, ruby_system):
                            start_index_bit = block_size_bits,
                            is_icache = False)

-        l1_cntrl = L1Cache_Controller(version = i,
-                                      L1Icache = l1i_cache,
-                                      L1Dcache = l1d_cache,
-                                      l2_select_num_bits = l2_bits,
-                                      send_evictions = send_evicts(options),
-                                      transitions_per_cycle = options.ports,
-                                      clk_domain=system.cpu[i].clk_domain,
-                                      ruby_system = ruby_system)
+        # the ruby random tester reuses num_cpus to specify the
+        # number of cpu ports connected to the tester object, which
+        # is stored in system.cpu. because there is only ever one
+        # tester object, num_cpus is not necessarily equal to the
+        # size of system.cpu; therefore if len(system.cpu) == 1
+        # we use system.cpu[0] to set the clk_domain, thereby ensuring
+        # we don't index off the end of the cpu list.
+        if len(system.cpu) == 1:
+            clk_domain = system.cpu[0].clk_domain
+        else:
+            clk_domain = system.cpu[i].clk_domain

-        cpu_seq = RubySequencer(version = i,
-                                icache = l1i_cache,
-                                dcache = l1d_cache,
-                                clk_domain=system.cpu[i].clk_domain,
-                                ruby_system = ruby_system)
+        l1_cntrl = L1Cache_Controller(version=i, L1Icache=l1i_cache,
+                                      L1Dcache=l1d_cache,
+                                      l2_select_num_bits=l2_bits,
+                                      send_evictions=send_evicts(options),
+                                      transitions_per_cycle=options.ports,
+                                      clk_domain=clk_domain,
+                                      ruby_system=ruby_system)
+
+        cpu_seq = RubySequencer(version=i, icache=l1i_cache,
+                                dcache=l1d_cache, clk_domain=clk_domain,
+                                ruby_system=ruby_system)

        l1_cntrl.sequencer = cpu_seq
        exec("ruby_system.l1_cntrl%d = l1_cntrl" % i)
--- a/configs/ruby/MOESI_CMP_token.py
+++ b/configs/ruby/MOESI_CMP_token.py
@ -91,29 +91,37 @@ def create_system(options, full_system, system, dma_ports, ruby_system):
                            assoc = options.l1d_assoc,
                            start_index_bit = block_size_bits)

-        l1_cntrl = L1Cache_Controller(version = i,
-                                      L1Icache = l1i_cache,
-                                      L1Dcache = l1d_cache,
-                                      l2_select_num_bits = l2_bits,
-                                      N_tokens = n_tokens,
-                                      retry_threshold = \
-                                        options.l1_retries,
-                                      fixed_timeout_latency = \
-                                        options.timeout_latency,
-                                      dynamic_timeout_enabled = \
-                                        not options.disable_dyn_timeouts,
-                                      no_mig_atomic = not \
-                                        options.allow_atomic_migration,
-                                      send_evictions = send_evicts(options),
-                                      transitions_per_cycle = options.ports,
-                                      clk_domain=system.cpu[i].clk_domain,
-                                      ruby_system = ruby_system)
+        # the ruby random tester reuses num_cpus to specify the
+        # number of cpu ports connected to the tester object, which
+        # is stored in system.cpu. because there is only ever one
+        # tester object, num_cpus is not necessarily equal to the
+        # size of system.cpu; therefore if len(system.cpu) == 1
+        # we use system.cpu[0] to set the clk_domain, thereby ensuring
+        # we don't index off the end of the cpu list.
+        if len(system.cpu) == 1:
+            clk_domain = system.cpu[0].clk_domain
+        else:
+            clk_domain = system.cpu[i].clk_domain

-        cpu_seq = RubySequencer(version = i,
-                                icache = l1i_cache,
-                                dcache = l1d_cache,
-                                clk_domain=system.cpu[i].clk_domain,
-                                ruby_system = ruby_system)
+        l1_cntrl = L1Cache_Controller(version=i, L1Icache=l1i_cache,
+                                      L1Dcache=l1d_cache,
+                                      l2_select_num_bits=l2_bits,
+                                      N_tokens=n_tokens,
+                                      retry_threshold=options.l1_retries,
+                                      fixed_timeout_latency=\
+                                      options.timeout_latency,
+                                      dynamic_timeout_enabled=\
+                                      not options.disable_dyn_timeouts,
+                                      no_mig_atomic=not \
+                                      options.allow_atomic_migration,
+                                      send_evictions=send_evicts(options),
+                                      transitions_per_cycle=options.ports,
+                                      clk_domain=clk_domain,
+                                      ruby_system=ruby_system)
+
+        cpu_seq = RubySequencer(version=i, icache=l1i_cache,
+                                dcache=l1d_cache, clk_domain=clk_domain,
+                                ruby_system=ruby_system)

        l1_cntrl.sequencer = cpu_seq
        exec("ruby_system.l1_cntrl%d = l1_cntrl" % i)
--- a/configs/ruby/MOESI_hammer.py
+++ b/configs/ruby/MOESI_hammer.py
@ -89,22 +89,30 @@ def create_system(options, full_system, system, dma_ports, ruby_system):
                           assoc = options.l2_assoc,
                           start_index_bit = block_size_bits)

-        l1_cntrl = L1Cache_Controller(version = i,
-                                      L1Icache = l1i_cache,
-                                      L1Dcache = l1d_cache,
-                                      L2cache = l2_cache,
-                                      no_mig_atomic = not \
-                                        options.allow_atomic_migration,
-                                      send_evictions = send_evicts(options),
-                                      transitions_per_cycle = options.ports,
-                                      clk_domain=system.cpu[i].clk_domain,
-                                      ruby_system = ruby_system)
+        # the ruby random tester reuses num_cpus to specify the
+        # number of cpu ports connected to the tester object, which
+        # is stored in system.cpu. because there is only ever one
+        # tester object, num_cpus is not necessarily equal to the
+        # size of system.cpu; therefore if len(system.cpu) == 1
+        # we use system.cpu[0] to set the clk_domain, thereby ensuring
+        # we don't index off the end of the cpu list.
+        if len(system.cpu) == 1:
+            clk_domain = system.cpu[0].clk_domain
+        else:
+            clk_domain = system.cpu[i].clk_domain

-        cpu_seq = RubySequencer(version = i,
-                                icache = l1i_cache,
-                                dcache = l1d_cache,
-                                clk_domain=system.cpu[i].clk_domain,
-                                ruby_system = ruby_system)
+        l1_cntrl = L1Cache_Controller(version=i, L1Icache=l1i_cache,
+                                      L1Dcache=l1d_cache, L2cache=l2_cache,
+                                      no_mig_atomic=not \
+                                      options.allow_atomic_migration,
+                                      send_evictions=send_evicts(options),
+                                      transitions_per_cycle=options.ports,
+                                      clk_domain=clk_domain,
+                                      ruby_system=ruby_system)
+
+        cpu_seq = RubySequencer(version=i, icache=l1i_cache,
+                                dcache=l1d_cache,clk_domain=clk_domain,
+                                ruby_system=ruby_system)

        l1_cntrl.sequencer = cpu_seq
        if options.recycle_latency:
--- a/src/cpu/testers/rubytest/Check.cc
+++ b/src/cpu/testers/rubytest/Check.cc
@ -94,7 +94,9 @@ Check::initiatePrefetch()
        cmd = MemCmd::ReadReq;

        // if necessary, make the request an instruction fetch
-        if (m_tester_ptr->isInstReadableCpuPort(index)) {
+        if (m_tester_ptr->isInstOnlyCpuPort(index) ||
+            (m_tester_ptr->isInstDataCpuPort(index) &&
+             (random_mt.random(0, 0x1)))) {
            flags.set(Request::INST_FETCH);
        }
    } else {
@ -193,7 +195,7 @@ Check::initiateAction()
    *writeData = m_value + m_store_count;
    pkt->dataDynamic(writeData);

-    DPRINTF(RubyTest, "data 0x%x check 0x%x\n",
+    DPRINTF(RubyTest, "Seq write: index %d data 0x%x check 0x%x\n", index,
            *(pkt->getConstPtr<uint8_t>()), *writeData);

    // push the subblock onto the sender state.  The sequencer will
@ -205,6 +207,7 @@ Check::initiateAction()
        DPRINTF(RubyTest, "status before action update: %s\n",
                (TesterStatus_to_string(m_status)).c_str());
        m_status = TesterStatus_Action_Pending;
+        DPRINTF(RubyTest, "Check %s, State=Action_Pending\n", m_address);
    } else {
        // If the packet did not issue, must delete
        // Note: No need to delete the data, the packet destructor
@ -232,7 +235,9 @@ Check::initiateCheck()
    Request::Flags flags;

    // If necessary, make the request an instruction fetch
-    if (m_tester_ptr->isInstReadableCpuPort(index)) {
+    if (m_tester_ptr->isInstOnlyCpuPort(index) ||
+        (m_tester_ptr->isInstDataCpuPort(index) &&
+         (random_mt.random(0, 0x1)))) {
        flags.set(Request::INST_FETCH);
    }

@ -245,6 +250,8 @@ Check::initiateCheck()
    uint8_t *dataArray = new uint8_t[CHECK_SIZE];
    pkt->dataDynamic(dataArray);

+    DPRINTF(RubyTest, "Seq read: index %d\n", index);
+
    // push the subblock onto the sender state.  The sequencer will
    // update the subblock on the return
    pkt->senderState = new SenderState(m_address, req->getSize());
@ -254,6 +261,7 @@ Check::initiateCheck()
        DPRINTF(RubyTest, "status before check update: %s\n",
                TesterStatus_to_string(m_status).c_str());
        m_status = TesterStatus_Check_Pending;
+        DPRINTF(RubyTest, "Check %s, State=Check_Pending\n", m_address);
    } else {
        // If the packet did not issue, must delete
        // Note: No need to delete the data, the packet destructor
@ -291,8 +299,11 @@ Check::performCallback(NodeID proc, SubBlock* data, Cycles curTime)
        m_store_count++;
        if (m_store_count == CHECK_SIZE) {
            m_status = TesterStatus_Ready;
+            DPRINTF(RubyTest, "Check %s, State=Ready\n", m_address);
        } else {
            m_status = TesterStatus_Idle;
+            DPRINTF(RubyTest, "Check %s, State=Idle store_count: %d\n",
+                    m_address, m_store_count);
        }
        DPRINTF(RubyTest, "Action callback return data now %d\n",
                data->getByte(0));
@ -316,6 +327,7 @@ Check::performCallback(NodeID proc, SubBlock* data, Cycles curTime)
        m_tester_ptr->incrementCheckCompletions();

        m_status = TesterStatus_Idle;
+        DPRINTF(RubyTest, "Check %s, State=Idle\n", m_address);
        pickValue();

    } else {
@ -335,6 +347,7 @@ Check::changeAddress(Addr address)
    assert(m_status == TesterStatus_Idle || m_status == TesterStatus_Ready);
    m_status = TesterStatus_Idle;
    m_address = address;
+    DPRINTF(RubyTest, "Check %s, State=Idle\n", m_address);
    m_store_count = 0;
 }

@ -342,7 +355,6 @@ void
 Check::pickValue()
 {
    assert(m_status == TesterStatus_Idle);
-    m_status = TesterStatus_Idle;
    m_value = random_mt.random(0, 0xff); // One byte
    m_store_count = 0;
 }
@ -353,7 +365,8 @@ Check::pickInitiatingNode()
    assert(m_status == TesterStatus_Idle || m_status == TesterStatus_Ready);
    m_status = TesterStatus_Idle;
    m_initiatingNode = (random_mt.random(0, m_num_writers - 1));
-    DPRINTF(RubyTest, "picked initiating node %d\n", m_initiatingNode);
+    DPRINTF(RubyTest, "Check %s, State=Idle, picked initiating node %d\n",
+            m_address, m_initiatingNode);
    m_store_count = 0;
 }

--- a/src/cpu/testers/rubytest/CheckTable.cc
+++ b/src/cpu/testers/rubytest/CheckTable.cc
@ -42,6 +42,7 @@ CheckTable::CheckTable(int _num_writers, int _num_readers, RubyTester* _tester)
    const int size1 = 32;
    const int size2 = 100;

+    DPRINTF(RubyTest, "Adding false sharing checks\n");
    // The first set is to get some false sharing
    physical = 1000;
    for (int i = 0; i < size1; i++) {
@ -50,6 +51,7 @@ CheckTable::CheckTable(int _num_writers, int _num_readers, RubyTester* _tester)
        physical += CHECK_SIZE;
    }

+    DPRINTF(RubyTest, "Adding cache conflict checks\n");
    // The next two sets are to get some limited false sharing and
    // cache conflicts
    physical = 1000;
@ -59,6 +61,7 @@ CheckTable::CheckTable(int _num_writers, int _num_readers, RubyTester* _tester)
        physical += 256;
    }

+    DPRINTF(RubyTest, "Adding cache conflict checks2\n");
    physical = 1000 + CHECK_SIZE;
    for (int i = 0; i < size2; i++) {
        // Setup linear addresses
@ -91,6 +94,8 @@ CheckTable::addCheck(Addr address)
        }
    }

+    DPRINTF(RubyTest, "Adding check for address: %s\n", address);
+
    Check* check_ptr = new Check(address, 100 + m_check_vector.size(),
                                 m_num_writers, m_num_readers, m_tester_ptr);
    for (int i = 0; i < CHECK_SIZE; i++) {
@ -110,7 +115,7 @@ CheckTable::getRandomCheck()
 Check*
 CheckTable::getCheck(const Addr address)
 {
-    DPRINTF(RubyTest, "Looking for check by address: %s", address);
+    DPRINTF(RubyTest, "Looking for check by address: %s\n", address);

    auto i = m_lookup_map.find(address);

--- a/src/cpu/testers/rubytest/RubyTester.cc
+++ b/src/cpu/testers/rubytest/RubyTester.cc
@ -58,7 +58,8 @@ RubyTester::RubyTester(const Params *p)
    m_num_readers(0),
    m_wakeup_frequency(p->wakeup_frequency),
    m_check_flush(p->check_flush),
-    m_num_inst_ports(p->port_cpuInstPort_connection_count)
+    m_num_inst_only_ports(p->port_cpuInstPort_connection_count),
+    m_num_inst_data_ports(p->port_cpuInstDataPort_connection_count)
 {
    m_checks_completed = 0;

@ -73,15 +74,25 @@ RubyTester::RubyTester(const Params *p)
    // Note: the inst ports are the lowest elements of the readPort vector,
    // then the data ports are added to the readPort vector
    //
+    int idx = 0;
    for (int i = 0; i < p->port_cpuInstPort_connection_count; ++i) {
        readPorts.push_back(new CpuPort(csprintf("%s-instPort%d", name(), i),
-                                        this, i));
+                                        this, i, idx));
+        idx++;
+    }
+    for (int i = 0; i < p->port_cpuInstDataPort_connection_count; ++i) {
+        CpuPort *port = new CpuPort(csprintf("%s-instDataPort%d", name(), i),
+                                    this, i, idx);
+        readPorts.push_back(port);
+        writePorts.push_back(port);
+        idx++;
    }
    for (int i = 0; i < p->port_cpuDataPort_connection_count; ++i) {
        CpuPort *port = new CpuPort(csprintf("%s-dataPort%d", name(), i),
-                                    this, i);
+                                    this, i, idx);
        readPorts.push_back(port);
        writePorts.push_back(port);
+        idx++;
    }

    // add the check start event to the event queue
@ -108,6 +119,7 @@ RubyTester::init()

    m_num_writers = writePorts.size();
    m_num_readers = readPorts.size();
+    assert(m_num_readers == m_num_cpus);

    m_checkTable_ptr = new CheckTable(m_num_writers, m_num_readers, this);
 }
@ -115,32 +127,45 @@ RubyTester::init()
 BaseMasterPort &
 RubyTester::getMasterPort(const std::string &if_name, PortID idx)
 {
+    // Resolve a tester port from its name and per-name index. Names other
+    // than the three cpu port names are forwarded to MemObject. readPorts is
+    // laid out as [inst-only | inst+data | data-only], so a per-name idx is
+    // valid only in [0, count) of its own group: idx == count would select
+    // the first port of the next group (or run past the end of readPorts)
+    // and therefore has to panic as well.
-    if (if_name != "cpuInstPort" && if_name != "cpuDataPort") {
+    if (if_name != "cpuInstPort" && if_name != "cpuInstDataPort" &&
+        if_name != "cpuDataPort") {
        // pass it along to our super class
        return MemObject::getMasterPort(if_name, idx);
    } else {
        if (if_name == "cpuInstPort") {
-            if (idx > m_num_inst_ports) {
-                panic("RubyTester::getMasterPort: unknown inst port idx %d\n",
+            if (idx >= m_num_inst_only_ports) {
+                panic("RubyTester::getMasterPort: unknown inst port %d\n",
                      idx);
            }
            //
-            // inst ports directly map to the lowest readPort elements
+            // inst ports map to the lowest readPort elements
            //
            return *readPorts[idx];
+        } else if (if_name == "cpuInstDataPort") {
+            if (idx >= m_num_inst_data_ports) {
+                panic("RubyTester::getMasterPort: unknown inst+data port %d\n",
+                      idx);
+            }
+            int read_idx = idx + m_num_inst_only_ports;
+            //
+            // inst+data ports map to the next readPort elements
+            //
+            return *readPorts[read_idx];
        } else {
            assert(if_name == "cpuDataPort");
            //
-            // add the inst port offset to translate to the correct read port
-            // index
+            // data only ports map to the final readPort elements
            //
-            int read_idx = idx + m_num_inst_ports;
-            if (read_idx >= static_cast<PortID>(readPorts.size())) {
-                panic("RubyTester::getMasterPort: unknown data port idx %d\n",
+            if (idx >= (static_cast<int>(readPorts.size()) -
+                        (m_num_inst_only_ports + m_num_inst_data_ports))) {
+                panic("RubyTester::getMasterPort: unknown data port %d\n",
                      idx);
            }
+            int read_idx = idx + m_num_inst_only_ports + m_num_inst_data_ports;
            return *readPorts[read_idx];
        }
+        // Note: currently the Ruby Tester does not support write only ports
+        // but that could easily be added here
    }
 }

@ -152,7 +177,7 @@ RubyTester::CpuPort::recvTimingResp(PacketPtr pkt)
        safe_cast<RubyTester::SenderState*>(pkt->senderState);
    SubBlock& subblock = senderState->subBlock;

-    tester->hitCallback(id, &subblock);
+    tester->hitCallback(globalIdx, &subblock);

    // Now that the tester has completed, delete the senderState
    // (includes sublock) and the packet, then return
@ -163,9 +188,16 @@ RubyTester::CpuPort::recvTimingResp(PacketPtr pkt)
 }

 bool
-RubyTester::isInstReadableCpuPort(int idx)
+RubyTester::isInstOnlyCpuPort(int idx)
 {
+    // True when idx is below the inst-only port count, i.e. idx lies in
+    // [0, m_num_inst_only_ports). A negative idx is not rejected here.
-    return idx < m_num_inst_ports;
+    return idx < m_num_inst_only_ports;
+}
+
+// Returns true when idx lies in the half-open range
+// [m_num_inst_only_ports, m_num_inst_only_ports + m_num_inst_data_ports),
+// false otherwise.
+bool
+RubyTester::isInstDataCpuPort(int idx)
+{
+    return ((idx >= m_num_inst_only_ports) &&
+            (idx < (m_num_inst_only_ports + m_num_inst_data_ports)));
 }

 MasterPort*
@ -190,13 +222,13 @@ RubyTester::hitCallback(NodeID proc, SubBlock* data)
    // Mark that we made progress
    m_last_progress_vector[proc] = curCycle();

-    DPRINTF(RubyTest, "completed request for proc: %d\n", proc);
-    DPRINTF(RubyTest, "addr: 0x%x, size: %d, data: ",
+    DPRINTF(RubyTest, "completed request for proc: %d", proc);
+    DPRINTFR(RubyTest, " addr: 0x%x, size: %d, data: ",
            data->getAddress(), data->getSize());
    for (int byte = 0; byte < data->getSize(); byte++) {
-        DPRINTF(RubyTest, "%d", data->getByte(byte));
+        DPRINTFR(RubyTest, "%d ", data->getByte(byte));
    }
-    DPRINTF(RubyTest, "\n");
+    DPRINTFR(RubyTest, "\n");

    // This tells us our store has 'completed' or for a load gives us
    // back the data to make the check
--- a/src/cpu/testers/rubytest/RubyTester.hh
+++ b/src/cpu/testers/rubytest/RubyTester.hh
@ -60,6 +60,8 @@ class RubyTester : public MemObject
    {
      private:
        RubyTester *tester;
+        // index for m_last_progress_vector and hitCallback
+        PortID globalIdx;

      public:
        //
@ -68,8 +70,10 @@ class RubyTester : public MemObject
        // RubyPorts that support both types of requests, separate InstOnly
        // and DataOnly CpuPorts will map to that RubyPort

+        // _name, _tester and _id are forwarded to the MasterPort base;
+        // _tester is also kept in tester. _index is stored in globalIdx,
+        // the value recvTimingResp passes to RubyTester::hitCallback.
-        CpuPort(const std::string &_name, RubyTester *_tester, PortID _id)
-            : MasterPort(_name, _tester, _id), tester(_tester)
+        CpuPort(const std::string &_name, RubyTester *_tester, PortID _id,
+                PortID _index)
+            : MasterPort(_name, _tester, _id), tester(_tester),
+              globalIdx(_index)
        {}

      protected:
@ -93,7 +97,8 @@ class RubyTester : public MemObject
    virtual BaseMasterPort &getMasterPort(const std::string &if_name,
                                          PortID idx = InvalidPortID);

-    bool isInstReadableCpuPort(int idx);
+    bool isInstOnlyCpuPort(int idx);
+    bool isInstDataCpuPort(int idx);

    MasterPort* getReadableCpuPort(int idx);
    MasterPort* getWritableCpuPort(int idx);
@ -152,7 +157,8 @@ class RubyTester : public MemObject
    int m_num_readers;
    int m_wakeup_frequency;
    bool m_check_flush;
-    int m_num_inst_ports;
+    int m_num_inst_only_ports;
+    int m_num_inst_data_ports;
 };

 inline std::ostream&
--- a/src/cpu/testers/rubytest/RubyTester.py
+++ b/src/cpu/testers/rubytest/RubyTester.py
@ -34,8 +34,9 @@ class RubyTester(MemObject):
    type = 'RubyTester'
    cxx_header = "cpu/testers/rubytest/RubyTester.hh"
    num_cpus = Param.Int("number of cpus / RubyPorts")
-    cpuDataPort = VectorMasterPort("the cpu data cache ports")
-    cpuInstPort = VectorMasterPort("the cpu inst cache ports")
+    cpuInstDataPort = VectorMasterPort("cpu combo ports to inst & data caches")
+    cpuInstPort = VectorMasterPort("cpu ports to only inst caches")
+    cpuDataPort = VectorMasterPort("cpu ports to only data caches")
    checks_to_complete = Param.Int(100, "checks to complete")
    deadlock_threshold = Param.Int(50000, "how often to check for deadlock")
    wakeup_frequency = Param.Int(10, "number of cycles between wakeups")
--- a/src/mem/ruby/system/RubyPort.cc
+++ b/src/mem/ruby/system/RubyPort.cc
@ -11,7 +11,7 @@
 * unmodified and in its entirety in all distributions of the software,
 * modified or unmodified, in source code or in binary form.
 *
- * Copyright (c) 2009 Advanced Micro Devices, Inc.
+ * Copyright (c) 2009-2013 Advanced Micro Devices, Inc.
 * Copyright (c) 2011 Mark D. Hill and David A. Wood
 * All rights reserved.
 *
@ -58,7 +58,8 @@ RubyPort::RubyPort(const Params *p)
      pioSlavePort(csprintf("%s.pio-slave-port", name()), this),
      memMasterPort(csprintf("%s.mem-master-port", name()), this),
      memSlavePort(csprintf("%s-mem-slave-port", name()), this,
-                   p->ruby_system->getAccessBackingStore(), -1),
+                   p->ruby_system->getAccessBackingStore(), -1,
+                   p->no_retry_on_stall),
      gotAddrRanges(p->port_master_connection_count)
 {
    assert(m_version != -1);
@ -66,7 +67,8 @@ RubyPort::RubyPort(const Params *p)
    // create the slave ports based on the number of connected ports
    for (size_t i = 0; i < p->port_slave_connection_count; ++i) {
        slave_ports.push_back(new MemSlavePort(csprintf("%s.slave%d", name(),
-            i), this, p->ruby_system->getAccessBackingStore(), i));
+            i), this, p->ruby_system->getAccessBackingStore(),
+            i, p->no_retry_on_stall));
    }

    // create the master ports based on the number of connected ports
@ -156,9 +158,11 @@ RubyPort::MemMasterPort::MemMasterPort(const std::string &_name,
 }

 RubyPort::MemSlavePort::MemSlavePort(const std::string &_name, RubyPort *_port,
-                                     bool _access_backing_store, PortID id)
+                                     bool _access_backing_store, PortID id,
+                                     bool _no_retry_on_stall)
    : QueuedSlavePort(_name, _port, queue, id), queue(*_port, *this),
-      access_backing_store(_access_backing_store)
+      access_backing_store(_access_backing_store),
+      no_retry_on_stall(_no_retry_on_stall)
 {
+    // no_retry_on_stall is read by MemSlavePort::addToRetryList(): when it
+    // is set, this port is never recorded on the RubyPort retry list.
    DPRINTF(RubyPort, "Created slave memport on ruby sequencer %s\n", _name);
 }
@ -267,20 +271,30 @@ RubyPort::MemSlavePort::recvTimingReq(PacketPtr pkt)
        return true;
    }

-    //
-    // Unless one is using the ruby tester, record the stalled M5 port for
-    // later retry when the sequencer becomes free.
-    //
-    if (!ruby_port->m_usingRubyTester) {
-        ruby_port->addToRetryList(this);
-    }

    DPRINTF(RubyPort, "Request for address %#x did not issued because %s\n",
            pkt->getAddr(), RequestStatus_to_string(requestStatus));

+    addToRetryList();
+
    return false;
 }

+// Record this slave port on the owning RubyPort's retry list, unless the
+// port was created with no_retry_on_stall set or it is already on the list.
+void
+RubyPort::MemSlavePort::addToRetryList()
+{
+    RubyPort *ruby_port = static_cast<RubyPort *>(&owner);
+
+    //
+    // Unless the requestor does not want retries (e.g., the Ruby tester),
+    // record the stalled M5 port for later retry when the sequencer
+    // becomes free.
+    //
+    if (!no_retry_on_stall && !ruby_port->onRetryList(this)) {
+        ruby_port->addToRetryList(this);
+    }
+}
+
 void
 RubyPort::MemSlavePort::recvFunctional(PacketPtr pkt)
 {
@ -356,31 +370,33 @@ RubyPort::ruby_hit_callback(PacketPtr pkt)

    port->hitCallback(pkt);

+    trySendRetries();
+}
+
+// Send a retry to every port currently on retryList. The list is copied and
+// cleared before any retry is sent. This function does not call
+// testDrainComplete(); callers that need the drain check invoke it
+// themselves.
+void
+RubyPort::trySendRetries()
+{
    //
    // If we had to stall the MemSlavePorts, wake them up because the sequencer
    // likely has free resources now.
    //
    if (!retryList.empty()) {
-        //
-        // Record the current list of ports to retry on a temporary list before
-        // calling sendRetry on those ports.  sendRetry will cause an
-        // immediate retry, which may result in the ports being put back on the
-        // list. Therefore we want to clear the retryList before calling
-        // sendRetry.
-        //
+        // Record the current list of ports to retry on a temporary list
+        // before calling sendRetryReq on those ports. sendRetryReq will cause
+        // an immediate retry, which may result in the ports being put back on
+        // the list. Therefore we want to clear the retryList before calling
+        // sendRetryReq.
        std::vector<MemSlavePort *> curRetryList(retryList);

        retryList.clear();

        for (auto i = curRetryList.begin(); i != curRetryList.end(); ++i) {
            DPRINTF(RubyPort,
-                    "Sequencer may now be free.  SendRetry to port %s\n",
+                    "Sequencer may now be free. SendRetry to port %s\n",
                    (*i)->name());
            (*i)->sendRetryReq();
        }
    }
-
-    testDrainComplete();
 }

 void
--- a/src/mem/ruby/system/RubyPort.hh
+++ b/src/mem/ruby/system/RubyPort.hh
@ -11,7 +11,7 @@
 * unmodified and in its entirety in all distributions of the software,
 * modified or unmodified, in source code or in binary form.
 *
- * Copyright (c) 2009 Advanced Micro Devices, Inc.
+ * Copyright (c) 2009-2013 Advanced Micro Devices, Inc.
 * Copyright (c) 2011 Mark D. Hill and David A. Wood
 * All rights reserved.
 *
@ -76,10 +76,12 @@ class RubyPort : public MemObject
      private:
        RespPacketQueue queue;
        bool access_backing_store;
+        bool no_retry_on_stall;

      public:
        MemSlavePort(const std::string &_name, RubyPort *_port,
-                     bool _access_backing_store, PortID id);
+                     bool _access_backing_store,
+                     PortID id, bool _no_retry_on_stall);
        void hitCallback(PacketPtr pkt);
        void evictionCallback(Addr address);

@ -94,6 +96,8 @@ class RubyPort : public MemObject
        AddrRangeList getAddrRanges() const
        { AddrRangeList ranges; return ranges; }

+        void addToRetryList();
+
      private:
        bool isPhysMemAddress(Addr addr) const;
    };
@ -164,6 +168,7 @@ class RubyPort : public MemObject
    DrainState drain() override;

  protected:
+    void trySendRetries();
    void ruby_hit_callback(PacketPtr pkt);
    void testDrainComplete();
    void ruby_eviction_callback(Addr address);
@ -186,10 +191,14 @@ class RubyPort : public MemObject
    System* system;

  private:
+    // Returns true when port is already present in retryList (linear
+    // search via std::find).
+    bool onRetryList(MemSlavePort * port)
+    {
+        return (std::find(retryList.begin(), retryList.end(), port) !=
+                retryList.end());
+    }
    void addToRetryList(MemSlavePort * port)
    {
+        // Append port to retryList unless it is already present, so a port
+        // appears on the list at most once.
-        if (std::find(retryList.begin(), retryList.end(), port) !=
-               retryList.end()) return;
+        if (onRetryList(port)) return;
        retryList.push_back(port);
    }

--- a/src/mem/ruby/system/Sequencer.cc
+++ b/src/mem/ruby/system/Sequencer.cc
@ -491,6 +491,7 @@ Sequencer::hitCallback(SequencerRequest* srequest, DataBlock& data,
        rs->m_cache_recorder->enqueueNextFlushRequest();
    } else {
        ruby_hit_callback(pkt);
+        testDrainComplete();
    }
 }

--- a/src/mem/ruby/system/Sequencer.py
+++ b/src/mem/ruby/system/Sequencer.py
@ -45,6 +45,7 @@ class RubyPort(MemObject):
    mem_slave_port = SlavePort("Ruby memory port")

    using_ruby_tester = Param.Bool(False, "")
+    no_retry_on_stall = Param.Bool(False, "")
    ruby_system = Param.RubySystem(Parent.any, "")
    system = Param.System(Parent.any, "system object")
    support_data_reqs = Param.Bool(True, "data cache requests supported")