mem: Move crossbar default latencies to subclasses

This patch introduces a few subclasses to the CoherentXBar and
NoncoherentXBar to distinguish the different uses in the system. We
use the crossbar in a wide range of places: interfacing cores to the
L2, as a system interconnect, connecting I/O and peripherals,
etc. Needless to say, these crossbars have very different performance,
and the clock frequency alone is not enough to distinguish these
scenarios.

Instead of trying to capture every possible case, this patch
introduces dedicated subclasses for the three primary use-cases:
L2XBar, SystemXBar and IOXbar. More can be added if needed, and the
defaults can be overridden.
This commit is contained in:
Andreas Hansson 2015-03-02 04:00:47 -05:00
parent d35dd71ab4
commit 36dc93a5fa
20 changed files with 84 additions and 48 deletions

View file

@ -65,14 +65,12 @@ def config_cache(options, system):
if options.l2cache:
# Provide a clock for the L2 and the L1-to-L2 bus here as they
# are not connected using addTwoLevelCacheHierarchy. Use the
# same clock as the CPUs, and set the L1-to-L2 bus width to 32
# bytes (256 bits).
# same clock as the CPUs.
system.l2 = l2_cache_class(clk_domain=system.cpu_clk_domain,
size=options.l2_size,
assoc=options.l2_assoc)
system.tol2bus = CoherentXBar(clk_domain = system.cpu_clk_domain,
width = 32)
system.tol2bus = L2XBar(clk_domain = system.cpu_clk_domain)
system.l2.cpu_side = system.tol2bus.master
system.l2.mem_side = system.membus.slave

View file

@ -50,7 +50,7 @@ class CowIdeDisk(IdeDisk):
def childImage(self, ci):
self.image.child.image_file = ci
class MemBus(CoherentXBar):
class MemBus(SystemXBar):
badaddr_responder = BadAddr()
default = Self.badaddr_responder.pio
@ -78,7 +78,7 @@ def makeLinuxAlphaSystem(mem_mode, mdesc=None, ruby=False, cmdline=None):
self.tsunami = BaseTsunami()
# Create the io bus to connect all device ports
self.iobus = NoncoherentXBar()
self.iobus = IOXBar()
self.tsunami.attachIO(self.iobus)
self.tsunami.ide.pio = self.iobus.master
@ -143,7 +143,7 @@ def makeSparcSystem(mem_mode, mdesc=None):
# generic system
mdesc = SysConfig()
self.readfile = mdesc.script()
self.iobus = NoncoherentXBar()
self.iobus = IOXBar()
self.membus = MemBus()
self.bridge = Bridge(delay='50ns')
self.t1000 = T1000()
@ -205,7 +205,7 @@ def makeArmSystem(mem_mode, machine_type, num_cpus=1, mdesc=None,
mdesc = SysConfig()
self.readfile = mdesc.script()
self.iobus = NoncoherentXBar()
self.iobus = IOXBar()
self.membus = MemBus()
self.membus.badaddr_responder.warn_access = "warn"
self.bridge = Bridge(delay='50ns')
@ -311,7 +311,7 @@ def makeLinuxMipsSystem(mem_mode, mdesc=None, cmdline=None):
# generic system
mdesc = SysConfig()
self.readfile = mdesc.script()
self.iobus = NoncoherentXBar()
self.iobus = IOXBar()
self.membus = MemBus()
self.bridge = Bridge(delay='50ns')
self.mem_ranges = [AddrRange('1GB')]
@ -358,7 +358,7 @@ def connectX86ClassicSystem(x86_sys, numCPUs):
x86_sys.membus = MemBus()
# North Bridge
x86_sys.iobus = NoncoherentXBar()
x86_sys.iobus = IOXBar()
x86_sys.bridge = Bridge(delay='50ns')
x86_sys.bridge.master = x86_sys.iobus.slave
x86_sys.bridge.slave = x86_sys.membus.master
@ -394,7 +394,7 @@ def connectX86ClassicSystem(x86_sys, numCPUs):
def connectX86RubySystem(x86_sys):
# North Bridge
x86_sys.iobus = NoncoherentXBar()
x86_sys.iobus = IOXBar()
# add the ide to the list of dma devices that later need to attach to
# dma controllers

View file

@ -84,7 +84,7 @@ if args:
# start with the system itself, using a multi-layer 1.5 GHz
# crossbar, delivering 64 bytes / 5 cycles (one header cycle)
# which amounts to 19.2 GByte/s per layer and thus per port
system = System(membus = NoncoherentXBar(width = 16))
system = System(membus = IOXBar(width = 16))
system.clk_domain = SrcClockDomain(clock = '1.5GHz',
voltage_domain =
VoltageDomain(voltage = '1V'))

View file

@ -243,7 +243,7 @@ def make_cache_level(ncaches, prototypes, level, next_cache):
if level != 0:
# Create a crossbar and add it to the subsystem, note that
# we do this even with a single element on this level
xbar = CoherentXBar(width = 32)
xbar = L2XBar(width = 32)
subsys.xbar = xbar
if next_cache:
xbar.master = next_cache.cpu_side
@ -269,7 +269,7 @@ def make_cache_level(ncaches, prototypes, level, next_cache):
if ntesters > 1:
# Create a crossbar and add it to the subsystem
xbar = CoherentXBar(width = 32)
xbar = L2XBar(width = 32)
subsys.xbar = xbar
xbar.master = next_cache.cpu_side
for tester, checker in zip(testers, checkers):

View file

@ -233,7 +233,7 @@ def make_cache_level(ncaches, prototypes, level, next_cache):
if level != 0:
# Create a crossbar and add it to the subsystem, note that
# we do this even with a single element on this level
xbar = CoherentXBar(width = 32)
xbar = L2XBar()
subsys.xbar = xbar
if next_cache:
xbar.master = next_cache.cpu_side
@ -258,7 +258,7 @@ def make_cache_level(ncaches, prototypes, level, next_cache):
if ntesters > 1:
# Create a crossbar and add it to the subsystem
xbar = CoherentXBar(width = 32)
xbar = L2XBar()
subsys.xbar = xbar
xbar.master = next_cache.cpu_side
for tester in testers:

View file

@ -106,7 +106,7 @@ cpus = [ MemTest(atomic = False,
system = System(cpu = cpus,
funcmem = SimpleMemory(in_addr_map = False),
funcbus = NoncoherentXBar(),
funcbus = IOXBar(),
clk_domain = SrcClockDomain(clock = options.sys_clock),
mem_ranges = [AddrRange(options.mem_size)])

View file

@ -265,7 +265,7 @@ if options.ruby:
system.cpu[i].dtb.walker.port = ruby_port.slave
else:
MemClass = Simulation.setMemClass(options)
system.membus = CoherentXBar()
system.membus = SystemXBar()
system.system_port = system.membus.slave
CacheConfig.config_cache(options, system)
MemConfig.config_mem(options, system)

View file

@ -116,7 +116,7 @@ def setup_memory_controllers(system, ruby, dir_cntrls, options):
crossbar = None
if len(system.mem_ranges) > 1:
crossbar = NoncoherentXBar()
crossbar = IOXBar()
crossbars.append(crossbar)
dir_cntrl.memory = crossbar.slave

View file

@ -171,7 +171,7 @@ if options.timing:
for j in xrange(options.numclusters):
clusters[j].id = j
for cluster in clusters:
cluster.clusterbus = CoherentXBar(clock=busFrequency)
cluster.clusterbus = L2XBar(clock=busFrequency)
all_l1buses += [cluster.clusterbus]
cluster.cpus = [TimingSimpleCPU(cpu_id = i + cluster.id,
clock=options.frequency)
@ -184,7 +184,7 @@ elif options.detailed:
for j in xrange(options.numclusters):
clusters[j].id = j
for cluster in clusters:
cluster.clusterbus = CoherentXBar(clock=busFrequency)
cluster.clusterbus = L2XBar(clock=busFrequency)
all_l1buses += [cluster.clusterbus]
cluster.cpus = [DerivO3CPU(cpu_id = i + cluster.id,
clock=options.frequency)
@ -197,7 +197,7 @@ else:
for j in xrange(options.numclusters):
clusters[j].id = j
for cluster in clusters:
cluster.clusterbus = CoherentXBar(clock=busFrequency)
cluster.clusterbus = L2XBar(clock=busFrequency)
all_l1buses += [cluster.clusterbus]
cluster.cpus = [AtomicSimpleCPU(cpu_id = i + cluster.id,
clock=options.frequency)
@ -211,10 +211,10 @@ else:
# ----------------------
system = System(cpu = all_cpus, l1_ = all_l1s, l1bus_ = all_l1buses,
physmem = SimpleMemory(),
membus = CoherentXBar(clock = busFrequency))
membus = SystemXBar(clock = busFrequency))
system.clock = '1GHz'
system.toL2bus = CoherentXBar(clock = busFrequency)
system.toL2bus = L2XBar(clock = busFrequency)
system.l2 = L2(size = options.l2size, assoc = 8)
# ----------------------

View file

@ -196,10 +196,10 @@ else:
# Create a system, and add system wide objects
# ----------------------
system = System(cpu = cpus, physmem = SimpleMemory(),
membus = CoherentXBar(clock = busFrequency))
membus = SystemXBar(clock = busFrequency))
system.clock = '1GHz'
system.toL2bus = CoherentXBar(clock = busFrequency)
system.toL2bus = L2XBar(clock = busFrequency)
system.l2 = L2(size = options.l2size, assoc = 8)
# ----------------------

View file

@ -47,7 +47,7 @@ from m5.defines import buildEnv
from m5.params import *
from m5.proxy import *
from XBar import CoherentXBar
from XBar import L2XBar
from InstTracer import InstTracer
from CPUTracers import ExeTracer
from MemObject import MemObject
@ -285,10 +285,7 @@ class BaseCPU(MemObject):
def addTwoLevelCacheHierarchy(self, ic, dc, l2c, iwc = None, dwc = None):
self.addPrivateSplitL1Caches(ic, dc, iwc, dwc)
# Set a width of 32 bytes (256-bits), which is four times that
# of the default bus. The clock of the CPU is inherited by
# default.
self.toL2Bus = CoherentXBar(width = 32)
self.toL2Bus = L2XBar()
self.connectCachedPorts(self.toL2Bus)
self.l2cache = l2c
self.toL2Bus.master = self.l2cache.cpu_side

View file

@ -66,12 +66,12 @@ class BaseXBar(MemObject):
# is the latency involved once a decision is made to forward the
# request. The response latency, is similar to the forward
# latency, but for responses rather than requests.
frontend_latency = Param.Cycles(3, "Frontend latency")
forward_latency = Param.Cycles(4, "Forward latency")
response_latency = Param.Cycles(2, "Response latency")
frontend_latency = Param.Cycles("Frontend latency")
forward_latency = Param.Cycles("Forward latency")
response_latency = Param.Cycles("Response latency")
# Width governing the throughput of the crossbar
width = Param.Unsigned(8, "Datapath width per port (bytes)")
width = Param.Unsigned("Datapath width per port (bytes)")
# The default port can be left unconnected, or be used to connect
# a default slave port
@ -95,7 +95,7 @@ class CoherentXBar(BaseXBar):
# The coherent crossbar additionally has snoop responses that are
# forwarded after a specific latency.
snoop_response_latency = Param.Cycles(4, "Snoop response latency")
snoop_response_latency = Param.Cycles("Snoop response latency")
# An optional snoop filter
snoop_filter = Param.SnoopFilter(NULL, "Selected snoop filter")
@ -111,3 +111,44 @@ class SnoopFilter(SimObject):
lookup_latency = Param.Cycles(1, "Lookup latency")
system = Param.System(Parent.any, "System that the crossbar belongs to.")
# We use a coherent crossbar to connect multiple masters to the L2
# caches. Normally this crossbar would be part of the cache itself.
class L2XBar(CoherentXBar):
# 256-bit crossbar by default
width = 32
# Assume that most of this is covered by the cache latencies, with
# no more than a single pipeline stage for any packet.
frontend_latency = 1
forward_latency = 0
response_latency = 1
snoop_response_latency = 1
# One of the key coherent crossbar instances is the system
# interconnect, tying together the CPU clusters, GPUs, and any I/O
# coherent masters, and DRAM controllers.
class SystemXBar(CoherentXBar):
# 128-bit crossbar by default
width = 16
# A handful pipeline stages for each portion of the latency
# contributions.
frontend_latency = 3
forward_latency = 4
response_latency = 2
snoop_response_latency = 4
# In addition to the system interconnect, we typically also have one
# or more on-chip I/O crossbars. Note that at some point we might want
# to also define an off-chip I/O crossbar such as PCIe.
class IOXBar(NoncoherentXBar):
# 128-bit crossbar by default
width = 16
# Assume a simpler datapath than a coherent crossbar, incuring
# less pipeline stages for decision making and forwarding of
# requests.
frontend_latency = 2
forward_latency = 1
response_latency = 2

View file

@ -104,7 +104,7 @@ class BaseSystem(object):
Returns:
A bus that CPUs should use to connect to the shared cache.
"""
system.toL2Bus = CoherentXBar(clk_domain=system.cpu_clk_domain)
system.toL2Bus = L2XBar(clk_domain=system.cpu_clk_domain)
system.l2c = L2Cache(clk_domain=system.cpu_clk_domain,
size='4MB', assoc=8)
system.l2c.cpu_side = system.toL2Bus.master
@ -186,7 +186,7 @@ class BaseSESystem(BaseSystem):
def create_system(self):
system = System(physmem = self.mem_class(),
membus = CoherentXBar(),
membus = SystemXBar(),
mem_mode = self.mem_mode)
system.system_port = system.membus.slave
system.physmem.port = system.membus.master

View file

@ -38,7 +38,7 @@ cpus = [ MemTest() for i in xrange(nb_cores) ]
# system simulated
system = System(cpu = cpus,
physmem = SimpleMemory(),
membus = CoherentXBar(width=16, snoop_filter = SnoopFilter()))
membus = SystemXBar(width=16, snoop_filter = SnoopFilter()))
# Dummy voltage domain for all our clock domains
system.voltage_domain = VoltageDomain()
system.clk_domain = SrcClockDomain(clock = '1GHz',
@ -49,8 +49,8 @@ system.clk_domain = SrcClockDomain(clock = '1GHz',
system.cpu_clk_domain = SrcClockDomain(clock = '2GHz',
voltage_domain = system.voltage_domain)
system.toL2Bus = CoherentXBar(clk_domain = system.cpu_clk_domain, width=16,
snoop_filter = SnoopFilter())
system.toL2Bus = L2XBar(clk_domain = system.cpu_clk_domain,
snoop_filter = SnoopFilter())
system.l2c = L2Cache(clk_domain = system.cpu_clk_domain, size='64kB', assoc=8)
system.l2c.cpu_side = system.toL2Bus.master

View file

@ -38,7 +38,7 @@ cpus = [ MemTest() for i in xrange(nb_cores) ]
# system simulated
system = System(cpu = cpus,
physmem = SimpleMemory(),
membus = CoherentXBar(width=16))
membus = SystemXBar())
# Dummy voltage domain for all our clock domains
system.voltage_domain = VoltageDomain()
system.clk_domain = SrcClockDomain(clock = '1GHz',
@ -49,7 +49,7 @@ system.clk_domain = SrcClockDomain(clock = '1GHz',
system.cpu_clk_domain = SrcClockDomain(clock = '2GHz',
voltage_domain = system.voltage_domain)
system.toL2Bus = CoherentXBar(clk_domain = system.cpu_clk_domain, width=16)
system.toL2Bus = L2XBar(clk_domain = system.cpu_clk_domain)
system.l2c = L2Cache(clk_domain = system.cpu_clk_domain, size='64kB', assoc=8)
system.l2c.cpu_side = system.toL2Bus.master

View file

@ -38,7 +38,7 @@ import ruby_config
ruby_memory = ruby_config.generate("TwoLevel_SplitL1UnifiedL2.rb", nb_cores)
# system simulated
system = System(cpu = cpus, physmem = ruby_memory, membus = CoherentXBar(),
system = System(cpu = cpus, physmem = ruby_memory, membus = SystemXBar(),
mem_mode = "timing",
clk_domain = SrcClockDomain(clock = '1GHz'))

View file

@ -39,7 +39,7 @@ cpu = DerivO3CPU(cpu_id=0)
system = System(cpu = cpu,
physmem = ruby_memory,
membus = CoherentXBar(),
membus = SystemXBar(),
mem_mode = "timing",
clk_domain = SrcClockDomain(clock = '1GHz'))

View file

@ -38,7 +38,7 @@ import ruby_config
ruby_memory = ruby_config.generate("TwoLevel_SplitL1UnifiedL2.rb", nb_cores)
# system simulated
system = System(cpu = cpus, physmem = ruby_memory, membus = CoherentXBar(),
system = System(cpu = cpus, physmem = ruby_memory, membus = SystemXBar(),
clk_domain = SrcClockDomain(clock = '1GHz'))
# Create a seperate clock domain for components that should run at

View file

@ -49,7 +49,7 @@ cpu = TrafficGen(config_file = "tests/quick/se/70.tgen/tgen-dram-ctrl.cfg")
# system simulated
system = System(cpu = cpu, physmem = DDR3_1600_x64(),
membus = NoncoherentXBar(width = 16),
membus = IOXBar(width = 16),
clk_domain = SrcClockDomain(clock = '1GHz',
voltage_domain =
VoltageDomain()))

View file

@ -49,7 +49,7 @@ cpu = TrafficGen(config_file = "tests/quick/se/70.tgen/tgen-simple-mem.cfg")
# system simulated
system = System(cpu = cpu, physmem = SimpleMemory(),
membus = NoncoherentXBar(width = 16),
membus = IOXBar(width = 16),
clk_domain = SrcClockDomain(clock = '1GHz',
voltage_domain =
VoltageDomain()))