gpu-compute: AMD's baseline GPU model
parent 28e353e040
commit 1a7d3f9fcb

191 changed files with 95286 additions and 92 deletions
SConstruct (41 lines changed)

@@ -1065,7 +1065,9 @@ main = conf.Finish()
 # Define the universe of supported ISAs
 all_isa_list = [ ]
+all_gpu_isa_list = [ ]
 Export('all_isa_list')
+Export('all_gpu_isa_list')

 class CpuModel(object):
     '''The CpuModel class encapsulates everything the ISA parser needs to

@@ -1121,9 +1123,11 @@ for bdir in [ base_dir ] + extras_dir_list:
         SConscript(joinpath(root, 'SConsopts'))

 all_isa_list.sort()
+all_gpu_isa_list.sort()

 sticky_vars.AddVariables(
     EnumVariable('TARGET_ISA', 'Target ISA', 'alpha', all_isa_list),
+    EnumVariable('TARGET_GPU_ISA', 'Target GPU ISA', 'hsail', all_gpu_isa_list),
     ListVariable('CPU_MODELS', 'CPU models',
                  sorted(n for n,m in CpuModel.dict.iteritems() if m.default),
                  sorted(CpuModel.dict.keys())),

@@ -1139,6 +1143,7 @@ sticky_vars.AddVariables(
     BoolVariable('USE_FENV', 'Use <fenv.h> IEEE mode control', have_fenv),
     BoolVariable('CP_ANNOTATE', 'Enable critical path annotation capability', False),
     BoolVariable('USE_KVM', 'Enable hardware virtualized (KVM) CPU models', have_kvm),
+    BoolVariable('BUILD_GPU', 'Build the compute-GPU model', False),
     EnumVariable('PROTOCOL', 'Coherence protocol for Ruby', 'None',
                  all_protocols),
     EnumVariable('BACKTRACE_IMPL', 'Post-mortem dump implementation',

@@ -1146,9 +1151,9 @@ sticky_vars.AddVariables(
     )

 # These variables get exported to #defines in config/*.hh (see src/SConscript).
-export_vars += ['USE_FENV', 'SS_COMPATIBLE_FP', 'TARGET_ISA', 'CP_ANNOTATE',
-                'USE_POSIX_CLOCK', 'USE_KVM', 'PROTOCOL', 'HAVE_PROTOBUF',
-                'HAVE_PERF_ATTR_EXCLUDE_HOST']
+export_vars += ['USE_FENV', 'SS_COMPATIBLE_FP', 'TARGET_ISA', 'TARGET_GPU_ISA',
+                'CP_ANNOTATE', 'USE_POSIX_CLOCK', 'USE_KVM', 'PROTOCOL',
+                'HAVE_PROTOBUF', 'HAVE_PERF_ATTR_EXCLUDE_HOST']

 ###################################################
 #

@@ -1226,6 +1231,7 @@ main.SConscript('ext/nomali/SConscript',
 ###################################################

 main['ALL_ISA_LIST'] = all_isa_list
+main['ALL_GPU_ISA_LIST'] = all_gpu_isa_list
 all_isa_deps = {}
 def make_switching_dir(dname, switch_headers, env):
     # Generate the header.  target[0] is the full path of the output

@@ -1258,6 +1264,35 @@ def make_switching_dir(dname, switch_headers, env):

 Export('make_switching_dir')

+def make_gpu_switching_dir(dname, switch_headers, env):
+    # Generate the header.  target[0] is the full path of the output
+    # header to generate.  'source' is a dummy variable, since we get the
+    # list of ISAs from env['ALL_GPU_ISA_LIST'].
+    def gen_switch_hdr(target, source, env):
+        fname = str(target[0])
+
+        isa = env['TARGET_GPU_ISA'].lower()
+
+        try:
+            f = open(fname, 'w')
+            print >>f, '#include "%s/%s/%s"' % (dname, isa, basename(fname))
+            f.close()
+        except IOError:
+            print "Failed to create %s" % fname
+            raise
+
+    # Build SCons Action object. 'varlist' specifies env vars that this
+    # action depends on; when env['ALL_GPU_ISA_LIST'] changes these actions
+    # should get re-executed.
+    switch_hdr_action = MakeAction(gen_switch_hdr,
+                          Transform("GENERATE"), varlist=['ALL_GPU_ISA_LIST'])
+
+    # Instantiate actions for each header
+    for hdr in switch_headers:
+        env.Command(hdr, [], switch_hdr_action)
+
+Export('make_gpu_switching_dir')
+
 # all-isas -> all-deps -> all-environs -> all_targets
 main.Alias('#all-isas', [])
 main.Alias('#all-deps', '#all-isas')
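
For illustration, a minimal standalone sketch of what gen_switch_hdr emits; the directory, ISA, and header names below are made-up stand-ins, not values from this commit:

# Standalone sketch of the switch-header generation in make_gpu_switching_dir.
import os

def gen_switch_hdr(fname, dname, gpu_isa):
    # write a one-line #include redirecting to the ISA-specific header
    with open(fname, 'w') as f:
        f.write('#include "%s/%s/%s"\n'
                % (dname, gpu_isa.lower(), os.path.basename(fname)))

gen_switch_hdr('decoder.hh', 'gpu-decoder', 'HSAIL')
print(open('decoder.hh').read())  # -> #include "gpu-decoder/hsail/decoder.hh"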

build_opts/HSAIL_X86 (new file, 5 lines)

@@ -0,0 +1,5 @@
PROTOCOL = 'GPU_RfO'
TARGET_ISA = 'x86'
TARGET_GPU_ISA = 'hsail'
BUILD_GPU = True
CPU_MODELS = 'AtomicSimpleCPU,O3CPU,TimingSimpleCPU'
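
A build_opts file is a plain Python fragment of variable assignments that SConstruct reads to seed the sticky variables defined above. A rough sketch of the mechanism (simplified; the real logic lives in SConstruct):

# Rough sketch: reading a build_opts file amounts to exec'ing it into a
# namespace (Python 2, like the rest of this commit's scripts).
opts = {}
execfile('build_opts/HSAIL_X86', opts)
print(opts['TARGET_GPU_ISA'])   # -> hsail
print(opts['PROTOCOL'])         # -> GPU_RfO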

build_opts/X86_MOESI_AMD_Base (new file, 3 lines)

@@ -0,0 +1,3 @@
PROTOCOL = 'MOESI_AMD_Base'
TARGET_ISA = 'x86'
CPU_MODELS = 'AtomicSimpleCPU,O3CPU,TimingSimpleCPU'

configs/common/GPUTLBConfig.py (new file, 203 lines)

@@ -0,0 +1,203 @@
#
# Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Lisa Hsu
#

# Configure the TLB hierarchy.
# Places that would probably need to be modified if you want a
# different hierarchy are marked by a "< Modify here >" comment.
import sys

import m5
from m5.objects import *

def TLB_constructor(level):

    constructor_call = "X86GPUTLB(size = options.L%(level)dTLBentries, \
            assoc = options.L%(level)dTLBassoc, \
            hitLatency = options.L%(level)dAccessLatency,\
            missLatency2 = options.L%(level)dMissLatency,\
            maxOutstandingReqs = options.L%(level)dMaxOutstandingReqs,\
            accessDistance = options.L%(level)dAccessDistanceStat,\
            clk_domain = SrcClockDomain(\
                clock = options.GPUClock,\
                voltage_domain = VoltageDomain(\
                    voltage = options.gpu_voltage)))" % locals()
    return constructor_call

def Coalescer_constructor(level):

    constructor_call = "TLBCoalescer(probesPerCycle = \
            options.L%(level)dProbesPerCycle, \
            coalescingWindow = options.L%(level)dCoalescingWindow,\
            disableCoalescing = options.L%(level)dDisableCoalescing,\
            clk_domain = SrcClockDomain(\
                clock = options.GPUClock,\
                voltage_domain = VoltageDomain(\
                    voltage = options.gpu_voltage)))" % locals()
    return constructor_call

def create_TLB_Coalescer(options, my_level, my_index, TLB_name, Coalescer_name):
    # arguments: options, TLB level, number of private structures for this
    # level, TLB name and Coalescer name
    for i in xrange(my_index):
        TLB_name.append(eval(TLB_constructor(my_level)))
        Coalescer_name.append(eval(Coalescer_constructor(my_level)))

def config_tlb_hierarchy(options, system, shader_idx):
    n_cu = options.num_compute_units
    # Make this configurable now, instead of a hard-coded value. The
    # dispatcher is always the last item in the system.cpu list.
    dispatcher_idx = len(system.cpu) - 1

    if options.TLB_config == "perLane":
        num_TLBs = 64 * n_cu
    elif options.TLB_config == "mono":
        num_TLBs = 1
    elif options.TLB_config == "perCU":
        num_TLBs = n_cu
    elif options.TLB_config == "2CU":
        num_TLBs = n_cu >> 1
    else:
        print "Bad option for TLB Configuration."
        sys.exit(1)
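    # Worked example of the branch above, for an 8-CU GPU:
    #   perLane -> 64 * 8 = 512 TLBs (one per lane of a 64-lane wavefront)
    #   mono    -> 1 shared TLB
    #   perCU   -> 8 TLBs (one per compute unit)
    #   2CU     -> 8 >> 1 = 4 TLBs (one per pair of compute units)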
    #--------------------------------------------------------------------------
    # A visual representation of the TLB hierarchy, for ease of configuration
    # < Modify here the width and the number of levels if you want a
    #   different configuration >
    # width is the number of TLBs of the given type (i.e., D-TLB, I-TLB
    # etc.) for this level
    L1 = [{'name': 'sqc', 'width': options.num_sqc, 'TLBarray': [], 'CoalescerArray': []},
          {'name': 'dispatcher', 'width': 1, 'TLBarray': [], 'CoalescerArray': []},
          {'name': 'l1', 'width': num_TLBs, 'TLBarray': [], 'CoalescerArray': []}]

    L2 = [{'name': 'l2', 'width': 1, 'TLBarray': [], 'CoalescerArray': []}]
    L3 = [{'name': 'l3', 'width': 1, 'TLBarray': [], 'CoalescerArray': []}]

    TLB_hierarchy = [L1, L2, L3]

    #--------------------------------------------------------------------------
    # Create the hierarchy
    # Call the appropriate constructors and add objects to the system

    for i in xrange(len(TLB_hierarchy)):
        hierarchy_level = TLB_hierarchy[i]
        level = i+1
        for TLB_type in hierarchy_level:
            TLB_index = TLB_type['width']
            TLB_array = TLB_type['TLBarray']
            Coalescer_array = TLB_type['CoalescerArray']
            # If the sim calls for a fixed L1 TLB size across CUs,
            # override the TLB entries option
            if options.tot_L1TLB_size:
                options.L1TLBentries = options.tot_L1TLB_size / num_TLBs
                if options.L1TLBassoc > options.L1TLBentries:
                    options.L1TLBassoc = options.L1TLBentries
            # call the constructors for the TLB and the Coalescer
            create_TLB_Coalescer(options, level, TLB_index,\
                                TLB_array, Coalescer_array)

            system_TLB_name = TLB_type['name'] + '_tlb'
            system_Coalescer_name = TLB_type['name'] + '_coalescer'

            # add the different TLB levels to the system
            # Modify here if you want to make the TLB hierarchy a child of
            # the shader.
            exec('system.%s = TLB_array' % system_TLB_name)
            exec('system.%s = Coalescer_array' % system_Coalescer_name)

    #===========================================================
    # Specify the TLB hierarchy (i.e., port connections)
    # All TLBs but the last-level TLB need to have a memSidePort (master)
    #===========================================================

    # Each TLB is connected with its Coalescer through a single port.
    # There is a one-to-one mapping of TLBs to Coalescers at a given level.
    # This won't be modified no matter what the hierarchy looks like.
    for i in xrange(len(TLB_hierarchy)):
        hierarchy_level = TLB_hierarchy[i]
        level = i+1
        for TLB_type in hierarchy_level:
            name = TLB_type['name']
            for index in range(TLB_type['width']):
                exec('system.%s_coalescer[%d].master[0] = \
                        system.%s_tlb[%d].slave[0]' % \
                        (name, index, name, index))

    # Connect the cpuSidePort (slave) of all the coalescers in level 1
    # < Modify here if you want a different configuration >
    for TLB_type in L1:
        name = TLB_type['name']
        num_TLBs = TLB_type['width']
        if name == 'l1':               # L1 D-TLBs
            tlb_per_cu = num_TLBs / n_cu
            for cu_idx in range(n_cu):
                if tlb_per_cu:
                    for tlb in range(tlb_per_cu):
                        exec('system.cpu[%d].CUs[%d].translation_port[%d] = \
                                system.l1_coalescer[%d].slave[%d]' % \
                                (shader_idx, cu_idx, tlb,
                                 cu_idx*tlb_per_cu+tlb, 0))
                else:
                    exec('system.cpu[%d].CUs[%d].translation_port[%d] = \
                            system.l1_coalescer[%d].slave[%d]' % \
                            (shader_idx, cu_idx, tlb_per_cu,
                             cu_idx / (n_cu / num_TLBs),
                             cu_idx % (n_cu / num_TLBs)))
        elif name == 'dispatcher':     # Dispatcher TLB
            for index in range(TLB_type['width']):
                exec('system.cpu[%d].translation_port = \
                        system.dispatcher_coalescer[%d].slave[0]' % \
                        (dispatcher_idx, index))
        elif name == 'sqc':            # I-TLB
            for index in range(n_cu):
                sqc_tlb_index = index / options.cu_per_sqc
                sqc_tlb_port_id = index % options.cu_per_sqc
                exec('system.cpu[%d].CUs[%d].sqc_tlb_port = \
                        system.sqc_coalescer[%d].slave[%d]' % \
                        (shader_idx, index, sqc_tlb_index, sqc_tlb_port_id))

    # Connect the memSidePorts (masters) of all the TLBs with the
    # cpuSidePorts (slaves) of the Coalescers of the next level
    # < Modify here if you want a different configuration >
    # L1 <-> L2
    l2_coalescer_index = 0
    for TLB_type in L1:
        name = TLB_type['name']
        for index in range(TLB_type['width']):
            exec('system.%s_tlb[%d].master[0] = \
                    system.l2_coalescer[0].slave[%d]' % \
                    (name, index, l2_coalescer_index))
            l2_coalescer_index += 1
    # L2 <-> L3
    system.l2_tlb[0].master[0] = system.l3_coalescer[0].slave[0]

    return system
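
The TLB_constructor and Coalescer_constructor helpers above build a constructor call as a string and eval it so that the option names can be parameterized by TLB level. A self-contained sketch of the same pattern, using a dummy class rather than the real X86GPUTLB:

# Standalone sketch of the string-template + eval pattern used above.
class DummyTLB(object):                      # stand-in, not a gem5 class
    def __init__(self, size, assoc):
        self.size, self.assoc = size, assoc

class Opts(object):                          # stand-in options object
    L1TLBentries, L1TLBassoc = 32, 32
    L2TLBentries, L2TLBassoc = 4096, 32

def tlb_constructor(level):
    # %(level)d picks the per-level option names at run time
    return "DummyTLB(size = options.L%(level)dTLBentries, " \
           "assoc = options.L%(level)dTLBassoc)" % locals()

options = Opts()
l2 = eval(tlb_constructor(2))
print(l2.size)   # -> 4096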

configs/common/GPUTLBOptions.py (new file, 109 lines)

@@ -0,0 +1,109 @@
#
# Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Myrto Papadopoulou
#

def tlb_options(parser):

    #===================================================================
    # TLB Configuration
    #===================================================================

    parser.add_option("--TLB-config", type="string", default="perCU",
            help="Options are: perCU (default), mono, 2CU, or perLane")

    #===================================================================
    # L1 TLB Options (D-TLB, I-TLB, Dispatcher-TLB)
    #===================================================================

    parser.add_option("--L1TLBentries", type='int', default="32")
    parser.add_option("--L1TLBassoc", type='int', default="32")
    parser.add_option("--L1AccessLatency", type='int', default="1",
                      help="latency in gpu cycles")
    parser.add_option("--L1MissLatency", type='int', default="750",
                      help="latency (in gpu cycles) of a page walk, "
                      "if this is a last-level TLB")
    parser.add_option("--L1MaxOutstandingReqs", type='int', default="64")
    parser.add_option("--L1AccessDistanceStat", action="store_true")
    parser.add_option("--tot-L1TLB-size", type="int", default="0")

    #===================================================================
    # L2 TLB Options
    #===================================================================

    parser.add_option("--L2TLBentries", type='int', default="4096")
    parser.add_option("--L2TLBassoc", type='int', default="32")
    parser.add_option("--L2AccessLatency", type='int', default="69",
                      help="latency in gpu cycles")
    parser.add_option("--L2MissLatency", type='int', default="750",
                      help="latency (in gpu cycles) of a page walk, "
                      "if this is a last-level TLB")
    parser.add_option("--L2MaxOutstandingReqs", type='int', default="64")
    parser.add_option("--L2AccessDistanceStat", action="store_true")

    #===================================================================
    # L3 TLB Options
    #===================================================================

    parser.add_option("--L3TLBentries", type='int', default="8192")
    parser.add_option("--L3TLBassoc", type='int', default="32")
    parser.add_option("--L3AccessLatency", type='int', default="150",
                      help="latency in gpu cycles")
    parser.add_option("--L3MissLatency", type='int', default="750",
                      help="latency (in gpu cycles) of a page walk")
    parser.add_option("--L3MaxOutstandingReqs", type='int', default="64")
    parser.add_option("--L3AccessDistanceStat", action="store_true")

    #===================================================================
    # L1 TLBCoalescer Options
    #===================================================================

    parser.add_option("--L1ProbesPerCycle", type='int', default="2")
    parser.add_option("--L1CoalescingWindow", type='int', default="1")
    parser.add_option("--L1DisableCoalescing", action="store_true")

    #===================================================================
    # L2 TLBCoalescer Options
    #===================================================================

    parser.add_option("--L2ProbesPerCycle", type='int', default="2")
    parser.add_option("--L2CoalescingWindow", type='int', default="1")
    parser.add_option("--L2DisableCoalescing", action="store_true")

    #===================================================================
    # L3 TLBCoalescer Options
    #===================================================================

    parser.add_option("--L3ProbesPerCycle", type='int', default="2")
    parser.add_option("--L3CoalescingWindow", type='int', default="1")
    parser.add_option("--L3DisableCoalescing", action="store_true")

configs/example/apu_se.py (new file, 499 lines)

@@ -0,0 +1,499 @@
#
# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Sooraj Puthoor
#

import optparse, os, re, sys
import math
import glob
import inspect

import m5
from m5.objects import *
from m5.defines import buildEnv
from m5.util import addToPath

addToPath('../ruby')
addToPath('../common')
addToPath('../topologies')

import Options
import Ruby
import Simulation
import GPUTLBOptions, GPUTLBConfig

########################## Script Options ########################
def setOption(parser, opt_str, value = 1):
    # check to make sure the option actually exists
    if not parser.has_option(opt_str):
        raise Exception("cannot find %s in list of possible options" % opt_str)

    opt = parser.get_option(opt_str)
    # set the value
    exec("parser.values.%s = %s" % (opt.dest, value))

def getOption(parser, opt_str):
    # check to make sure the option actually exists
    if not parser.has_option(opt_str):
        raise Exception("cannot find %s in list of possible options" % opt_str)

    opt = parser.get_option(opt_str)
    # get the value
    exec("return_value = parser.values.%s" % opt.dest)
    return return_value

# Adding script options
parser = optparse.OptionParser()
Options.addCommonOptions(parser)
Options.addSEOptions(parser)

parser.add_option("--cpu-only-mode", action="store_true", default=False,
                  help="APU mode. Used to take care of problems in "
                       "Ruby.py while running APU protocols")
parser.add_option("-k", "--kernel-files",
                  help="file(s) containing GPU kernel code (colon separated)")
parser.add_option("-u", "--num-compute-units", type="int", default=1,
                  help="number of GPU compute units")
parser.add_option("--num-cp", type="int", default=0,
                  help="Number of GPU Command Processors (CP)")
parser.add_option("--benchmark-root", help="Root of benchmark directory tree")

# not super important now, but to avoid putting the number 4 everywhere, make
# it an option/knob
parser.add_option("--cu-per-sqc", type="int", default=4, help="number of CUs "
                  "sharing an SQC (icache, and thus icache TLB)")
parser.add_option("--simds-per-cu", type="int", default=4, help="SIMD units "
                  "per CU")
parser.add_option("--wf-size", type="int", default=64,
                  help="Wavefront size (in work items)")
parser.add_option("--sp-bypass-path-length", type="int", default=4,
                  help="Number of stages of bypass path in vector ALU for "
                       "Single Precision ops")
parser.add_option("--dp-bypass-path-length", type="int", default=4,
                  help="Number of stages of bypass path in vector ALU for "
                       "Double Precision ops")
# issue period per SIMD unit: number of cycles before issuing another vector
parser.add_option("--issue-period", type="int", default=4,
                  help="Number of cycles per vector instruction issue period")
parser.add_option("--glbmem-wr-bus-width", type="int", default=32,
                  help="VGPR to Coalescer (Global Memory) data bus width "
                       "in bytes")
parser.add_option("--glbmem-rd-bus-width", type="int", default=32,
                  help="Coalescer to VGPR (Global Memory) data bus width "
                       "in bytes")
# Currently we only support 1 local memory pipe
parser.add_option("--shr-mem-pipes-per-cu", type="int", default=1,
                  help="Number of Shared Memory pipelines per CU")
# Currently we only support 1 global memory pipe
parser.add_option("--glb-mem-pipes-per-cu", type="int", default=1,
                  help="Number of Global Memory pipelines per CU")
parser.add_option("--wfs-per-simd", type="int", default=10,
                  help="Number of WF slots per SIMD")

parser.add_option("--vreg-file-size", type="int", default=2048,
                  help="number of physical vector registers per SIMD")
parser.add_option("--bw-scalor", type="int", default=0,
                  help="bandwidth scaling factor for scalability analysis")
parser.add_option("--CPUClock", type="string", default="2GHz",
                  help="CPU clock")
parser.add_option("--GPUClock", type="string", default="1GHz",
                  help="GPU clock")
parser.add_option("--cpu-voltage", action="store", type="string",
                  default='1.0V',
                  help="""CPU voltage domain""")
parser.add_option("--gpu-voltage", action="store", type="string",
                  default='1.0V',
                  help="""GPU voltage domain""")
parser.add_option("--CUExecPolicy", type="string", default="OLDEST-FIRST",
                  help="WF exec policy (OLDEST-FIRST, ROUND-ROBIN)")
parser.add_option("--xact-cas-mode", action="store_true",
                  help="enable load_compare mode (transactional CAS)")
parser.add_option("--SegFaultDebug", action="store_true",
                  help="checks for GPU seg fault before TLB access")
parser.add_option("--FunctionalTLB", action="store_true",
                  help="Assumes TLB has no latency")
parser.add_option("--LocalMemBarrier", action="store_true",
                  help="Barrier does not wait for writethroughs to complete")
parser.add_option("--countPages", action="store_true",
                  help="Count Page Accesses and output in per-CU output files")
parser.add_option("--TLB-prefetch", type="int", help="prefetch depth for "
                  "TLBs")
parser.add_option("--pf-type", type="string", help="type of prefetch: "
                  "PF_CU, PF_WF, PF_PHASE, PF_STRIDE")
parser.add_option("--pf-stride", type="int", help="set prefetch stride")
parser.add_option("--numLdsBanks", type="int", default=32,
                  help="number of physical banks per LDS module")
parser.add_option("--ldsBankConflictPenalty", type="int", default=1,
                  help="number of cycles per LDS bank conflict")

Ruby.define_options(parser)

# add TLB options to the parser
GPUTLBOptions.tlb_options(parser)

(options, args) = parser.parse_args()

# The GPU cache coherence protocols only work with the backing store
setOption(parser, "--access-backing-store")

# if benchmark root is specified explicitly, that overrides the search path
if options.benchmark_root:
    benchmark_path = [options.benchmark_root]
else:
    # Set default benchmark search path to current dir
    benchmark_path = ['.']

########################## Sanity Check ########################

# Currently the gpu model requires ruby
if buildEnv['PROTOCOL'] == 'None':
    fatal("GPU model requires ruby")

# Currently the gpu model requires only timing or detailed CPU
if not (options.cpu_type == "timing" or
        options.cpu_type == "detailed"):
    fatal("GPU model requires timing or detailed CPU")

# This file can support multiple compute units
assert(options.num_compute_units >= 1)

# Currently, the sqc (I-Cache of the GPU) is shared by multiple compute
# units (CUs). The protocol works just fine even if the sqc is not shared.
# Overriding this option here so that the user need not set it explicitly
# (assuming sharing the sqc is the common usage).
n_cu = options.num_compute_units
num_sqc = int(math.ceil(float(n_cu) / options.cu_per_sqc))
options.num_sqc = num_sqc # pass this to Ruby
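# Worked example of the rounding above: 6 CUs with the default cu_per_sqc
# of 4 need int(math.ceil(float(6) / 4)) = 2 SQCs (4 CUs share the first,
# the remaining 2 share the second).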

########################## Creating the GPU system ########################
# shader is the GPU
shader = Shader(n_wf = options.wfs_per_simd,
                clk_domain = SrcClockDomain(
                    clock = options.GPUClock,
                    voltage_domain = VoltageDomain(
                        voltage = options.gpu_voltage)))

# GPU_RfO (Read For Ownership) implements an SC/TSO memory model.
# Other GPU protocols implement release consistency on the GPU side.
# So, all GPU protocols other than GPU_RfO should make their writes
# visible to global memory and should read from global memory at
# kernel boundaries. The pipeline initiates (or does not initiate) the
# acquire/release operation depending on the impl_kern_boundary_sync
# flag. flag=true means the pipeline initiates an acquire/release
# operation at kernel boundaries.
if buildEnv['PROTOCOL'] == 'GPU_RfO':
    shader.impl_kern_boundary_sync = False
else:
    shader.impl_kern_boundary_sync = True

# Switching off per-lane TLB by default
per_lane = False
if options.TLB_config == "perLane":
    per_lane = True

# List of compute units; one GPU can have multiple compute units
compute_units = []
for i in xrange(n_cu):
    compute_units.append(ComputeUnit(cu_id = i, perLaneTLB = per_lane,
                                     num_SIMDs = options.simds_per_cu,
                                     wfSize = options.wf_size,
                                     spbypass_pipe_length = \
                                     options.sp_bypass_path_length,
                                     dpbypass_pipe_length = \
                                     options.dp_bypass_path_length,
                                     issue_period = options.issue_period,
                                     coalescer_to_vrf_bus_width = \
                                     options.glbmem_rd_bus_width,
                                     vrf_to_coalescer_bus_width = \
                                     options.glbmem_wr_bus_width,
                                     num_global_mem_pipes = \
                                     options.glb_mem_pipes_per_cu,
                                     num_shared_mem_pipes = \
                                     options.shr_mem_pipes_per_cu,
                                     n_wf = options.wfs_per_simd,
                                     execPolicy = options.CUExecPolicy,
                                     xactCasMode = options.xact_cas_mode,
                                     debugSegFault = options.SegFaultDebug,
                                     functionalTLB = options.FunctionalTLB,
                                     localMemBarrier = options.LocalMemBarrier,
                                     countPages = options.countPages,
                                     localDataStore = \
                                     LdsState(banks = options.numLdsBanks,
                                              bankConflictPenalty = \
                                              options.ldsBankConflictPenalty)))
    wavefronts = []
    vrfs = []
    for j in xrange(options.simds_per_cu):
        for k in xrange(shader.n_wf):
            wavefronts.append(Wavefront(simdId = j, wf_slot_id = k))
        vrfs.append(VectorRegisterFile(simd_id=j,
                                       num_regs_per_simd=options.vreg_file_size))
    compute_units[-1].wavefronts = wavefronts
    compute_units[-1].vector_register_file = vrfs
    if options.TLB_prefetch:
        compute_units[-1].prefetch_depth = options.TLB_prefetch
        compute_units[-1].prefetch_prev_type = options.pf_type

    # attach the LDS and the CU to the bus (actually a Bridge)
    compute_units[-1].ldsPort = compute_units[-1].ldsBus.slave
    compute_units[-1].ldsBus.master = compute_units[-1].localDataStore.cuPort

# Attach compute units to GPU
shader.CUs = compute_units
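# Worked example of the loop above with the defaults (4 SIMDs per CU,
# 10 WF slots per SIMD): each CU gets 4 * 10 = 40 Wavefront objects and
# 4 VectorRegisterFile objects.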

########################## Creating the CPU system ########################
options.num_cpus = options.num_cpus

# The shader core will be whatever is after the CPU cores are accounted for
shader_idx = options.num_cpus

# The command processor will be whatever is after the shader is accounted for
cp_idx = shader_idx + 1
cp_list = []

# List of CPUs
cpu_list = []

# We only support timing mode for shader and memory
shader.timing = True
mem_mode = 'timing'

# create the cpus
for i in range(options.num_cpus):
    cpu = None
    if options.cpu_type == "detailed":
        cpu = DerivO3CPU(cpu_id=i,
                         clk_domain = SrcClockDomain(
                             clock = options.CPUClock,
                             voltage_domain = VoltageDomain(
                                 voltage = options.cpu_voltage)))
    elif options.cpu_type == "timing":
        cpu = TimingSimpleCPU(cpu_id=i,
                              clk_domain = SrcClockDomain(
                                  clock = options.CPUClock,
                                  voltage_domain = VoltageDomain(
                                      voltage = options.cpu_voltage)))
    else:
        fatal("Atomic CPU not supported/tested")
    cpu_list.append(cpu)

# create the command processors
for i in xrange(options.num_cp):
    cp = None
    if options.cpu_type == "detailed":
        cp = DerivO3CPU(cpu_id = options.num_cpus + i,
                        clk_domain = SrcClockDomain(
                            clock = options.CPUClock,
                            voltage_domain = VoltageDomain(
                                voltage = options.cpu_voltage)))
    elif options.cpu_type == 'timing':
        cp = TimingSimpleCPU(cpu_id=options.num_cpus + i,
                             clk_domain = SrcClockDomain(
                                 clock = options.CPUClock,
                                 voltage_domain = VoltageDomain(
                                     voltage = options.cpu_voltage)))
    else:
        fatal("Atomic CPU not supported/tested")
    cp_list = cp_list + [cp]

########################## Creating the GPU dispatcher ########################
# Dispatcher dispatches work from the host CPU to the GPU
host_cpu = cpu_list[0]
dispatcher = GpuDispatcher()

######################## Create and assign the workload ########################
# Check for rel_path in elements of base_list using test, returning
# the first full path that satisfies test
def find_path(base_list, rel_path, test):
    for base in base_list:
        if not base:
            # base could be None if environment var not set
            continue
        full_path = os.path.join(base, rel_path)
        if test(full_path):
            return full_path
    fatal("%s not found in %s" % (rel_path, base_list))

def find_file(base_list, rel_path):
    return find_path(base_list, rel_path, os.path.isfile)

executable = find_path(benchmark_path, options.cmd, os.path.exists)
# it's common for a benchmark to be in a directory with the same
# name as the executable, so we handle that automatically
if os.path.isdir(executable):
    benchmark_path = [executable]
    executable = find_file(benchmark_path, options.cmd)
if options.kernel_files:
    kernel_files = [find_file(benchmark_path, f)
                    for f in options.kernel_files.split(':')]
else:
    # if kernel_files is not set, see if there's a unique .asm file
    # in the same directory as the executable
    kernel_path = os.path.dirname(executable)
    kernel_files = glob.glob(os.path.join(kernel_path, '*.asm'))
    if kernel_files:
        print "Using GPU kernel code file(s)", ",".join(kernel_files)
    else:
        fatal("Can't locate kernel code (.asm) in " + kernel_path)

# OpenCL driver
driver = ClDriver(filename="hsa", codefile=kernel_files)
for cpu in cpu_list:
    cpu.workload = LiveProcess(executable = executable,
                               cmd = [options.cmd] + options.options.split(),
                               drivers = [driver])
for cp in cp_list:
    cp.workload = host_cpu.workload

########################## Create the overall system ########################
# Full list of processing cores in the system. Note that the dispatcher
# is also added to cpu_list although it is not a processing element.
cpu_list = cpu_list + [shader] + cp_list + [dispatcher]

# creating the overall system
# notice the cpu list is explicitly added as a parameter to System
system = System(cpu = cpu_list,
                mem_ranges = [AddrRange(options.mem_size)],
                cache_line_size = options.cacheline_size,
                mem_mode = mem_mode)
system.voltage_domain = VoltageDomain(voltage = options.sys_voltage)
system.clk_domain = SrcClockDomain(clock = options.sys_clock,
                                   voltage_domain = system.voltage_domain)

# configure the TLB hierarchy
GPUTLBConfig.config_tlb_hierarchy(options, system, shader_idx)

# create Ruby system
system.piobus = IOXBar(width=32, response_latency=0,
                       frontend_latency=0, forward_latency=0)
Ruby.create_system(options, None, system)
system.ruby.clk_domain = SrcClockDomain(clock = options.ruby_clock,
                                        voltage_domain = system.voltage_domain)

# attach the CPU ports to Ruby
for i in range(options.num_cpus):
    ruby_port = system.ruby._cpu_ports[i]

    # Create interrupt controller
    system.cpu[i].createInterruptController()

    # Connect cache ports to Ruby
    system.cpu[i].icache_port = ruby_port.slave
    system.cpu[i].dcache_port = ruby_port.slave

    ruby_port.mem_master_port = system.piobus.slave
    if buildEnv['TARGET_ISA'] == "x86":
        system.cpu[i].interrupts[0].pio = system.piobus.master
        system.cpu[i].interrupts[0].int_master = system.piobus.slave
        system.cpu[i].interrupts[0].int_slave = system.piobus.master

# attach CU ports to Ruby
# Because of the peculiarities of the CP core, you may have 1 CPU but 2
# sequencers and thus 2 _cpu_ports created. Your GPUs shouldn't be
# hooked up until after the CP. To make this script generic, figure out
# the index as below, but note that this assumes there is one sequencer
# per compute unit and one sequencer per SQC for the math to work out
# correctly.
gpu_port_idx = len(system.ruby._cpu_ports) \
               - options.num_compute_units - options.num_sqc
gpu_port_idx = gpu_port_idx - options.num_cp * 2
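# Worked example of the index math above, assuming one sequencer port per
# CPU core (illustrative values): with 1 CPU, 4 CUs, 1 SQC and 0 CPs,
# len(_cpu_ports) = 1 + 4 + 1 = 6, so gpu_port_idx = 6 - 4 - 1 - 0 = 1,
# i.e. CU memory ports occupy indices 1..4 and the SQC port index 5.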

wavefront_size = options.wf_size
for i in xrange(n_cu):
    # The pipeline issues wavefront_size number of uncoalesced requests
    # in one GPU issue cycle. Hence wavefront_size mem ports.
    for j in xrange(wavefront_size):
        system.cpu[shader_idx].CUs[i].memory_port[j] = \
                  system.ruby._cpu_ports[gpu_port_idx].slave[j]
    gpu_port_idx += 1

for i in xrange(n_cu):
    if i > 0 and not i % options.cu_per_sqc:
        print "incrementing idx on ", i
        gpu_port_idx += 1
    system.cpu[shader_idx].CUs[i].sqc_port = \
            system.ruby._cpu_ports[gpu_port_idx].slave
gpu_port_idx = gpu_port_idx + 1

# attach CP ports to Ruby
for i in xrange(options.num_cp):
    system.cpu[cp_idx].createInterruptController()
    system.cpu[cp_idx].dcache_port = \
                system.ruby._cpu_ports[gpu_port_idx + i * 2].slave
    system.cpu[cp_idx].icache_port = \
                system.ruby._cpu_ports[gpu_port_idx + i * 2 + 1].slave
    system.cpu[cp_idx].interrupts[0].pio = system.piobus.master
    system.cpu[cp_idx].interrupts[0].int_master = system.piobus.slave
    system.cpu[cp_idx].interrupts[0].int_slave = system.piobus.master
    cp_idx = cp_idx + 1

# connect dispatcher to the system.piobus
dispatcher.pio = system.piobus.master
dispatcher.dma = system.piobus.slave

################# Connect the CPU and GPU via GPU Dispatcher ##################
# The CPU rings the GPU doorbell to notify a pending task using this
# interface, and the GPU uses this interface to notify the CPU of task
# completion. The communication happens through the emulated driver.

# Note: this implicit setting of the cpu_pointer, shader_pointer and tlb array
# parameters must be after the explicit setting of the System cpu list
shader.cpu_pointer = host_cpu
dispatcher.cpu = host_cpu
dispatcher.shader_pointer = shader
dispatcher.cl_driver = driver

########################## Start simulation ########################

root = Root(system=system, full_system=False)
m5.ticks.setGlobalFrequency('1THz')
if options.abs_max_tick:
    maxtick = options.abs_max_tick
else:
    maxtick = m5.MaxTick

# Benchmarks support work item annotations
Simulation.setWorkCountOptions(system, options)

# Checkpointing is not supported by the APU model
if (options.checkpoint_dir != None or
    options.checkpoint_restore != None):
    fatal("Checkpointing not supported by apu model")

checkpoint_dir = None
m5.instantiate(checkpoint_dir)

# Map workload to this address space
host_cpu.workload[0].map(0x10000000, 0x200000000, 4096)

exit_event = m5.simulate(maxtick)
print "Ticks:", m5.curTick()
print 'Exiting because ', exit_event.getCause()
sys.exit(exit_event.getCode())

configs/example/ruby_gpu_random_test.py (new file, 187 lines)

@@ -0,0 +1,187 @@
#
# Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Brad Beckmann
#

import m5
from m5.objects import *
from m5.defines import buildEnv
from m5.util import addToPath
import os, optparse, sys
addToPath('../common')
addToPath('../ruby')
addToPath('../topologies')

import Options
import Ruby

# Get paths we might need.
config_path = os.path.dirname(os.path.abspath(__file__))
config_root = os.path.dirname(config_path)
m5_root = os.path.dirname(config_root)

parser = optparse.OptionParser()
Options.addCommonOptions(parser)

parser.add_option("--maxloads", metavar="N", default=100,
                  help="Stop after N loads")
parser.add_option("-f", "--wakeup_freq", metavar="N", default=10,
                  help="Wakeup every N cycles")
parser.add_option("-u", "--num-compute-units", type="int", default=1,
                  help="number of compute units in the GPU")
parser.add_option("--numCPs", type="int", default=0,
                  help="Number of GPU Command Processors (CP)")
# not super important now, but to avoid putting the number 4 everywhere, make
# it an option/knob
parser.add_option("--cu-per-sqc", type="int", default=4, help="number of CUs "
                  "sharing an SQC (icache, and thus icache TLB)")
parser.add_option("--simds-per-cu", type="int", default=4, help="SIMD units "
                  "per CU")
parser.add_option("--wf-size", type="int", default=64,
                  help="Wavefront size (in work items)")
parser.add_option("--wfs-per-simd", type="int", default=10,
                  help="Number of WF slots per SIMD")

#
# Add the ruby specific and protocol specific options
#
Ruby.define_options(parser)

execfile(os.path.join(config_root, "common", "Options.py"))

(options, args) = parser.parse_args()

#
# Set the default cache size and associativity to be very small to encourage
# races between requests and writebacks.
#
options.l1d_size="256B"
options.l1i_size="256B"
options.l2_size="512B"
options.l3_size="1kB"
options.l1d_assoc=2
options.l1i_assoc=2
options.l2_assoc=2
options.l3_assoc=2

# This file can support multiple compute units
assert(options.num_compute_units >= 1)
n_cu = options.num_compute_units

options.num_sqc = int((n_cu + options.cu_per_sqc - 1) / options.cu_per_sqc)

if args:
    print "Error: script doesn't take any positional arguments"
    sys.exit(1)

#
# Create the ruby random tester
#

# Check for the GPU_RfO protocol. Other GPU protocols are non-SC and will
# not work with the Ruby random tester.
assert(buildEnv['PROTOCOL'] == 'GPU_RfO')

# The GPU_RfO protocol does not support cache flushes
check_flush = False

tester = RubyTester(check_flush=check_flush,
                    checks_to_complete=options.maxloads,
                    wakeup_frequency=options.wakeup_freq,
                    deadlock_threshold=1000000)

#
# Create the M5 system. Note that the Memory Object isn't
# actually used by the rubytester, but is included to support the
# M5 memory size == Ruby memory size checks
#
system = System(cpu=tester, mem_ranges=[AddrRange(options.mem_size)])

# Create a top-level voltage domain and clock domain
system.voltage_domain = VoltageDomain(voltage=options.sys_voltage)

system.clk_domain = SrcClockDomain(clock=options.sys_clock,
                                   voltage_domain=system.voltage_domain)

Ruby.create_system(options, False, system)

# Create a separate clock domain for Ruby
system.ruby.clk_domain = SrcClockDomain(clock=options.ruby_clock,
                                        voltage_domain=system.voltage_domain)

tester.num_cpus = len(system.ruby._cpu_ports)

#
# The tester is most effective when randomization is turned on and
# artificial delay is randomly inserted on messages
#
system.ruby.randomization = True

for ruby_port in system.ruby._cpu_ports:

    #
    # Tie the ruby tester ports to the ruby cpu read and write ports
    #
    if ruby_port.support_data_reqs and ruby_port.support_inst_reqs:
        tester.cpuInstDataPort = ruby_port.slave
    elif ruby_port.support_data_reqs:
        tester.cpuDataPort = ruby_port.slave
    elif ruby_port.support_inst_reqs:
        tester.cpuInstPort = ruby_port.slave

    # Do not automatically retry stalled Ruby requests
    ruby_port.no_retry_on_stall = True

    #
    # Tell each sequencer this is the ruby tester so that it
    # copies the subblock back to the checker
    #
    ruby_port.using_ruby_tester = True

# -----------------------
# run simulation
# -----------------------

root = Root( full_system = False, system = system )
root.system.mem_mode = 'timing'

# Not much point in this being higher than the L1 latency
m5.ticks.setGlobalFrequency('1ns')

# instantiate configuration
m5.instantiate()

# simulate until program terminates
exit_event = m5.simulate(options.abs_max_tick)

print 'Exiting @ tick', m5.curTick(), 'because', exit_event.getCause()
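
The SQC count here uses integer ceil-division, equivalent to the math.ceil form used by apu_se.py above:

# Worked example: integer ceil-division for the SQC count (Python 2).
n_cu, cu_per_sqc = 6, 4
print((n_cu + cu_per_sqc - 1) / cu_per_sqc)   # -> 2, same as ceil(6/4)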
134
configs/ruby/AMD_Base_Constructor.py
Normal file
134
configs/ruby/AMD_Base_Constructor.py
Normal file
|
@ -0,0 +1,134 @@
#
# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Sooraj Puthoor, Lisa Hsu
#

import math
import m5
from m5.objects import *
from m5.defines import buildEnv
from m5.util import convert
from CntrlBase import *
from Cluster import Cluster
from Ruby import send_evicts   # for send_evicts() in CPCntrl.create() below

#
# Note: the L1 Cache latency is only used by the sequencer on fast path hits
#
class L1Cache(RubyCache):
    latency = 1
    resourceStalls = False
    def create(self, size, assoc, options):
        self.size = MemorySize(size)
        self.assoc = assoc
        self.replacement_policy = PseudoLRUReplacementPolicy()

#
# Note: the L2 Cache latency is not currently used
#
class L2Cache(RubyCache):
    latency = 10
    resourceStalls = False
    def create(self, size, assoc, options):
        self.size = MemorySize(size)
        self.assoc = assoc
        self.replacement_policy = PseudoLRUReplacementPolicy()

class CPCntrl(AMD_Base_Controller, CntrlBase):

    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.cntrl_id = self.cntrlCount()

        self.L1Icache = L1Cache()
        self.L1Icache.create(options.l1i_size, options.l1i_assoc, options)
        self.L1D0cache = L1Cache()
        self.L1D0cache.create(options.l1d_size, options.l1d_assoc, options)
        self.L1D1cache = L1Cache()
        self.L1D1cache.create(options.l1d_size, options.l1d_assoc, options)
        self.L2cache = L2Cache()
        self.L2cache.create(options.l2_size, options.l2_assoc, options)

        self.sequencer = RubySequencer()
        self.sequencer.version = self.seqCount()
        self.sequencer.icache = self.L1Icache
        self.sequencer.dcache = self.L1D0cache
        self.sequencer.ruby_system = ruby_system
        self.sequencer.coreid = 0
        self.sequencer.is_cpu_sequencer = True

        self.sequencer1 = RubySequencer()
        self.sequencer1.version = self.seqCount()
        self.sequencer1.icache = self.L1Icache
        self.sequencer1.dcache = self.L1D1cache
        self.sequencer1.ruby_system = ruby_system
        self.sequencer1.coreid = 1
        self.sequencer1.is_cpu_sequencer = True

        self.issue_latency = options.cpu_to_dir_latency
        self.send_evictions = send_evicts(options)

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency
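# CPCntrl above models an AMD-style core pair: the two RubySequencers
# (one per core) share the single L1 instruction cache and the L2, while
# each core keeps a private L1 data cache (L1D0/L1D1). A minimal,
# hypothetical instantiation sketch (illustrative only; a real config
# also attaches the controller's ports to the Ruby network):
#
#   cp = CPCntrl()
#   cp.create(options, ruby_system, system)
#   cpu_sequencers.extend([cp.sequencer, cp.sequencer1])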
def define_options(parser):
    parser.add_option("--cpu-to-dir-latency", type="int", default=15)

def construct(options, system, ruby_system):
    # Note: these tests must be joined with 'and'; chained with 'or' the
    # condition is always true and the panic fires even for VIPER builds.
    if (buildEnv['PROTOCOL'] != 'GPU_VIPER' and
        buildEnv['PROTOCOL'] != 'GPU_VIPER_Region' and
        buildEnv['PROTOCOL'] != 'GPU_VIPER_Baseline'):
        panic("This script requires VIPER based protocols \
              to be built.")
    cpu_sequencers = []
    cpuCluster = Cluster(name="CPU Cluster", extBW = 8, intBW = 8) # 16 GB/s
    for i in xrange((options.num_cpus + 1) / 2):

        cp_cntrl = CPCntrl()
        cp_cntrl.create(options, ruby_system, system)

        # Connect the CP controllers to the ruby network
        cp_cntrl.requestFromCore = ruby_system.network.slave
        cp_cntrl.responseFromCore = ruby_system.network.slave
        cp_cntrl.unblockFromCore = ruby_system.network.slave
        cp_cntrl.probeToCore = ruby_system.network.master
        cp_cntrl.responseToCore = ruby_system.network.master

        exec("system.cp_cntrl%d = cp_cntrl" % i)
        #
        # Add controllers and sequencers to the appropriate lists
        #
        cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1])
        cpuCluster.add(cp_cntrl)
    return cpu_sequencers, cpuCluster
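A note on the protocol guard in construct() above: PROTOCOL holds exactly one
name, so the three inequality tests must be joined with 'and' (by De Morgan,
"not any of these three"); joined with 'or' the condition is always true. An
equivalent, arguably clearer membership-test form (a sketch using only names
from this commit):

    valid_protocols = ('GPU_VIPER', 'GPU_VIPER_Region', 'GPU_VIPER_Baseline')
    if buildEnv['PROTOCOL'] not in valid_protocols:
        panic("This script requires VIPER based protocols to be built.")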
751
configs/ruby/GPU_RfO.py
Normal file

@@ -0,0 +1,751 @@
#
# Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Lisa Hsu
#

import math
import m5
from m5.objects import *
from m5.defines import buildEnv
from Ruby import create_topology
from Ruby import send_evicts

from Cluster import Cluster
from Crossbar import Crossbar

class CntrlBase:
    _seqs = 0
    @classmethod
    def seqCount(cls):
        # Use CntrlBase._seqs, not cls, since we need a global count
        CntrlBase._seqs += 1
        return CntrlBase._seqs - 1

    _cntrls = 0
    @classmethod
    def cntrlCount(cls):
        # Use CntrlBase._cntrls, not cls, since we need a global count
        CntrlBase._cntrls += 1
        return CntrlBase._cntrls - 1

    _version = 0
    @classmethod
    def versionCount(cls):
        cls._version += 1 # Use count for this particular type
        return cls._version - 1
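# CntrlBase hands out two kinds of IDs: seqCount()/cntrlCount() bump
# counters stored on CntrlBase itself, so they are global across all
# subclasses, while versionCount() bumps cls._version, which creates an
# independent counter per subclass on first use. A standalone sketch
# (hypothetical classes, not part of this commit):
#
#   class A(CntrlBase): pass
#   class B(CntrlBase): pass
#   A.seqCount()      # -> 0  (global counter)
#   B.seqCount()      # -> 1  (same global counter)
#   A.versionCount()  # -> 0  (per-class counter)
#   B.versionCount()  # -> 0  (B's own counter, independent of A's)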
class TccDirCache(RubyCache):
    size = "512kB"
    assoc = 16
    resourceStalls = False
    def create(self, options):
        self.size = MemorySize(options.tcc_size)
        self.size.value += (options.num_compute_units *
                            (MemorySize(options.tcp_size).value) *
                            options.tcc_dir_factor) / long(options.num_tccs)
        self.start_index_bit = math.log(options.cacheline_size, 2) + \
                               math.log(options.num_tccs, 2)
        self.replacement_policy = PseudoLRUReplacementPolicy()

class L1DCache(RubyCache):
    resourceStalls = False
    def create(self, options):
        self.size = MemorySize(options.l1d_size)
        self.assoc = options.l1d_assoc
        self.replacement_policy = PseudoLRUReplacementPolicy()

class L1ICache(RubyCache):
    resourceStalls = False
    def create(self, options):
        self.size = MemorySize(options.l1i_size)
        self.assoc = options.l1i_assoc
        self.replacement_policy = PseudoLRUReplacementPolicy()

class L2Cache(RubyCache):
    resourceStalls = False
    def create(self, options):
        self.size = MemorySize(options.l2_size)
        self.assoc = options.l2_assoc
        self.replacement_policy = PseudoLRUReplacementPolicy()


class CPCntrl(CorePair_Controller, CntrlBase):

    def create(self, options, ruby_system, system):
        self.version = self.versionCount()

        self.L1Icache = L1ICache()
        self.L1Icache.create(options)
        self.L1D0cache = L1DCache()
        self.L1D0cache.create(options)
        self.L1D1cache = L1DCache()
        self.L1D1cache.create(options)
        self.L2cache = L2Cache()
        self.L2cache.create(options)

        self.sequencer = RubySequencer()
        self.sequencer.icache_hit_latency = 2
        self.sequencer.dcache_hit_latency = 2
        self.sequencer.version = self.seqCount()
        self.sequencer.icache = self.L1Icache
        self.sequencer.dcache = self.L1D0cache
        self.sequencer.ruby_system = ruby_system
        self.sequencer.coreid = 0
        self.sequencer.is_cpu_sequencer = True

        self.sequencer1 = RubySequencer()
        self.sequencer1.version = self.seqCount()
        self.sequencer1.icache = self.L1Icache
        self.sequencer1.dcache = self.L1D1cache
        self.sequencer1.icache_hit_latency = 2
        self.sequencer1.dcache_hit_latency = 2
        self.sequencer1.ruby_system = ruby_system
        self.sequencer1.coreid = 1
        self.sequencer1.is_cpu_sequencer = True

        self.issue_latency = options.cpu_to_dir_latency
        self.send_evictions = send_evicts(options)

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

class TCPCache(RubyCache):
    assoc = 8
    dataArrayBanks = 16
    tagArrayBanks = 4
    dataAccessLatency = 4
    tagAccessLatency = 1
    def create(self, options):
        self.size = MemorySize(options.tcp_size)
        self.replacement_policy = PseudoLRUReplacementPolicy()

class TCPCntrl(TCP_Controller, CntrlBase):

    def create(self, options, ruby_system, system):
        self.version = self.versionCount()

        self.L1cache = TCPCache(tagAccessLatency = options.TCP_latency)
        self.L1cache.resourceStalls = options.no_resource_stalls
        self.L1cache.create(options)

        self.coalescer = RubyGPUCoalescer()
        self.coalescer.version = self.seqCount()
        self.coalescer.icache = self.L1cache
        self.coalescer.dcache = self.L1cache
        self.coalescer.ruby_system = ruby_system
        self.coalescer.support_inst_reqs = False
        self.coalescer.is_cpu_sequencer = False
        self.coalescer.max_outstanding_requests = options.simds_per_cu * \
                                                  options.wfs_per_simd * \
                                                  options.wf_size

        self.sequencer = RubySequencer()
        self.sequencer.version = self.seqCount()
        self.sequencer.icache = self.L1cache
        self.sequencer.dcache = self.L1cache
        self.sequencer.ruby_system = ruby_system
        self.sequencer.is_cpu_sequencer = True

        self.use_seq_not_coal = False

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

    def createCP(self, options, ruby_system, system):
        self.version = self.versionCount()

        self.L1cache = TCPCache(tagAccessLatency = options.TCP_latency)
        self.L1cache.resourceStalls = options.no_resource_stalls
        self.L1cache.create(options)

        self.coalescer = RubyGPUCoalescer()
        self.coalescer.version = self.seqCount()
        self.coalescer.icache = self.L1cache
        self.coalescer.dcache = self.L1cache
        self.coalescer.ruby_system = ruby_system
        self.coalescer.support_inst_reqs = False
        self.coalescer.is_cpu_sequencer = False

        self.sequencer = RubySequencer()
        self.sequencer.version = self.seqCount()
        self.sequencer.icache = self.L1cache
        self.sequencer.dcache = self.L1cache
        self.sequencer.ruby_system = ruby_system
        self.sequencer.is_cpu_sequencer = True

        self.use_seq_not_coal = True

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

class SQCCache(RubyCache):
    size = "32kB"
    assoc = 8
    dataArrayBanks = 16
    tagArrayBanks = 4
    dataAccessLatency = 4
    tagAccessLatency = 1
    def create(self, options):
        self.replacement_policy = PseudoLRUReplacementPolicy()

class SQCCntrl(SQC_Controller, CntrlBase):

    def create(self, options, ruby_system, system):
        self.version = self.versionCount()

        self.L1cache = SQCCache()
        self.L1cache.create(options)
        self.L1cache.resourceStalls = options.no_resource_stalls

        self.sequencer = RubySequencer()

        self.sequencer.version = self.seqCount()
        self.sequencer.icache = self.L1cache
        self.sequencer.dcache = self.L1cache
        self.sequencer.ruby_system = ruby_system
        self.sequencer.support_data_reqs = False
        self.sequencer.is_cpu_sequencer = False

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

    def createCP(self, options, ruby_system, system):
        self.version = self.versionCount()

        self.L1cache = SQCCache()
        self.L1cache.create(options)
        self.L1cache.resourceStalls = options.no_resource_stalls

        self.sequencer = RubySequencer()

        self.sequencer.version = self.seqCount()
        self.sequencer.icache = self.L1cache
        self.sequencer.dcache = self.L1cache
        self.sequencer.ruby_system = ruby_system
        self.sequencer.support_data_reqs = False

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency


class TCC(RubyCache):
    assoc = 16
    dataAccessLatency = 8
    tagAccessLatency = 2
    resourceStalls = True
    def create(self, options):
        self.size = MemorySize(options.tcc_size)
        self.size = self.size / options.num_tccs
        self.dataArrayBanks = 256 / options.num_tccs # number of data banks
        self.tagArrayBanks = 256 / options.num_tccs # number of tag banks
        if ((self.size.value / long(self.assoc)) < 128):
            self.size.value = long(128 * self.assoc)
        self.start_index_bit = math.log(options.cacheline_size, 2) + \
                               math.log(options.num_tccs, 2)
        self.replacement_policy = PseudoLRUReplacementPolicy()
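# TCC.create splits the aggregate --tcc-size evenly across the TCC banks
# (--num-tccs) and then clamps the per-bank size so each way still holds
# at least 128 bytes of sets. A worked example with the defaults from
# define_options below (illustrative numbers): --tcc-size=256kB and
# --num-tccs=1 gives one 256kB bank; 256kB / 16 ways = 16kB per way, well
# above the 128-byte floor, so the clamp does not fire. start_index_bit
# skips the block-offset bits plus the bits that select the TCC bank.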
class TCCCntrl(TCC_Controller, CntrlBase):
    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.L2cache = TCC()
        self.L2cache.create(options)
        self.l2_response_latency = options.TCC_latency

        self.number_of_TBEs = 2048

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

    def connectWireBuffers(self, req_to_tccdir, resp_to_tccdir,
                           tcc_unblock_to_tccdir, req_to_tcc,
                           probe_to_tcc, resp_to_tcc):
        self.w_reqToTCCDir = req_to_tccdir
        self.w_respToTCCDir = resp_to_tccdir
        self.w_TCCUnblockToTCCDir = tcc_unblock_to_tccdir
        self.w_reqToTCC = req_to_tcc
        self.w_probeToTCC = probe_to_tcc
        self.w_respToTCC = resp_to_tcc

class TCCDirCntrl(TCCdir_Controller, CntrlBase):
    def create(self, options, ruby_system, system):
        self.version = self.versionCount()

        self.directory = TccDirCache()
        self.directory.create(options)

        self.number_of_TBEs = 1024

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

    def connectWireBuffers(self, req_to_tccdir, resp_to_tccdir,
                           tcc_unblock_to_tccdir, req_to_tcc,
                           probe_to_tcc, resp_to_tcc):
        self.w_reqToTCCDir = req_to_tccdir
        self.w_respToTCCDir = resp_to_tccdir
        self.w_TCCUnblockToTCCDir = tcc_unblock_to_tccdir
        self.w_reqToTCC = req_to_tcc
        self.w_probeToTCC = probe_to_tcc
        self.w_respToTCC = resp_to_tcc

class L3Cache(RubyCache):
    assoc = 8
    dataArrayBanks = 256
    tagArrayBanks = 256

    def create(self, options, ruby_system, system):
        self.size = MemorySize(options.l3_size)
        self.size.value /= options.num_dirs
        # note: the bank counts end up divided by num_dirs twice here
        self.dataArrayBanks /= options.num_dirs
        self.tagArrayBanks /= options.num_dirs
        self.dataArrayBanks /= options.num_dirs
        self.tagArrayBanks /= options.num_dirs
        self.dataAccessLatency = options.l3_data_latency
        self.tagAccessLatency = options.l3_tag_latency
        self.resourceStalls = options.no_resource_stalls
        self.replacement_policy = PseudoLRUReplacementPolicy()

class L3Cntrl(L3Cache_Controller, CntrlBase):
    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.L3cache = L3Cache()
        self.L3cache.create(options, ruby_system, system)

        self.l3_response_latency = max(self.L3cache.dataAccessLatency,
                                       self.L3cache.tagAccessLatency)
        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

    def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
                           req_to_l3, probe_to_l3, resp_to_l3):
        self.reqToDir = req_to_dir
        self.respToDir = resp_to_dir
        self.l3UnblockToDir = l3_unblock_to_dir
        self.reqToL3 = req_to_l3
        self.probeToL3 = probe_to_l3
        self.respToL3 = resp_to_l3

class DirMem(RubyDirectoryMemory, CntrlBase):
    def create(self, options, ruby_system, system):
        self.version = self.versionCount()

        phys_mem_size = AddrRange(options.mem_size).size()
        mem_module_size = phys_mem_size / options.num_dirs
        dir_size = MemorySize('0B')
        dir_size.value = mem_module_size
        self.size = dir_size
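# DirMem carves the physical address space evenly across the directory
# controllers. A worked example (illustrative numbers): with
# --mem-size=512MB and --num-dirs=4, each directory covers
# 512MB / 4 = 128MB, which becomes that RubyDirectoryMemory's size.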
class DirCntrl(Directory_Controller, CntrlBase):
    def create(self, options, ruby_system, system):
        self.version = self.versionCount()

        self.response_latency = 30

        self.directory = DirMem()
        self.directory.create(options, ruby_system, system)

        self.L3CacheMemory = L3Cache()
        self.L3CacheMemory.create(options, ruby_system, system)

        self.l3_hit_latency = max(self.L3CacheMemory.dataAccessLatency,
                                  self.L3CacheMemory.tagAccessLatency)

        self.number_of_TBEs = options.num_tbes

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

    def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
                           req_to_l3, probe_to_l3, resp_to_l3):
        self.reqToDir = req_to_dir
        self.respToDir = resp_to_dir
        self.l3UnblockToDir = l3_unblock_to_dir
        self.reqToL3 = req_to_l3
        self.probeToL3 = probe_to_l3
        self.respToL3 = resp_to_l3



def define_options(parser):
    parser.add_option("--num-subcaches", type="int", default=4)
    parser.add_option("--l3-data-latency", type="int", default=20)
    parser.add_option("--l3-tag-latency", type="int", default=15)
    parser.add_option("--cpu-to-dir-latency", type="int", default=15)
    parser.add_option("--gpu-to-dir-latency", type="int", default=160)
    parser.add_option("--no-resource-stalls", action="store_false",
                      default=True)
    parser.add_option("--num-tbes", type="int", default=256)
    parser.add_option("--l2-latency", type="int", default=50) # load to use
    parser.add_option("--num-tccs", type="int", default=1,
                      help="number of TCC directories and banks in the GPU")
    parser.add_option("--TCP_latency", type="int", default=4,
                      help="TCP latency")
    parser.add_option("--TCC_latency", type="int", default=16,
                      help="TCC latency")
    parser.add_option("--tcc-size", type='string', default='256kB',
                      help="aggregate tcc size")
    parser.add_option("--tcp-size", type='string', default='16kB',
                      help="tcp size")
    parser.add_option("--tcc-dir-factor", type='int', default=4,
                      help="TCCdir size = factor *(TCPs + TCC)")
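# define_options is called by the top-level configuration script to extend
# its option parser before argv is parsed. A minimal, hypothetical driver
# (illustrative only, not part of this commit):
#
#   import optparse
#   import GPU_RfO
#   parser = optparse.OptionParser()
#   GPU_RfO.define_options(parser)
#   (options, args) = parser.parse_args()
#   # options.tcc_size, options.TCP_latency, ... are now available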
def create_system(options, full_system, system, dma_devices, ruby_system):
    if buildEnv['PROTOCOL'] != 'GPU_RfO':
        panic("This script requires the GPU_RfO protocol to be built.")

    cpu_sequencers = []

    #
    # The ruby network creation expects the list of nodes in the system to be
    # consistent with the NetDest list. Therefore the l1 controller nodes
    # must be listed before the directory nodes and directory nodes before
    # dma nodes, etc.
    #
    cp_cntrl_nodes = []
    tcp_cntrl_nodes = []
    sqc_cntrl_nodes = []
    tcc_cntrl_nodes = []
    tccdir_cntrl_nodes = []
    dir_cntrl_nodes = []
    l3_cntrl_nodes = []

    #
    # Must create the individual controllers before the network to ensure the
    # controller constructors are called before the network constructor
    #

    TCC_bits = int(math.log(options.num_tccs, 2))

    # This is the base crossbar that connects the L3s, Dirs, and cpu/gpu
    # Clusters
    mainCluster = Cluster(extBW = 512, intBW = 512) # 1 TB/s
    for i in xrange(options.num_dirs):

        dir_cntrl = DirCntrl(TCC_select_num_bits = TCC_bits)
        dir_cntrl.create(options, ruby_system, system)
        dir_cntrl.number_of_TBEs = 2560 * options.num_compute_units
        # Enough TBEs for all TCP TBEs

        # Connect the Directory controller to the ruby network
        dir_cntrl.requestFromCores = MessageBuffer(ordered = True)
        dir_cntrl.requestFromCores.slave = ruby_system.network.master

        dir_cntrl.responseFromCores = MessageBuffer()
        dir_cntrl.responseFromCores.slave = ruby_system.network.master

        dir_cntrl.unblockFromCores = MessageBuffer()
        dir_cntrl.unblockFromCores.slave = ruby_system.network.master

        dir_cntrl.probeToCore = MessageBuffer()
        dir_cntrl.probeToCore.master = ruby_system.network.slave

        dir_cntrl.responseToCore = MessageBuffer()
        dir_cntrl.responseToCore.master = ruby_system.network.slave

        dir_cntrl.triggerQueue = MessageBuffer(ordered = True)
        dir_cntrl.L3triggerQueue = MessageBuffer(ordered = True)
        dir_cntrl.responseFromMemory = MessageBuffer()

        exec("system.dir_cntrl%d = dir_cntrl" % i)
        dir_cntrl_nodes.append(dir_cntrl)

        mainCluster.add(dir_cntrl)

    # For an odd number of CPUs, still create the right number of controllers
    cpuCluster = Cluster(extBW = 512, intBW = 512) # 1 TB/s
    for i in xrange((options.num_cpus + 1) / 2):

        cp_cntrl = CPCntrl()
        cp_cntrl.create(options, ruby_system, system)

        exec("system.cp_cntrl%d = cp_cntrl" % i)
        #
        # Add controllers and sequencers to the appropriate lists
        #
        cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1])

        # Connect the CP controllers and the network
        cp_cntrl.requestFromCore = MessageBuffer()
        cp_cntrl.requestFromCore.master = ruby_system.network.slave

        cp_cntrl.responseFromCore = MessageBuffer()
        cp_cntrl.responseFromCore.master = ruby_system.network.slave

        cp_cntrl.unblockFromCore = MessageBuffer()
        cp_cntrl.unblockFromCore.master = ruby_system.network.slave

        cp_cntrl.probeToCore = MessageBuffer()
        cp_cntrl.probeToCore.slave = ruby_system.network.master

        cp_cntrl.responseToCore = MessageBuffer()
        cp_cntrl.responseToCore.slave = ruby_system.network.master

        cp_cntrl.mandatoryQueue = MessageBuffer()
        cp_cntrl.triggerQueue = MessageBuffer(ordered = True)

        cpuCluster.add(cp_cntrl)

    gpuCluster = Cluster(extBW = 512, intBW = 512) # 1 TB/s

    for i in xrange(options.num_compute_units):

        tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
                             number_of_TBEs = 2560) # max outstanding requests
        tcp_cntrl.create(options, ruby_system, system)

        exec("system.tcp_cntrl%d = tcp_cntrl" % i)
        #
        # Add controllers and sequencers to the appropriate lists
        #
        cpu_sequencers.append(tcp_cntrl.coalescer)
        tcp_cntrl_nodes.append(tcp_cntrl)

        # Connect the TCP controller to the ruby network
        tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True)
        tcp_cntrl.requestFromTCP.master = ruby_system.network.slave

        tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True)
        tcp_cntrl.responseFromTCP.master = ruby_system.network.slave

        tcp_cntrl.unblockFromCore = MessageBuffer(ordered = True)
        tcp_cntrl.unblockFromCore.master = ruby_system.network.slave

        tcp_cntrl.probeToTCP = MessageBuffer(ordered = True)
        tcp_cntrl.probeToTCP.slave = ruby_system.network.master

        tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
        tcp_cntrl.responseToTCP.slave = ruby_system.network.master

        tcp_cntrl.mandatoryQueue = MessageBuffer()

        gpuCluster.add(tcp_cntrl)

    for i in xrange(options.num_sqc):

        sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
        sqc_cntrl.create(options, ruby_system, system)

        exec("system.sqc_cntrl%d = sqc_cntrl" % i)
        #
        # Add controllers and sequencers to the appropriate lists
        #
        cpu_sequencers.append(sqc_cntrl.sequencer)

        # Connect the SQC controller to the ruby network
        sqc_cntrl.requestFromSQC = MessageBuffer(ordered = True)
        sqc_cntrl.requestFromSQC.master = ruby_system.network.slave

        sqc_cntrl.responseFromSQC = MessageBuffer(ordered = True)
        sqc_cntrl.responseFromSQC.master = ruby_system.network.slave

        sqc_cntrl.unblockFromCore = MessageBuffer(ordered = True)
        sqc_cntrl.unblockFromCore.master = ruby_system.network.slave

        sqc_cntrl.probeToSQC = MessageBuffer(ordered = True)
        sqc_cntrl.probeToSQC.slave = ruby_system.network.master

        sqc_cntrl.responseToSQC = MessageBuffer(ordered = True)
        sqc_cntrl.responseToSQC.slave = ruby_system.network.master

        sqc_cntrl.mandatoryQueue = MessageBuffer()

        # SQC also in GPU cluster
        gpuCluster.add(sqc_cntrl)

    for i in xrange(options.numCPs):

        tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
                             number_of_TBEs = 2560) # max outstanding requests
        tcp_cntrl.createCP(options, ruby_system, system)

        exec("system.tcp_cntrl%d = tcp_cntrl" % (options.num_compute_units + i))
        #
        # Add controllers and sequencers to the appropriate lists
        #
        cpu_sequencers.append(tcp_cntrl.sequencer)
        tcp_cntrl_nodes.append(tcp_cntrl)

        # Connect the TCP controller to the ruby network
        tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True)
        tcp_cntrl.requestFromTCP.master = ruby_system.network.slave

        tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True)
        tcp_cntrl.responseFromTCP.master = ruby_system.network.slave

        tcp_cntrl.unblockFromCore = MessageBuffer(ordered = True)
        tcp_cntrl.unblockFromCore.master = ruby_system.network.slave

        tcp_cntrl.probeToTCP = MessageBuffer(ordered = True)
        tcp_cntrl.probeToTCP.slave = ruby_system.network.master

        tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
        tcp_cntrl.responseToTCP.slave = ruby_system.network.master

        tcp_cntrl.mandatoryQueue = MessageBuffer()

        gpuCluster.add(tcp_cntrl)

        sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
        sqc_cntrl.createCP(options, ruby_system, system)

        exec("system.sqc_cntrl%d = sqc_cntrl" % (options.num_compute_units + i))
        #
        # Add controllers and sequencers to the appropriate lists
        #
        cpu_sequencers.append(sqc_cntrl.sequencer)

        # Connect the SQC controller to the ruby network
        sqc_cntrl.requestFromSQC = MessageBuffer(ordered = True)
        sqc_cntrl.requestFromSQC.master = ruby_system.network.slave

        sqc_cntrl.responseFromSQC = MessageBuffer(ordered = True)
        sqc_cntrl.responseFromSQC.master = ruby_system.network.slave

        sqc_cntrl.unblockFromCore = MessageBuffer(ordered = True)
        sqc_cntrl.unblockFromCore.master = ruby_system.network.slave

        sqc_cntrl.probeToSQC = MessageBuffer(ordered = True)
        sqc_cntrl.probeToSQC.slave = ruby_system.network.master

        sqc_cntrl.responseToSQC = MessageBuffer(ordered = True)
        sqc_cntrl.responseToSQC.slave = ruby_system.network.master

        sqc_cntrl.mandatoryQueue = MessageBuffer()

        # SQC also in GPU cluster
        gpuCluster.add(sqc_cntrl)

    for i in xrange(options.num_tccs):

        tcc_cntrl = TCCCntrl(TCC_select_num_bits = TCC_bits,
                             number_of_TBEs = options.num_compute_units * 2560)
        # Enough TBEs for all TCP TBEs
        tcc_cntrl.create(options, ruby_system, system)
        tcc_cntrl_nodes.append(tcc_cntrl)

        tccdir_cntrl = TCCDirCntrl(TCC_select_num_bits = TCC_bits,
                                   number_of_TBEs = options.num_compute_units * 2560)
        # Enough TBEs for all TCP TBEs
        tccdir_cntrl.create(options, ruby_system, system)
        tccdir_cntrl_nodes.append(tccdir_cntrl)

        exec("system.tcc_cntrl%d = tcc_cntrl" % i)
        exec("system.tccdir_cntrl%d = tccdir_cntrl" % i)

        # Connect up all of the wire buffers between the TCC and the TCC
        # directory
        req_to_tccdir = RubyWireBuffer()
        resp_to_tccdir = RubyWireBuffer()
        tcc_unblock_to_tccdir = RubyWireBuffer()
        req_to_tcc = RubyWireBuffer()
        probe_to_tcc = RubyWireBuffer()
        resp_to_tcc = RubyWireBuffer()

        tcc_cntrl.connectWireBuffers(req_to_tccdir, resp_to_tccdir,
                                     tcc_unblock_to_tccdir, req_to_tcc,
                                     probe_to_tcc, resp_to_tcc)
        tccdir_cntrl.connectWireBuffers(req_to_tccdir, resp_to_tccdir,
                                        tcc_unblock_to_tccdir, req_to_tcc,
                                        probe_to_tcc, resp_to_tcc)

        # Connect the TCC controller to the ruby network
        tcc_cntrl.responseFromTCC = MessageBuffer(ordered = True)
        tcc_cntrl.responseFromTCC.master = ruby_system.network.slave

        tcc_cntrl.responseToTCC = MessageBuffer(ordered = True)
        tcc_cntrl.responseToTCC.slave = ruby_system.network.master

        # Connect the TCC Dir controller to the ruby network
        tccdir_cntrl.requestFromTCP = MessageBuffer(ordered = True)
        tccdir_cntrl.requestFromTCP.slave = ruby_system.network.master

        tccdir_cntrl.responseFromTCP = MessageBuffer(ordered = True)
        tccdir_cntrl.responseFromTCP.slave = ruby_system.network.master

        tccdir_cntrl.unblockFromTCP = MessageBuffer(ordered = True)
        tccdir_cntrl.unblockFromTCP.slave = ruby_system.network.master

        tccdir_cntrl.probeToCore = MessageBuffer(ordered = True)
        tccdir_cntrl.probeToCore.master = ruby_system.network.slave

        tccdir_cntrl.responseToCore = MessageBuffer(ordered = True)
        tccdir_cntrl.responseToCore.master = ruby_system.network.slave

        tccdir_cntrl.probeFromNB = MessageBuffer()
        tccdir_cntrl.probeFromNB.slave = ruby_system.network.master

        tccdir_cntrl.responseFromNB = MessageBuffer()
        tccdir_cntrl.responseFromNB.slave = ruby_system.network.master

        tccdir_cntrl.requestToNB = MessageBuffer()
        tccdir_cntrl.requestToNB.master = ruby_system.network.slave

        tccdir_cntrl.responseToNB = MessageBuffer()
        tccdir_cntrl.responseToNB.master = ruby_system.network.slave

        tccdir_cntrl.unblockToNB = MessageBuffer()
        tccdir_cntrl.unblockToNB.master = ruby_system.network.slave

        tccdir_cntrl.triggerQueue = MessageBuffer(ordered = True)

        # TCC cntrls added to the GPU cluster
        gpuCluster.add(tcc_cntrl)
        gpuCluster.add(tccdir_cntrl)

    # Assuming no DMA devices
    assert(len(dma_devices) == 0)

    # Add cpu/gpu clusters to main cluster
    mainCluster.add(cpuCluster)
    mainCluster.add(gpuCluster)

    ruby_system.network.number_of_virtual_networks = 10

    return (cpu_sequencers, dir_cntrl_nodes, mainCluster)
674
configs/ruby/GPU_VIPER.py
Normal file

@@ -0,0 +1,674 @@
#
# Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Lisa Hsu
#

import math
import m5
from m5.objects import *
from m5.defines import buildEnv
from Ruby import create_topology
from Ruby import send_evicts

from Cluster import Cluster
from Crossbar import Crossbar

class CntrlBase:
    _seqs = 0
    @classmethod
    def seqCount(cls):
        # Use CntrlBase._seqs, not cls, since we need a global count
        CntrlBase._seqs += 1
        return CntrlBase._seqs - 1

    _cntrls = 0
    @classmethod
    def cntrlCount(cls):
        # Use CntrlBase._cntrls, not cls, since we need a global count
        CntrlBase._cntrls += 1
        return CntrlBase._cntrls - 1

    _version = 0
    @classmethod
    def versionCount(cls):
        cls._version += 1 # Use count for this particular type
        return cls._version - 1

class L1Cache(RubyCache):
    resourceStalls = False
    dataArrayBanks = 2
    tagArrayBanks = 2
    dataAccessLatency = 1
    tagAccessLatency = 1
    def create(self, size, assoc, options):
        self.size = MemorySize(size)
        self.assoc = assoc
        self.replacement_policy = PseudoLRUReplacementPolicy()

class L2Cache(RubyCache):
    resourceStalls = False
    assoc = 16
    dataArrayBanks = 16
    tagArrayBanks = 16
    def create(self, size, assoc, options):
        self.size = MemorySize(size)
        self.assoc = assoc
        self.replacement_policy = PseudoLRUReplacementPolicy()

class CPCntrl(CorePair_Controller, CntrlBase):

    def create(self, options, ruby_system, system):
        self.version = self.versionCount()

        self.L1Icache = L1Cache()
        self.L1Icache.create(options.l1i_size, options.l1i_assoc, options)
        self.L1D0cache = L1Cache()
        self.L1D0cache.create(options.l1d_size, options.l1d_assoc, options)
        self.L1D1cache = L1Cache()
        self.L1D1cache.create(options.l1d_size, options.l1d_assoc, options)
        self.L2cache = L2Cache()
        self.L2cache.create(options.l2_size, options.l2_assoc, options)

        self.sequencer = RubySequencer()
        self.sequencer.version = self.seqCount()
        self.sequencer.icache = self.L1Icache
        self.sequencer.dcache = self.L1D0cache
        self.sequencer.ruby_system = ruby_system
        self.sequencer.coreid = 0
        self.sequencer.is_cpu_sequencer = True

        self.sequencer1 = RubySequencer()
        self.sequencer1.version = self.seqCount()
        self.sequencer1.icache = self.L1Icache
        self.sequencer1.dcache = self.L1D1cache
        self.sequencer1.ruby_system = ruby_system
        self.sequencer1.coreid = 1
        self.sequencer1.is_cpu_sequencer = True

        self.issue_latency = options.cpu_to_dir_latency
        self.send_evictions = send_evicts(options)

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

class TCPCache(RubyCache):
    size = "16kB"
    assoc = 16
    dataArrayBanks = 16 # number of data banks
    tagArrayBanks = 16 # number of tag banks
    dataAccessLatency = 4
    tagAccessLatency = 1
    def create(self, options):
        self.size = MemorySize(options.tcp_size)
        self.assoc = options.tcp_assoc
        self.resourceStalls = options.no_tcc_resource_stalls
        self.replacement_policy = PseudoLRUReplacementPolicy()

class TCPCntrl(TCP_Controller, CntrlBase):

    def create(self, options, ruby_system, system):
        self.version = self.versionCount()

        self.L1cache = TCPCache(tagAccessLatency = options.TCP_latency,
                                dataAccessLatency = options.TCP_latency)
        self.L1cache.resourceStalls = options.no_resource_stalls
        self.L1cache.create(options)
        self.issue_latency = 1

        self.coalescer = VIPERCoalescer()
        self.coalescer.version = self.seqCount()
        self.coalescer.icache = self.L1cache
        self.coalescer.dcache = self.L1cache
        self.coalescer.ruby_system = ruby_system
        self.coalescer.support_inst_reqs = False
        self.coalescer.is_cpu_sequencer = False

        self.sequencer = RubySequencer()
        self.sequencer.version = self.seqCount()
        self.sequencer.icache = self.L1cache
        self.sequencer.dcache = self.L1cache
        self.sequencer.ruby_system = ruby_system
        self.sequencer.is_cpu_sequencer = True

        self.use_seq_not_coal = False

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

    def createCP(self, options, ruby_system, system):
        self.version = self.versionCount()

        self.L1cache = TCPCache(tagAccessLatency = options.TCP_latency,
                                dataAccessLatency = options.TCP_latency)
        self.L1cache.resourceStalls = options.no_resource_stalls
        self.L1cache.create(options)
        self.issue_latency = 1

        self.coalescer = VIPERCoalescer()
        self.coalescer.version = self.seqCount()
        self.coalescer.icache = self.L1cache
        self.coalescer.dcache = self.L1cache
        self.coalescer.ruby_system = ruby_system
        self.coalescer.support_inst_reqs = False
        self.coalescer.is_cpu_sequencer = False

        self.sequencer = RubySequencer()
        self.sequencer.version = self.seqCount()
        self.sequencer.icache = self.L1cache
        self.sequencer.dcache = self.L1cache
        self.sequencer.ruby_system = ruby_system
        self.sequencer.is_cpu_sequencer = True

        self.use_seq_not_coal = True

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

class SQCCache(RubyCache):
    dataArrayBanks = 8
    tagArrayBanks = 8
    dataAccessLatency = 1
    tagAccessLatency = 1

    def create(self, options):
        self.size = MemorySize(options.sqc_size)
        self.assoc = options.sqc_assoc
        self.replacement_policy = PseudoLRUReplacementPolicy()

class SQCCntrl(SQC_Controller, CntrlBase):

    def create(self, options, ruby_system, system):
        self.version = self.versionCount()

        self.L1cache = SQCCache()
        self.L1cache.create(options)
        self.L1cache.resourceStalls = options.no_resource_stalls

        self.sequencer = RubySequencer()

        self.sequencer.version = self.seqCount()
        self.sequencer.icache = self.L1cache
        self.sequencer.dcache = self.L1cache
        self.sequencer.ruby_system = ruby_system
        self.sequencer.support_data_reqs = False
        self.sequencer.is_cpu_sequencer = False

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

class TCC(RubyCache):
    size = MemorySize("256kB")
    assoc = 16
    dataAccessLatency = 8
    tagAccessLatency = 2
    resourceStalls = True
    def create(self, options):
        self.assoc = options.tcc_assoc
        if hasattr(options, 'bw_scalor') and options.bw_scalor > 0:
            s = options.num_compute_units
            tcc_size = s * 128
            tcc_size = str(tcc_size)+'kB'
            self.size = MemorySize(tcc_size)
            self.dataArrayBanks = 64
            self.tagArrayBanks = 64
        else:
            self.size = MemorySize(options.tcc_size)
            self.dataArrayBanks = 256 / options.num_tccs # number of data banks
            self.tagArrayBanks = 256 / options.num_tccs # number of tag banks
            self.size.value = self.size.value / options.num_tccs
            if ((self.size.value / long(self.assoc)) < 128):
                self.size.value = long(128 * self.assoc)
        self.start_index_bit = math.log(options.cacheline_size, 2) + \
                               math.log(options.num_tccs, 2)
        self.replacement_policy = PseudoLRUReplacementPolicy()


class TCCCntrl(TCC_Controller, CntrlBase):
    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.L2cache = TCC()
        self.L2cache.create(options)
        self.L2cache.resourceStalls = options.no_tcc_resource_stalls

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

class L3Cache(RubyCache):
    dataArrayBanks = 16
    tagArrayBanks = 16

    def create(self, options, ruby_system, system):
        self.size = MemorySize(options.l3_size)
        self.size.value /= options.num_dirs
        self.assoc = options.l3_assoc
        # note: the bank counts end up divided by num_dirs twice here
        self.dataArrayBanks /= options.num_dirs
        self.tagArrayBanks /= options.num_dirs
        self.dataArrayBanks /= options.num_dirs
        self.tagArrayBanks /= options.num_dirs
        self.dataAccessLatency = options.l3_data_latency
        self.tagAccessLatency = options.l3_tag_latency
        self.resourceStalls = False
        self.replacement_policy = PseudoLRUReplacementPolicy()

class L3Cntrl(L3Cache_Controller, CntrlBase):
    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.L3cache = L3Cache()
        self.L3cache.create(options, ruby_system, system)

        self.l3_response_latency = max(self.L3cache.dataAccessLatency,
                                       self.L3cache.tagAccessLatency)
        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

    def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
                           req_to_l3, probe_to_l3, resp_to_l3):
        self.reqToDir = req_to_dir
        self.respToDir = resp_to_dir
        self.l3UnblockToDir = l3_unblock_to_dir
        self.reqToL3 = req_to_l3
        self.probeToL3 = probe_to_l3
        self.respToL3 = resp_to_l3

class DirMem(RubyDirectoryMemory, CntrlBase):
    def create(self, options, ruby_system, system):
        self.version = self.versionCount()

        phys_mem_size = AddrRange(options.mem_size).size()
        mem_module_size = phys_mem_size / options.num_dirs
        dir_size = MemorySize('0B')
        dir_size.value = mem_module_size
        self.size = dir_size

class DirCntrl(Directory_Controller, CntrlBase):
    def create(self, options, ruby_system, system):
        self.version = self.versionCount()

        self.response_latency = 30

        self.directory = DirMem()
        self.directory.create(options, ruby_system, system)

        self.L3CacheMemory = L3Cache()
        self.L3CacheMemory.create(options, ruby_system, system)

        self.l3_hit_latency = max(self.L3CacheMemory.dataAccessLatency,
                                  self.L3CacheMemory.tagAccessLatency)

        self.number_of_TBEs = options.num_tbes

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

    def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
                           req_to_l3, probe_to_l3, resp_to_l3):
        self.reqToDir = req_to_dir
        self.respToDir = resp_to_dir
        self.l3UnblockToDir = l3_unblock_to_dir
        self.reqToL3 = req_to_l3
        self.probeToL3 = probe_to_l3
        self.respToL3 = resp_to_l3

def define_options(parser):
    parser.add_option("--num-subcaches", type = "int", default = 4)
    parser.add_option("--l3-data-latency", type = "int", default = 20)
    parser.add_option("--l3-tag-latency", type = "int", default = 15)
    parser.add_option("--cpu-to-dir-latency", type = "int", default = 120)
    parser.add_option("--gpu-to-dir-latency", type = "int", default = 120)
    parser.add_option("--no-resource-stalls", action = "store_false",
                      default = True)
    parser.add_option("--no-tcc-resource-stalls", action = "store_false",
                      default = True)
    parser.add_option("--use-L3-on-WT", action = "store_true", default = False)
    parser.add_option("--num-tbes", type = "int", default = 256)
    parser.add_option("--l2-latency", type = "int", default = 50) # load to use
    parser.add_option("--num-tccs", type = "int", default = 1,
                      help = "number of TCC banks in the GPU")
    parser.add_option("--sqc-size", type = 'string', default = '32kB',
                      help = "SQC cache size")
    parser.add_option("--sqc-assoc", type = 'int', default = 8,
                      help = "SQC cache assoc")
    parser.add_option("--WB_L1", action = "store_true", default = False,
                      help = "writeback L1")
    parser.add_option("--WB_L2", action = "store_true", default = False,
                      help = "writeback L2")
    parser.add_option("--TCP_latency", type = "int", default = 4,
                      help = "TCP latency")
    parser.add_option("--TCC_latency", type = "int", default = 16,
                      help = "TCC latency")
    parser.add_option("--tcc-size", type = 'string', default = '256kB',
                      help = "aggregate tcc size")
    parser.add_option("--tcc-assoc", type = 'int', default = 16,
                      help = "tcc assoc")
    parser.add_option("--tcp-size", type = 'string', default = '16kB',
                      help = "tcp size")
    parser.add_option("--tcp-assoc", type = 'int', default = 16,
                      help = "tcp assoc")
    parser.add_option("--noL1", action = "store_true", default = False,
                      help = "bypass L1")

def create_system(options, full_system, system, dma_devices, ruby_system):
    if buildEnv['PROTOCOL'] != 'GPU_VIPER':
        panic("This script requires the GPU_VIPER protocol to be built.")

    cpu_sequencers = []

    #
    # The ruby network creation expects the list of nodes in the system to be
    # consistent with the NetDest list. Therefore the l1 controller nodes
    # must be listed before the directory nodes and directory nodes before
    # dma nodes, etc.
    #
    cp_cntrl_nodes = []
    tcp_cntrl_nodes = []
    sqc_cntrl_nodes = []
    tcc_cntrl_nodes = []
    dir_cntrl_nodes = []
    l3_cntrl_nodes = []

    #
    # Must create the individual controllers before the network to ensure the
    # controller constructors are called before the network constructor
    #

    # For an odd number of CPUs, still create the right number of controllers
    TCC_bits = int(math.log(options.num_tccs, 2))

    # This is the base crossbar that connects the L3s, Dirs, and cpu/gpu
    # Clusters
    crossbar_bw = None
    mainCluster = None
    if hasattr(options, 'bw_scalor') and options.bw_scalor > 0:
        # Assuming a 2GHz clock
        crossbar_bw = 16 * options.num_compute_units * options.bw_scalor
        mainCluster = Cluster(intBW=crossbar_bw)
    else:
        mainCluster = Cluster(intBW=8) # 16 GB/s
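    # Worked example for the bandwidth scaling above (illustrative numbers,
    # assuming a Cluster bandwidth unit is 2 GB/s, since intBW=8 is annotated
    # as 16 GB/s): with num_compute_units=8 and bw_scalor=1, the scaled
    # crossbar gets intBW = 16 * 8 * 1 = 128 units, i.e. roughly 256 GB/s.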
    for i in xrange(options.num_dirs):

        dir_cntrl = DirCntrl(noTCCdir = True, TCC_select_num_bits = TCC_bits)
        dir_cntrl.create(options, ruby_system, system)
        dir_cntrl.number_of_TBEs = options.num_tbes
        dir_cntrl.useL3OnWT = options.use_L3_on_WT
        # the number_of_TBEs is inclusive of TBEs below

        # Connect the Directory controller to the ruby network
        dir_cntrl.requestFromCores = MessageBuffer(ordered = True)
        dir_cntrl.requestFromCores.slave = ruby_system.network.master

        dir_cntrl.responseFromCores = MessageBuffer()
        dir_cntrl.responseFromCores.slave = ruby_system.network.master

        dir_cntrl.unblockFromCores = MessageBuffer()
        dir_cntrl.unblockFromCores.slave = ruby_system.network.master

        dir_cntrl.probeToCore = MessageBuffer()
        dir_cntrl.probeToCore.master = ruby_system.network.slave

        dir_cntrl.responseToCore = MessageBuffer()
        dir_cntrl.responseToCore.master = ruby_system.network.slave

        dir_cntrl.triggerQueue = MessageBuffer(ordered = True)
        dir_cntrl.L3triggerQueue = MessageBuffer(ordered = True)
        dir_cntrl.responseFromMemory = MessageBuffer()

        exec("ruby_system.dir_cntrl%d = dir_cntrl" % i)
        dir_cntrl_nodes.append(dir_cntrl)

        mainCluster.add(dir_cntrl)

    cpuCluster = None
    if hasattr(options, 'bw_scalor') and options.bw_scalor > 0:
        cpuCluster = Cluster(extBW = crossbar_bw, intBW = crossbar_bw)
    else:
        cpuCluster = Cluster(extBW = 8, intBW = 8) # 16 GB/s
    for i in xrange((options.num_cpus + 1) / 2):

        cp_cntrl = CPCntrl()
        cp_cntrl.create(options, ruby_system, system)

        exec("ruby_system.cp_cntrl%d = cp_cntrl" % i)
        #
        # Add controllers and sequencers to the appropriate lists
        #
        cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1])

        # Connect the CP controllers and the network
        cp_cntrl.requestFromCore = MessageBuffer()
        cp_cntrl.requestFromCore.master = ruby_system.network.slave

        cp_cntrl.responseFromCore = MessageBuffer()
        cp_cntrl.responseFromCore.master = ruby_system.network.slave

        cp_cntrl.unblockFromCore = MessageBuffer()
        cp_cntrl.unblockFromCore.master = ruby_system.network.slave

        cp_cntrl.probeToCore = MessageBuffer()
        cp_cntrl.probeToCore.slave = ruby_system.network.master

        cp_cntrl.responseToCore = MessageBuffer()
        cp_cntrl.responseToCore.slave = ruby_system.network.master

        cp_cntrl.mandatoryQueue = MessageBuffer()
        cp_cntrl.triggerQueue = MessageBuffer(ordered = True)

        cpuCluster.add(cp_cntrl)

    gpuCluster = None
    if hasattr(options, 'bw_scalor') and options.bw_scalor > 0:
        gpuCluster = Cluster(extBW = crossbar_bw, intBW = crossbar_bw)
    else:
        gpuCluster = Cluster(extBW = 8, intBW = 8) # 16 GB/s
    for i in xrange(options.num_compute_units):

        tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
                             issue_latency = 1,
                             number_of_TBEs = 2560)
        # TBEs set to max outstanding requests
        tcp_cntrl.create(options, ruby_system, system)
        tcp_cntrl.WB = options.WB_L1
        tcp_cntrl.disableL1 = options.noL1
        tcp_cntrl.L1cache.tagAccessLatency = options.TCP_latency
        tcp_cntrl.L1cache.dataAccessLatency = options.TCP_latency

        exec("ruby_system.tcp_cntrl%d = tcp_cntrl" % i)
        #
        # Add controllers and sequencers to the appropriate lists
        #
        cpu_sequencers.append(tcp_cntrl.coalescer)
        tcp_cntrl_nodes.append(tcp_cntrl)

        # Connect the TCP controller to the ruby network
        tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True)
        tcp_cntrl.requestFromTCP.master = ruby_system.network.slave

        tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True)
        tcp_cntrl.responseFromTCP.master = ruby_system.network.slave

        tcp_cntrl.unblockFromCore = MessageBuffer()
        tcp_cntrl.unblockFromCore.master = ruby_system.network.slave

        tcp_cntrl.probeToTCP = MessageBuffer(ordered = True)
        tcp_cntrl.probeToTCP.slave = ruby_system.network.master

        tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
        tcp_cntrl.responseToTCP.slave = ruby_system.network.master

        tcp_cntrl.mandatoryQueue = MessageBuffer()

        gpuCluster.add(tcp_cntrl)

    for i in xrange(options.num_sqc):

        sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
        sqc_cntrl.create(options, ruby_system, system)

        exec("ruby_system.sqc_cntrl%d = sqc_cntrl" % i)
        #
        # Add controllers and sequencers to the appropriate lists
|
||||||
|
#
|
||||||
|
cpu_sequencers.append(sqc_cntrl.sequencer)
|
||||||
|
|
||||||
|
# Connect the SQC controller to the ruby network
|
||||||
|
sqc_cntrl.requestFromSQC = MessageBuffer(ordered = True)
|
||||||
|
sqc_cntrl.requestFromSQC.master = ruby_system.network.slave
|
||||||
|
|
||||||
|
sqc_cntrl.probeToSQC = MessageBuffer(ordered = True)
|
||||||
|
sqc_cntrl.probeToSQC.slave = ruby_system.network.master
|
||||||
|
|
||||||
|
sqc_cntrl.responseToSQC = MessageBuffer(ordered = True)
|
||||||
|
sqc_cntrl.responseToSQC.slave = ruby_system.network.master
|
||||||
|
|
||||||
|
sqc_cntrl.mandatoryQueue = MessageBuffer()
|
||||||
|
|
||||||
|
# SQC also in GPU cluster
|
||||||
|
gpuCluster.add(sqc_cntrl)
|
||||||
|
|
||||||
|
for i in xrange(options.numCPs):
|
||||||
|
|
||||||
|
tcp_ID = options.num_compute_units + i
|
||||||
|
sqc_ID = options.num_sqc + i
|
||||||
|
|
||||||
|
tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
|
||||||
|
issue_latency = 1,
|
||||||
|
number_of_TBEs = 2560)
|
||||||
|
# TBEs set to max outstanding requests
|
||||||
|
tcp_cntrl.createCP(options, ruby_system, system)
|
||||||
|
tcp_cntrl.WB = options.WB_L1
|
||||||
|
tcp_cntrl.disableL1 = options.noL1
|
||||||
|
tcp_cntrl.L1cache.tagAccessLatency = options.TCP_latency
|
||||||
|
tcp_cntrl.L1cache.dataAccessLatency = options.TCP_latency
|
||||||
|
|
||||||
|
exec("ruby_system.tcp_cntrl%d = tcp_cntrl" % tcp_ID)
|
||||||
|
#
|
||||||
|
# Add controllers and sequencers to the appropriate lists
|
||||||
|
#
|
||||||
|
cpu_sequencers.append(tcp_cntrl.sequencer)
|
||||||
|
tcp_cntrl_nodes.append(tcp_cntrl)
|
||||||
|
|
||||||
|
# Connect the CP (TCP) controllers to the ruby network
|
||||||
|
tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True)
|
||||||
|
tcp_cntrl.requestFromTCP.master = ruby_system.network.slave
|
||||||
|
|
||||||
|
tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True)
|
||||||
|
tcp_cntrl.responseFromTCP.master = ruby_system.network.slave
|
||||||
|
|
||||||
|
tcp_cntrl.unblockFromCore = MessageBuffer(ordered = True)
|
||||||
|
tcp_cntrl.unblockFromCore.master = ruby_system.network.slave
|
||||||
|
|
||||||
|
tcp_cntrl.probeToTCP = MessageBuffer(ordered = True)
|
||||||
|
tcp_cntrl.probeToTCP.slave = ruby_system.network.master
|
||||||
|
|
||||||
|
tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
|
||||||
|
tcp_cntrl.responseToTCP.slave = ruby_system.network.master
|
||||||
|
|
||||||
|
tcp_cntrl.mandatoryQueue = MessageBuffer()
|
||||||
|
|
||||||
|
gpuCluster.add(tcp_cntrl)
|
||||||
|
|
||||||
|
sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
|
||||||
|
sqc_cntrl.create(options, ruby_system, system)
|
||||||
|
|
||||||
|
exec("ruby_system.sqc_cntrl%d = sqc_cntrl" % sqc_ID)
|
||||||
|
#
|
||||||
|
# Add controllers and sequencers to the appropriate lists
|
||||||
|
#
|
||||||
|
cpu_sequencers.append(sqc_cntrl.sequencer)
|
||||||
|
|
||||||
|
# SQC also in GPU cluster
|
||||||
|
gpuCluster.add(sqc_cntrl)
|
||||||
|
|
||||||
|
for i in xrange(options.num_tccs):
|
||||||
|
|
||||||
|
tcc_cntrl = TCCCntrl(l2_response_latency = options.TCC_latency)
|
||||||
|
tcc_cntrl.create(options, ruby_system, system)
|
||||||
|
tcc_cntrl.l2_request_latency = options.gpu_to_dir_latency
|
||||||
|
tcc_cntrl.l2_response_latency = options.TCC_latency
|
||||||
|
tcc_cntrl_nodes.append(tcc_cntrl)
|
||||||
|
tcc_cntrl.WB = options.WB_L2
|
||||||
|
tcc_cntrl.number_of_TBEs = 2560 * options.num_compute_units
|
||||||
|
# the number_of_TBEs is inclusive of TBEs below
|
||||||
|
|
||||||
|
# Connect the TCC controllers to the ruby network
|
||||||
|
tcc_cntrl.requestFromTCP = MessageBuffer(ordered = True)
|
||||||
|
tcc_cntrl.requestFromTCP.slave = ruby_system.network.master
|
||||||
|
|
||||||
|
tcc_cntrl.responseToCore = MessageBuffer(ordered = True)
|
||||||
|
tcc_cntrl.responseToCore.master = ruby_system.network.slave
|
||||||
|
|
||||||
|
tcc_cntrl.probeFromNB = MessageBuffer()
|
||||||
|
tcc_cntrl.probeFromNB.slave = ruby_system.network.master
|
||||||
|
|
||||||
|
tcc_cntrl.responseFromNB = MessageBuffer()
|
||||||
|
tcc_cntrl.responseFromNB.slave = ruby_system.network.master
|
||||||
|
|
||||||
|
tcc_cntrl.requestToNB = MessageBuffer(ordered = True)
|
||||||
|
tcc_cntrl.requestToNB.master = ruby_system.network.slave
|
||||||
|
|
||||||
|
tcc_cntrl.responseToNB = MessageBuffer()
|
||||||
|
tcc_cntrl.responseToNB.master = ruby_system.network.slave
|
||||||
|
|
||||||
|
tcc_cntrl.unblockToNB = MessageBuffer()
|
||||||
|
tcc_cntrl.unblockToNB.master = ruby_system.network.slave
|
||||||
|
|
||||||
|
tcc_cntrl.triggerQueue = MessageBuffer(ordered = True)
|
||||||
|
|
||||||
|
exec("ruby_system.tcc_cntrl%d = tcc_cntrl" % i)
|
||||||
|
|
||||||
|
# connect all of the wire buffers between L3 and dirs up
|
||||||
|
# TCC cntrls added to the GPU cluster
|
||||||
|
gpuCluster.add(tcc_cntrl)
|
||||||
|
|
||||||
|
# Assuming no DMA devices
|
||||||
|
assert(len(dma_devices) == 0)
|
||||||
|
|
||||||
|
# Add cpu/gpu clusters to main cluster
|
||||||
|
mainCluster.add(cpuCluster)
|
||||||
|
mainCluster.add(gpuCluster)
|
||||||
|
|
||||||
|
ruby_system.network.number_of_virtual_networks = 10
|
||||||
|
|
||||||
|
return (cpu_sequencers, dir_cntrl_nodes, mainCluster)
|
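
The intBW/extBW arguments above are dimensionless link-bandwidth multipliers. Reading them against the script's own comments (a factor of 8 annotated as 16 GB/s, derived "assuming a 2GHz clock"), one factor unit appears to equal one byte per cycle. A standalone check of that reading, with the clock value as an explicit assumption:

CLOCK_GHZ = 2.0  # the "Assuming a 2GHz clock" in the script above

def factor_to_gbps(bw_factor, clock_ghz = CLOCK_GHZ):
    # 1 factor unit ~= 1 byte/cycle, so GB/s = bytes/cycle * Gcycles/s
    return bw_factor * clock_ghz

assert factor_to_gbps(8) == 16.0   # matches the "# 16 GB/s" comment
print(factor_to_gbps(16 * 4 * 1))  # 4 CUs with bw_scalor=1 -> 128.0 GB/s
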
configs/ruby/GPU_VIPER_Baseline.py (new file, 588 lines)
@@ -0,0 +1,588 @@
#
# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Sooraj Puthoor
#

import math
import m5
from m5.objects import *
from m5.defines import buildEnv
from Ruby import create_topology
from Ruby import send_evicts

from Cluster import Cluster
from Crossbar import Crossbar

class CntrlBase:
    _seqs = 0
    @classmethod
    def seqCount(cls):
        # Use SeqCount not class since we need global count
        CntrlBase._seqs += 1
        return CntrlBase._seqs - 1

    _cntrls = 0
    @classmethod
    def cntrlCount(cls):
        # Use CntlCount not class since we need global count
        CntrlBase._cntrls += 1
        return CntrlBase._cntrls - 1

    _version = 0
    @classmethod
    def versionCount(cls):
        cls._version += 1 # Use count for this particular type
        return cls._version - 1
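
A note on the counter idiom in CntrlBase: seqCount() and cntrlCount() deliberately bump attributes on CntrlBase itself, yielding globally unique IDs across all controller types, while versionCount() bumps cls._version, so each controller subclass numbers its own versions independently. A standalone sketch of the difference (Base/Dir/TCP are illustrative stand-ins, not the classes above):

class Base(object):
    _seqs = 0
    _version = 0

    @classmethod
    def seq_count(cls):
        Base._seqs += 1          # shared across every subclass
        return Base._seqs - 1

    @classmethod
    def version_count(cls):
        cls._version += 1        # lands on the subclass, not on Base
        return cls._version - 1

class Dir(Base): pass
class TCP(Base): pass

print(Dir.seq_count(), TCP.seq_count())          # 0 1  (global)
print(Dir.version_count(), TCP.version_count())  # 0 0  (per type)
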
class L1Cache(RubyCache):
    resourceStalls = False
    dataArrayBanks = 2
    tagArrayBanks = 2
    dataAccessLatency = 1
    tagAccessLatency = 1
    def create(self, size, assoc, options):
        self.size = MemorySize(size)
        self.assoc = assoc
        self.replacement_policy = PseudoLRUReplacementPolicy()

class L2Cache(RubyCache):
    resourceStalls = False
    assoc = 16
    dataArrayBanks = 16
    tagArrayBanks = 16
    def create(self, size, assoc, options):
        self.size = MemorySize(size)
        self.assoc = assoc
        self.replacement_policy = PseudoLRUReplacementPolicy()

class CPCntrl(CorePair_Controller, CntrlBase):

    def create(self, options, ruby_system, system):
        self.version = self.versionCount()

        self.L1Icache = L1Cache()
        self.L1Icache.create(options.l1i_size, options.l1i_assoc, options)
        self.L1D0cache = L1Cache()
        self.L1D0cache.create(options.l1d_size, options.l1d_assoc, options)
        self.L1D1cache = L1Cache()
        self.L1D1cache.create(options.l1d_size, options.l1d_assoc, options)
        self.L2cache = L2Cache()
        self.L2cache.create(options.l2_size, options.l2_assoc, options)

        self.sequencer = RubySequencer()
        self.sequencer.version = self.seqCount()
        self.sequencer.icache = self.L1Icache
        self.sequencer.dcache = self.L1D0cache
        self.sequencer.ruby_system = ruby_system
        self.sequencer.coreid = 0
        self.sequencer.is_cpu_sequencer = True

        self.sequencer1 = RubySequencer()
        self.sequencer1.version = self.seqCount()
        self.sequencer1.icache = self.L1Icache
        self.sequencer1.dcache = self.L1D1cache
        self.sequencer1.ruby_system = ruby_system
        self.sequencer1.coreid = 1
        self.sequencer1.is_cpu_sequencer = True

        self.issue_latency = options.cpu_to_dir_latency
        self.send_evictions = send_evicts(options)

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

class TCPCache(RubyCache):
    size = "16kB"
    assoc = 16
    dataArrayBanks = 16
    tagArrayBanks = 16
    dataAccessLatency = 4
    tagAccessLatency = 1
    def create(self, options):
        self.size = MemorySize(options.tcp_size)
        self.dataArrayBanks = 16
        self.tagArrayBanks = 16
        self.dataAccessLatency = 4
        self.tagAccessLatency = 1
        self.resourceStalls = options.no_tcc_resource_stalls
        self.replacement_policy = PseudoLRUReplacementPolicy()

class TCPCntrl(TCP_Controller, CntrlBase):

    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.L1cache = TCPCache()
        self.L1cache.create(options)
        self.issue_latency = 1

        self.coalescer = VIPERCoalescer()
        self.coalescer.version = self.seqCount()
        self.coalescer.icache = self.L1cache
        self.coalescer.dcache = self.L1cache
        self.coalescer.ruby_system = ruby_system
        self.coalescer.support_inst_reqs = False
        self.coalescer.is_cpu_sequencer = False

        self.sequencer = RubySequencer()
        self.sequencer.version = self.seqCount()
        self.sequencer.icache = self.L1cache
        self.sequencer.dcache = self.L1cache
        self.sequencer.ruby_system = ruby_system
        self.sequencer.is_cpu_sequencer = True

        self.use_seq_not_coal = False

        self.ruby_system = ruby_system
        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

class SQCCache(RubyCache):
    dataArrayBanks = 8
    tagArrayBanks = 8
    dataAccessLatency = 1
    tagAccessLatency = 1

    def create(self, options):
        self.size = MemorySize(options.sqc_size)
        self.assoc = options.sqc_assoc
        self.replacement_policy = PseudoLRUReplacementPolicy()

class SQCCntrl(SQC_Controller, CntrlBase):

    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.L1cache = SQCCache()
        self.L1cache.create(options)
        self.L1cache.resourceStalls = False
        self.sequencer = RubySequencer()
        self.sequencer.version = self.seqCount()
        self.sequencer.icache = self.L1cache
        self.sequencer.dcache = self.L1cache
        self.sequencer.ruby_system = ruby_system
        self.sequencer.support_data_reqs = False
        self.sequencer.is_cpu_sequencer = False
        self.ruby_system = ruby_system
        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

class TCC(RubyCache):
    size = MemorySize("256kB")
    assoc = 16
    dataAccessLatency = 8
    tagAccessLatency = 2
    resourceStalls = True
    def create(self, options):
        self.assoc = options.tcc_assoc
        if hasattr(options, 'bw_scalor') and options.bw_scalor > 0:
            s = options.num_compute_units
            tcc_size = s * 128
            tcc_size = str(tcc_size) + 'kB'
            self.size = MemorySize(tcc_size)
            self.dataArrayBanks = 64
            self.tagArrayBanks = 64
        else:
            self.size = MemorySize(options.tcc_size)
            self.dataArrayBanks = 256 / options.num_tccs # number of data banks
            self.tagArrayBanks = 256 / options.num_tccs # number of tag banks
        self.size.value = self.size.value / options.num_tccs
        if ((self.size.value / long(self.assoc)) < 128):
            self.size.value = long(128 * self.assoc)
        self.start_index_bit = math.log(options.cacheline_size, 2) + \
                               math.log(options.num_tccs, 2)
        self.replacement_policy = PseudoLRUReplacementPolicy()
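
Worked numbers for the TCC sizing logic above, on the non-bw_scalor path: the aggregate --tcc-size is split across num_tccs banks, each bank is clamped so a way still holds at least 128 bytes (two 64B sets), and the set index skips both the block-offset and the bank-select bits. A standalone recomputation (2MB aggregate, 16-way, and 64B lines are this file's defaults; num_tccs=4 is an arbitrary example):

import math

tcc_size = 2 * 1024 * 1024      # --tcc-size default, aggregate bytes
assoc = 16                      # --tcc-assoc default
num_tccs = 4                    # example bank count (power of two)
cacheline = 64

per_bank = tcc_size // num_tccs
if per_bank // assoc < 128:     # clamp: keep at least two sets per bank
    per_bank = 128 * assoc
start_index_bit = int(math.log(cacheline, 2) + math.log(num_tccs, 2))

print(per_bank)                 # 524288 bytes (512kB) per TCC bank
print(start_index_bit)          # 8: offset bits 0-5, bank-select bits 6-7
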
class TCCCntrl(TCC_Controller, CntrlBase):
    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.L2cache = TCC()
        self.L2cache.create(options)
        self.ruby_system = ruby_system
        self.L2cache.resourceStalls = options.no_tcc_resource_stalls

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

class L3Cache(RubyCache):
    dataArrayBanks = 16
    tagArrayBanks = 16

    def create(self, options, ruby_system, system):
        self.size = MemorySize(options.l3_size)
        self.size.value /= options.num_dirs
        self.assoc = options.l3_assoc
        self.dataArrayBanks /= options.num_dirs
        self.tagArrayBanks /= options.num_dirs
        self.dataAccessLatency = options.l3_data_latency
        self.tagAccessLatency = options.l3_tag_latency
        self.resourceStalls = False
        self.replacement_policy = PseudoLRUReplacementPolicy()

class ProbeFilter(RubyCache):
    size = "4MB"
    assoc = 16
    dataArrayBanks = 256
    tagArrayBanks = 256

    def create(self, options, ruby_system, system):
        self.block_size = "%dB" % (64 * options.blocks_per_region)
        self.size = options.region_dir_entries * \
                    self.block_size * options.num_compute_units
        self.assoc = 8
        self.tagArrayBanks = 8
        self.tagAccessLatency = options.dir_tag_latency
        self.dataAccessLatency = 1
        self.resourceStalls = options.no_resource_stalls
        self.start_index_bit = 6 + int(math.log(options.blocks_per_region, 2))
        self.replacement_policy = PseudoLRUReplacementPolicy()
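
To make the ProbeFilter sizing above concrete: the filter is sized to cover every region the compute units can collectively cache, and it indexes at region granularity via start_index_bit. A standalone recomputation using this file's defaults (--region-dir-entries=8192, --blocks-per-region=1) and 4 CUs as an example count:

import math

region_dir_entries = 8192   # --region-dir-entries default below
blocks_per_region = 1       # --blocks-per-region default below
num_compute_units = 4       # example CU count

block_size = 64 * blocks_per_region                        # bytes per region
size = region_dir_entries * block_size * num_compute_units
start_index_bit = 6 + int(math.log(blocks_per_region, 2))

print(size)             # 2097152 bytes (2MB) of filter coverage
print(start_index_bit)  # 6: one-line regions index right past the offset
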
class L3Cntrl(L3Cache_Controller, CntrlBase):
    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.L3cache = L3Cache()
        self.L3cache.create(options, ruby_system, system)
        self.l3_response_latency = \
            max(self.L3cache.dataAccessLatency, self.L3cache.tagAccessLatency)
        self.ruby_system = ruby_system
        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

    def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
                           req_to_l3, probe_to_l3, resp_to_l3):
        self.reqToDir = req_to_dir
        self.respToDir = resp_to_dir
        self.l3UnblockToDir = l3_unblock_to_dir
        self.reqToL3 = req_to_l3
        self.probeToL3 = probe_to_l3
        self.respToL3 = resp_to_l3

class DirMem(RubyDirectoryMemory, CntrlBase):
    def create(self, options, ruby_system, system):
        self.version = self.versionCount()

        phys_mem_size = AddrRange(options.mem_size).size()
        mem_module_size = phys_mem_size / options.num_dirs
        dir_size = MemorySize('0B')
        dir_size.value = mem_module_size
        self.size = dir_size
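
The DirMem sizing above simply carves the physical address space evenly across the directories, so each instance tracks mem_size/num_dirs bytes. A trivial standalone check with example values (512MB of memory and 2 directories are illustrative, not defaults):

phys_mem_size = 512 * 1024 * 1024    # e.g. --mem-size=512MB
num_dirs = 2

mem_module_size = phys_mem_size // num_dirs
print(mem_module_size)               # 268435456 bytes (256MB) per directory
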
class DirCntrl(Directory_Controller, CntrlBase):
    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.response_latency = 30
        self.directory = DirMem()
        self.directory.create(options, ruby_system, system)
        self.L3CacheMemory = L3Cache()
        self.L3CacheMemory.create(options, ruby_system, system)
        self.ProbeFilterMemory = ProbeFilter()
        self.ProbeFilterMemory.create(options, ruby_system, system)
        self.l3_hit_latency = \
            max(self.L3CacheMemory.dataAccessLatency,
                self.L3CacheMemory.tagAccessLatency)

        self.ruby_system = ruby_system
        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

    def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
                           req_to_l3, probe_to_l3, resp_to_l3):
        self.reqToDir = req_to_dir
        self.respToDir = resp_to_dir
        self.l3UnblockToDir = l3_unblock_to_dir
        self.reqToL3 = req_to_l3
        self.probeToL3 = probe_to_l3
        self.respToL3 = resp_to_l3

def define_options(parser):
    parser.add_option("--num-subcaches", type = "int", default = 4)
    parser.add_option("--l3-data-latency", type = "int", default = 20)
    parser.add_option("--l3-tag-latency", type = "int", default = 15)
    parser.add_option("--cpu-to-dir-latency", type = "int", default = 120)
    parser.add_option("--gpu-to-dir-latency", type = "int", default = 120)
    parser.add_option("--no-resource-stalls", action = "store_false",
                      default = True)
    parser.add_option("--no-tcc-resource-stalls", action = "store_false",
                      default = True)
    parser.add_option("--num-tbes", type = "int", default = 2560)
    parser.add_option("--l2-latency", type = "int", default = 50) # load to use
    parser.add_option("--num-tccs", type = "int", default = 1,
                      help = "number of TCC banks in the GPU")
    parser.add_option("--sqc-size", type = 'string', default = '32kB',
                      help = "SQC cache size")
    parser.add_option("--sqc-assoc", type = 'int', default = 8,
                      help = "SQC cache associativity")
    parser.add_option("--region-dir-entries", type = "int", default = 8192)
    parser.add_option("--dir-tag-latency", type = "int", default = 8)
    parser.add_option("--dir-tag-banks", type = "int", default = 4)
    parser.add_option("--blocks-per-region", type = "int", default = 1)
    parser.add_option("--use-L3-on-WT", action = "store_true", default = False)
    parser.add_option("--nonInclusiveDir", action = "store_true",
                      default = False)
    parser.add_option("--WB_L1", action = "store_true",
                      default = False, help = "writeback L1")
    parser.add_option("--WB_L2", action = "store_true",
                      default = False, help = "writeback L2")
    parser.add_option("--TCP_latency", type = "int",
                      default = 4, help = "TCP latency")
    parser.add_option("--TCC_latency", type = "int",
                      default = 16, help = "TCC latency")
    parser.add_option("--tcc-size", type = 'string', default = '2MB',
                      help = "aggregate TCC size")
    parser.add_option("--tcc-assoc", type = 'int', default = 16,
                      help = "TCC associativity")
    parser.add_option("--tcp-size", type = 'string', default = '16kB',
                      help = "TCP size")
    parser.add_option("--sampler-sets", type = "int", default = 1024)
    parser.add_option("--sampler-assoc", type = "int", default = 16)
    parser.add_option("--sampler-counter", type = "int", default = 512)
    parser.add_option("--noL1", action = "store_true", default = False,
                      help = "bypass the L1 (TCP) cache")
    parser.add_option("--noL2", action = "store_true", default = False,
                      help = "bypass the L2 (TCC) cache")
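
create_system below derives TCC_select_num_bits as log2(num_tccs) and hands it to every TCP, SQC, and directory controller. My reading (the authoritative use is in the SLICC protocol sources, not this config file) is that those bits, sitting just above the 64B block offset, pick which TCC bank a line maps to. A standalone sketch of that mapping, with the bit positions as an assumption:

import math

def tcc_bank(addr, num_tccs, line_bytes = 64):
    # bank-select bits assumed to sit immediately above the block offset
    offset_bits = int(math.log(line_bytes, 2))
    select_bits = int(math.log(num_tccs, 2))
    return (addr >> offset_bits) & ((1 << select_bits) - 1)

for addr in (0x0000, 0x0040, 0x0080, 0x00c0, 0x0100):
    print(hex(addr), tcc_bank(addr, num_tccs = 4))  # banks 0, 1, 2, 3, 0
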
def create_system(options, full_system, system, dma_devices, ruby_system):
    if buildEnv['PROTOCOL'] != 'GPU_VIPER_Baseline':
        panic("This script requires the "
              "GPU_VIPER_Baseline protocol to be built.")

    cpu_sequencers = []

    #
    # The ruby network creation expects the list of nodes in the system to be
    # consistent with the NetDest list. Therefore the l1 controller nodes
    # must be listed before the directory nodes and directory nodes before
    # dma nodes, etc.
    #
    cp_cntrl_nodes = []
    tcp_cntrl_nodes = []
    sqc_cntrl_nodes = []
    tcc_cntrl_nodes = []
    dir_cntrl_nodes = []
    l3_cntrl_nodes = []

    #
    # Must create the individual controllers before the network to ensure the
    # controller constructors are called before the network constructor
    #

    # For an odd number of CPUs, still create the right number of controllers
    TCC_bits = int(math.log(options.num_tccs, 2))

    # This is the base crossbar that connects the L3s, Dirs, and cpu/gpu
    # Clusters
    crossbar_bw = 16 * options.num_compute_units # Assuming a 2GHz clock
    mainCluster = Cluster(intBW = crossbar_bw)
    for i in xrange(options.num_dirs):

        dir_cntrl = DirCntrl(noTCCdir = True, TCC_select_num_bits = TCC_bits)
        dir_cntrl.create(options, ruby_system, system)
        dir_cntrl.number_of_TBEs = options.num_tbes
        dir_cntrl.useL3OnWT = options.use_L3_on_WT
        dir_cntrl.inclusiveDir = not options.nonInclusiveDir

        # Connect the Directory controller to the ruby network
        dir_cntrl.requestFromCores = MessageBuffer(ordered = True)
        dir_cntrl.requestFromCores.slave = ruby_system.network.master

        dir_cntrl.responseFromCores = MessageBuffer()
        dir_cntrl.responseFromCores.slave = ruby_system.network.master

        dir_cntrl.unblockFromCores = MessageBuffer()
        dir_cntrl.unblockFromCores.slave = ruby_system.network.master

        dir_cntrl.probeToCore = MessageBuffer()
        dir_cntrl.probeToCore.master = ruby_system.network.slave

        dir_cntrl.responseToCore = MessageBuffer()
        dir_cntrl.responseToCore.master = ruby_system.network.slave

        dir_cntrl.triggerQueue = MessageBuffer(ordered = True)
        dir_cntrl.L3triggerQueue = MessageBuffer(ordered = True)
        dir_cntrl.responseFromMemory = MessageBuffer()

        exec("system.dir_cntrl%d = dir_cntrl" % i)
        dir_cntrl_nodes.append(dir_cntrl)
        mainCluster.add(dir_cntrl)

    cpuCluster = Cluster(extBW = crossbar_bw, intBW = crossbar_bw)
    for i in xrange((options.num_cpus + 1) / 2):

        cp_cntrl = CPCntrl()
        cp_cntrl.create(options, ruby_system, system)

        exec("system.cp_cntrl%d = cp_cntrl" % i)
        #
        # Add controllers and sequencers to the appropriate lists
        #
        cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1])

        # Connect the CP controllers and the network
        cp_cntrl.requestFromCore = MessageBuffer()
        cp_cntrl.requestFromCore.master = ruby_system.network.slave

        cp_cntrl.responseFromCore = MessageBuffer()
        cp_cntrl.responseFromCore.master = ruby_system.network.slave

        cp_cntrl.unblockFromCore = MessageBuffer()
        cp_cntrl.unblockFromCore.master = ruby_system.network.slave

        cp_cntrl.probeToCore = MessageBuffer()
        cp_cntrl.probeToCore.slave = ruby_system.network.master

        cp_cntrl.responseToCore = MessageBuffer()
        cp_cntrl.responseToCore.slave = ruby_system.network.master

        cp_cntrl.mandatoryQueue = MessageBuffer()
        cp_cntrl.triggerQueue = MessageBuffer(ordered = True)

        cpuCluster.add(cp_cntrl)

    gpuCluster = Cluster(extBW = crossbar_bw, intBW = crossbar_bw)
    for i in xrange(options.num_compute_units):

        tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
                             issue_latency = 1,
                             number_of_TBEs = 2560)
        # TBEs set to max outstanding requests
        tcp_cntrl.create(options, ruby_system, system)
        tcp_cntrl.WB = options.WB_L1
        tcp_cntrl.disableL1 = options.noL1

        exec("system.tcp_cntrl%d = tcp_cntrl" % i)
        #
        # Add controllers and sequencers to the appropriate lists
        #
        cpu_sequencers.append(tcp_cntrl.coalescer)
        tcp_cntrl_nodes.append(tcp_cntrl)

        # Connect the CP (TCP) controllers to the ruby network
        tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True)
        tcp_cntrl.requestFromTCP.master = ruby_system.network.slave

        tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True)
        tcp_cntrl.responseFromTCP.master = ruby_system.network.slave

        tcp_cntrl.unblockFromCore = MessageBuffer()
        tcp_cntrl.unblockFromCore.master = ruby_system.network.slave

        tcp_cntrl.probeToTCP = MessageBuffer(ordered = True)
        tcp_cntrl.probeToTCP.slave = ruby_system.network.master

        tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
        tcp_cntrl.responseToTCP.slave = ruby_system.network.master

        tcp_cntrl.mandatoryQueue = MessageBuffer()

        gpuCluster.add(tcp_cntrl)

    for i in xrange(options.num_sqc):

        sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
        sqc_cntrl.create(options, ruby_system, system)

        exec("system.sqc_cntrl%d = sqc_cntrl" % i)
        #
        # Add controllers and sequencers to the appropriate lists
        #
        cpu_sequencers.append(sqc_cntrl.sequencer)

        # Connect the SQC controller to the ruby network
        sqc_cntrl.requestFromSQC = MessageBuffer(ordered = True)
        sqc_cntrl.requestFromSQC.master = ruby_system.network.slave

        sqc_cntrl.probeToSQC = MessageBuffer(ordered = True)
        sqc_cntrl.probeToSQC.slave = ruby_system.network.master

        sqc_cntrl.responseToSQC = MessageBuffer(ordered = True)
        sqc_cntrl.responseToSQC.slave = ruby_system.network.master

        sqc_cntrl.mandatoryQueue = MessageBuffer()

        # SQC also in GPU cluster
        gpuCluster.add(sqc_cntrl)

    # Because of wire buffers, num_tccs must equal num_tccdirs
    numa_bit = 6

    for i in xrange(options.num_tccs):

        tcc_cntrl = TCCCntrl()
        tcc_cntrl.create(options, ruby_system, system)
        tcc_cntrl.l2_request_latency = options.gpu_to_dir_latency
        tcc_cntrl.l2_response_latency = options.TCC_latency
        tcc_cntrl_nodes.append(tcc_cntrl)
        tcc_cntrl.WB = options.WB_L2
        tcc_cntrl.number_of_TBEs = 2560 * options.num_compute_units

        # Connect the TCC controllers to the ruby network
        tcc_cntrl.requestFromTCP = MessageBuffer(ordered = True)
        tcc_cntrl.requestFromTCP.slave = ruby_system.network.master

        tcc_cntrl.responseToCore = MessageBuffer(ordered = True)
        tcc_cntrl.responseToCore.master = ruby_system.network.slave

        tcc_cntrl.probeFromNB = MessageBuffer()
        tcc_cntrl.probeFromNB.slave = ruby_system.network.master

        tcc_cntrl.responseFromNB = MessageBuffer()
        tcc_cntrl.responseFromNB.slave = ruby_system.network.master

        tcc_cntrl.requestToNB = MessageBuffer(ordered = True)
        tcc_cntrl.requestToNB.master = ruby_system.network.slave

        tcc_cntrl.responseToNB = MessageBuffer()
        tcc_cntrl.responseToNB.master = ruby_system.network.slave

        tcc_cntrl.unblockToNB = MessageBuffer()
        tcc_cntrl.unblockToNB.master = ruby_system.network.slave

        tcc_cntrl.triggerQueue = MessageBuffer(ordered = True)

        exec("system.tcc_cntrl%d = tcc_cntrl" % i)
        # connect all of the wire buffers between L3 and dirs up
        # TCC cntrls added to the GPU cluster
        gpuCluster.add(tcc_cntrl)

    # Assuming no DMA devices
    assert(len(dma_devices) == 0)

    # Add cpu/gpu clusters to main cluster
    mainCluster.add(cpuCluster)
    mainCluster.add(gpuCluster)

    ruby_system.network.number_of_virtual_networks = 10

    return (cpu_sequencers, dir_cntrl_nodes, mainCluster)
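
A style note on the exec("system.dir_cntrl%d = dir_cntrl" % i) idiom used throughout these scripts: it only attaches a numbered child attribute, and setattr expresses the same thing without building a code string. A standalone sketch of the equivalence (plain objects standing in for the SimObjects):

class Namespace(object):
    pass

system = Namespace()
for i, dir_cntrl in enumerate(["dir_a", "dir_b"]):
    # equivalent to: exec("system.dir_cntrl%d = dir_cntrl" % i)
    setattr(system, "dir_cntrl%d" % i, dir_cntrl)

assert system.dir_cntrl0 == "dir_a"
assert system.dir_cntrl1 == "dir_b"
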
configs/ruby/GPU_VIPER_Region.py (new file, 758 lines)
@@ -0,0 +1,758 @@
#
# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Sooraj Puthoor
#

import math
import m5
from m5.objects import *
from m5.defines import buildEnv
from Ruby import send_evicts

from Cluster import Cluster

class CntrlBase:
    _seqs = 0
    @classmethod
    def seqCount(cls):
        # Use SeqCount not class since we need global count
        CntrlBase._seqs += 1
        return CntrlBase._seqs - 1

    _cntrls = 0
    @classmethod
    def cntrlCount(cls):
        # Use CntlCount not class since we need global count
        CntrlBase._cntrls += 1
        return CntrlBase._cntrls - 1

    _version = 0
    @classmethod
    def versionCount(cls):
        cls._version += 1 # Use count for this particular type
        return cls._version - 1

#
# Note: the L1 cache latency is only used by the sequencer on fast-path hits
#
class L1Cache(RubyCache):
    resourceStalls = False
    dataArrayBanks = 2
    tagArrayBanks = 2
    dataAccessLatency = 1
    tagAccessLatency = 1
    def create(self, size, assoc, options):
        self.size = MemorySize(size)
        self.assoc = assoc
        self.replacement_policy = PseudoLRUReplacementPolicy()

class L2Cache(RubyCache):
    resourceStalls = False
    assoc = 16
    dataArrayBanks = 16
    tagArrayBanks = 16
    def create(self, size, assoc, options):
        self.size = MemorySize(size)
        self.assoc = assoc
        self.replacement_policy = PseudoLRUReplacementPolicy()

class CPCntrl(CorePair_Controller, CntrlBase):

    def create(self, options, ruby_system, system):
        self.version = self.versionCount()

        self.L1Icache = L1Cache()
        self.L1Icache.create(options.l1i_size, options.l1i_assoc, options)
        self.L1D0cache = L1Cache()
        self.L1D0cache.create(options.l1d_size, options.l1d_assoc, options)
        self.L1D1cache = L1Cache()
        self.L1D1cache.create(options.l1d_size, options.l1d_assoc, options)
        self.L2cache = L2Cache()
        self.L2cache.create(options.l2_size, options.l2_assoc, options)

        self.sequencer = RubySequencer()
        self.sequencer.version = self.seqCount()
        self.sequencer.icache = self.L1Icache
        self.sequencer.dcache = self.L1D0cache
        self.sequencer.ruby_system = ruby_system
        self.sequencer.coreid = 0
        self.sequencer.is_cpu_sequencer = True

        self.sequencer1 = RubySequencer()
        self.sequencer1.version = self.seqCount()
        self.sequencer1.icache = self.L1Icache
        self.sequencer1.dcache = self.L1D1cache
        self.sequencer1.ruby_system = ruby_system
        self.sequencer1.coreid = 1
        self.sequencer1.is_cpu_sequencer = True

        self.issue_latency = 1
        self.send_evictions = send_evicts(options)

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

class TCPCache(RubyCache):
    size = "16kB"
    assoc = 16
    dataArrayBanks = 16
    tagArrayBanks = 16
    dataAccessLatency = 4
    tagAccessLatency = 1
    def create(self, options):
        self.size = MemorySize(options.tcp_size)
        self.dataArrayBanks = 16
        self.tagArrayBanks = 16
        self.dataAccessLatency = 4
        self.tagAccessLatency = 1
        self.resourceStalls = options.no_tcc_resource_stalls
        self.replacement_policy = PseudoLRUReplacementPolicy(assoc = self.assoc)

class TCPCntrl(TCP_Controller, CntrlBase):

    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.L1cache = TCPCache(dataAccessLatency = options.TCP_latency)
        self.L1cache.create(options)
        self.issue_latency = 1

        self.coalescer = VIPERCoalescer()
        self.coalescer.version = self.seqCount()
        self.coalescer.icache = self.L1cache
        self.coalescer.dcache = self.L1cache
        self.coalescer.ruby_system = ruby_system
        self.coalescer.support_inst_reqs = False
        self.coalescer.is_cpu_sequencer = False

        self.sequencer = RubySequencer()
        self.sequencer.version = self.seqCount()
        self.sequencer.icache = self.L1cache
        self.sequencer.dcache = self.L1cache
        self.sequencer.ruby_system = ruby_system
        self.sequencer.is_cpu_sequencer = True

        self.use_seq_not_coal = False

        self.ruby_system = ruby_system
        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

class SQCCache(RubyCache):
    dataArrayBanks = 8
    tagArrayBanks = 8
    dataAccessLatency = 1
    tagAccessLatency = 1

    def create(self, options):
        self.size = MemorySize(options.sqc_size)
        self.assoc = options.sqc_assoc
        self.replacement_policy = PseudoLRUReplacementPolicy(assoc = self.assoc)

class SQCCntrl(SQC_Controller, CntrlBase):

    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.L1cache = SQCCache()
        self.L1cache.create(options)
        self.L1cache.resourceStalls = False
        self.sequencer = RubySequencer()
        self.sequencer.version = self.seqCount()
        self.sequencer.icache = self.L1cache
        self.sequencer.dcache = self.L1cache
        self.sequencer.ruby_system = ruby_system
        self.sequencer.support_data_reqs = False
        self.sequencer.is_cpu_sequencer = False
        self.ruby_system = ruby_system
        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

class TCC(RubyCache):
    size = MemorySize("256kB")
    assoc = 16
    dataAccessLatency = 8
    tagAccessLatency = 2
    resourceStalls = False
    def create(self, options):
        self.assoc = options.tcc_assoc
        if hasattr(options, 'bw_scalor') and options.bw_scalor > 0:
            s = options.num_compute_units
            tcc_size = s * 128
            tcc_size = str(tcc_size) + 'kB'
            self.size = MemorySize(tcc_size)
            self.dataArrayBanks = 64
            self.tagArrayBanks = 64
        else:
            self.size = MemorySize(options.tcc_size)
            self.dataArrayBanks = 256 / options.num_tccs # number of data banks
            self.tagArrayBanks = 256 / options.num_tccs # number of tag banks
        self.size.value = self.size.value / options.num_tccs
        if ((self.size.value / long(self.assoc)) < 128):
            self.size.value = long(128 * self.assoc)
        self.start_index_bit = math.log(options.cacheline_size, 2) + \
                               math.log(options.num_tccs, 2)
        self.replacement_policy = PseudoLRUReplacementPolicy(assoc = self.assoc)

class TCCCntrl(TCC_Controller, CntrlBase):
    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.L2cache = TCC()
        self.L2cache.create(options)
        self.ruby_system = ruby_system
        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

class L3Cache(RubyCache):
    dataArrayBanks = 16
    tagArrayBanks = 16

    def create(self, options, ruby_system, system):
        self.size = MemorySize(options.l3_size)
        self.size.value /= options.num_dirs
        self.assoc = options.l3_assoc
        self.dataArrayBanks /= options.num_dirs
        self.tagArrayBanks /= options.num_dirs
        self.dataAccessLatency = options.l3_data_latency
        self.tagAccessLatency = options.l3_tag_latency
        self.resourceStalls = False
        self.replacement_policy = PseudoLRUReplacementPolicy(assoc = self.assoc)

class L3Cntrl(L3Cache_Controller, CntrlBase):
    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.L3cache = L3Cache()
        self.L3cache.create(options, ruby_system, system)
        self.l3_response_latency = \
            max(self.L3cache.dataAccessLatency, self.L3cache.tagAccessLatency)
        self.ruby_system = ruby_system
        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

    def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
                           req_to_l3, probe_to_l3, resp_to_l3):
        self.reqToDir = req_to_dir
        self.respToDir = resp_to_dir
        self.l3UnblockToDir = l3_unblock_to_dir
        self.reqToL3 = req_to_l3
        self.probeToL3 = probe_to_l3
        self.respToL3 = resp_to_l3

# Directory memory: a directory memory of infinite size used by the
# directory controller to store the "state" of its state machine, which
# is implemented per cache block.
class DirMem(RubyDirectoryMemory, CntrlBase):
    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        phys_mem_size = AddrRange(options.mem_size).size()
        mem_module_size = phys_mem_size / options.num_dirs
        dir_size = MemorySize('0B')
        dir_size.value = mem_module_size
        self.size = dir_size

# Directory controller: contains directory memory, the L3 cache, and the
# associated state machine used to redirect a data request to the L3
# cache or to memory. Permission requests do not come to this directory
# in region-based protocols; they are handled exclusively by the region
# directory. The region directory controller does, however, use this
# controller for sending probe requests and receiving probe responses.
class DirCntrl(Directory_Controller, CntrlBase):
    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.response_latency = 25
        self.response_latency_regionDir = 1
        self.directory = DirMem()
        self.directory.create(options, ruby_system, system)
        self.L3CacheMemory = L3Cache()
        self.L3CacheMemory.create(options, ruby_system, system)
        self.l3_hit_latency = \
            max(self.L3CacheMemory.dataAccessLatency,
                self.L3CacheMemory.tagAccessLatency)

        self.ruby_system = ruby_system
        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

    def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
                           req_to_l3, probe_to_l3, resp_to_l3):
        self.reqToDir = req_to_dir
        self.respToDir = resp_to_dir
        self.l3UnblockToDir = l3_unblock_to_dir
        self.reqToL3 = req_to_l3
        self.probeToL3 = probe_to_l3
        self.respToL3 = resp_to_l3

# Region directory: stores region permissions
class RegionDir(RubyCache):

    def create(self, options, ruby_system, system):
        self.block_size = "%dB" % (64 * options.blocks_per_region)
        self.size = options.region_dir_entries * \
                    self.block_size * options.num_compute_units
        self.assoc = 8
        self.tagArrayBanks = 8
        self.tagAccessLatency = options.dir_tag_latency
        self.dataAccessLatency = 1
        self.resourceStalls = options.no_resource_stalls
        self.start_index_bit = 6 + int(math.log(options.blocks_per_region, 2))
        self.replacement_policy = PseudoLRUReplacementPolicy(assoc = self.assoc)

# Region directory controller: contains the region directory and the
# state machine that handles region coherence requests.
class RegionCntrl(RegionDir_Controller, CntrlBase):
    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.cacheMemory = RegionDir()
        self.cacheMemory.create(options, ruby_system, system)
        self.blocksPerRegion = options.blocks_per_region
        self.toDirLatency = \
            max(self.cacheMemory.dataAccessLatency,
                self.cacheMemory.tagAccessLatency)
        self.ruby_system = ruby_system
        self.always_migrate = options.always_migrate
        self.sym_migrate = options.symmetric_migrate
        self.asym_migrate = options.asymmetric_migrate
        if self.always_migrate:
            assert(not self.asym_migrate and not self.sym_migrate)
        if self.sym_migrate:
            assert(not self.always_migrate and not self.asym_migrate)
        if self.asym_migrate:
            assert(not self.always_migrate and not self.sym_migrate)
        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

# Region buffer: a cache of the region directory that avoids a
# potentially long-latency region directory lookup when fetching
# region permissions.
class RegionBuffer(RubyCache):
    assoc = 4
    dataArrayBanks = 256
    tagArrayBanks = 256
    dataAccessLatency = 1
    tagAccessLatency = 1
    resourceStalls = True

class RBCntrl(RegionBuffer_Controller, CntrlBase):
    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.cacheMemory = RegionBuffer()
        self.cacheMemory.resourceStalls = options.no_tcc_resource_stalls
        self.cacheMemory.dataArrayBanks = 64
        self.cacheMemory.tagArrayBanks = 64
        self.blocksPerRegion = options.blocks_per_region
        self.cacheMemory.block_size = "%dB" % (64 * self.blocksPerRegion)
        self.cacheMemory.start_index_bit = \
            6 + int(math.log(self.blocksPerRegion, 2))
        self.cacheMemory.size = options.region_buffer_entries * \
                                self.cacheMemory.block_size * \
                                options.num_compute_units
        self.toDirLatency = options.gpu_to_dir_latency
        self.toRegionDirLatency = options.cpu_to_dir_latency
        self.noTCCdir = True
        TCC_bits = int(math.log(options.num_tccs, 2))
        self.TCC_select_num_bits = TCC_bits
        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency
        self.cacheMemory.replacement_policy = \
            PseudoLRUReplacementPolicy(assoc = self.cacheMemory.assoc)
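
To make the RegionBuffer/RegionDir relation above concrete: both track regions of blocks_per_region 64B lines, the buffer is sized from --region-buffer-entries and the directory from --region-dir-entries (defaults 512 and 1024 in define_options below), and both index at region granularity via start_index_bit. A standalone recomputation with those defaults, using 4 CUs as an example count:

import math

blocks_per_region = 16          # --blocks-per-region default below
region_dir_entries = 1024       # --region-dir-entries default below
region_buffer_entries = 512     # --region-buffer-entries default below
num_compute_units = 4           # example CU count

block_size = 64 * blocks_per_region                  # 1024 bytes per region
dir_size = region_dir_entries * block_size * num_compute_units
buf_size = region_buffer_entries * block_size * num_compute_units
start_index_bit = 6 + int(math.log(blocks_per_region, 2))

print(dir_size, buf_size)   # 4194304 2097152: buffer covers half the dir
print(start_index_bit)      # 10: index skips offset plus region bits
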
def define_options(parser):
    parser.add_option("--num-subcaches", type="int", default=4)
    parser.add_option("--l3-data-latency", type="int", default=20)
    parser.add_option("--l3-tag-latency", type="int", default=15)
    parser.add_option("--cpu-to-dir-latency", type="int", default=120)
    parser.add_option("--gpu-to-dir-latency", type="int", default=60)
    parser.add_option("--no-resource-stalls", action="store_false",
                      default=True)
    parser.add_option("--no-tcc-resource-stalls", action="store_false",
                      default=True)
    parser.add_option("--num-tbes", type="int", default=32)
    parser.add_option("--l2-latency", type="int", default=50) # load to use
    parser.add_option("--num-tccs", type="int", default=1,
                      help="number of TCC banks in the GPU")

    parser.add_option("--sqc-size", type='string', default='32kB',
                      help="SQC cache size")
    parser.add_option("--sqc-assoc", type='int', default=8,
                      help="SQC cache associativity")

    parser.add_option("--WB_L1", action="store_true",
                      default=False, help="L1 writeback cache")
    parser.add_option("--WB_L2", action="store_true",
                      default=False, help="L2 writeback cache")
    parser.add_option("--TCP_latency",
                      type="int", default=4, help="TCP latency")
    parser.add_option("--TCC_latency",
                      type="int", default=16, help="TCC latency")
    parser.add_option("--tcc-size", type='string', default='2MB',
                      help="aggregate TCC size")
    parser.add_option("--tcc-assoc", type='int', default=16,
                      help="TCC associativity")
    parser.add_option("--tcp-size", type='string', default='16kB',
                      help="TCP size")

    parser.add_option("--dir-tag-latency", type="int", default=4)
    parser.add_option("--dir-tag-banks", type="int", default=4)
    parser.add_option("--blocks-per-region", type="int", default=16)
    parser.add_option("--dir-entries", type="int", default=8192)

    # The region buffer is a cache of the region directory, so the region
    # directory is inclusive with respect to the region buffer. The region
    # directory, however, is non-inclusive with respect to the caches in
    # the system.
    parser.add_option("--region-dir-entries", type="int", default=1024)
    parser.add_option("--region-buffer-entries", type="int", default=512)

    parser.add_option("--always-migrate",
                      action="store_true", default=False)
    parser.add_option("--symmetric-migrate",
                      action="store_true", default=False)
    parser.add_option("--asymmetric-migrate",
                      action="store_true", default=False)
    parser.add_option("--use-L3-on-WT", action="store_true", default=False)
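
create_system below instantiates one CPCntrl (plus one RBCntrl) per core pair, using (num_cpus + 1) / 2 so an odd CPU count still gets a controller for the unpaired core; the same idiom appears in the other two scripts. A one-line check of the ceiling behavior:

for num_cpus in (1, 2, 3, 4, 5):
    print(num_cpus, (num_cpus + 1) // 2)   # core pairs: 1, 1, 2, 2, 3
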
def create_system(options, full_system, system, dma_devices, ruby_system):
|
||||||
|
if buildEnv['PROTOCOL'] != 'GPU_VIPER_Region':
|
||||||
|
panic("This script requires the GPU_VIPER_Region protocol to be built.")
|
||||||
|
|
||||||
|
cpu_sequencers = []
|
||||||
|
|
||||||
|
#
|
||||||
|
# The ruby network creation expects the list of nodes in the system to be
|
||||||
|
# consistent with the NetDest list. Therefore the l1 controller nodes
|
||||||
|
# must be listed before the directory nodes and directory nodes before
|
||||||
|
# dma nodes, etc.
|
||||||
|
#
|
||||||
|
dir_cntrl_nodes = []
|
||||||
|
|
||||||
|
# For an odd number of CPUs, still create the right number of controllers
|
||||||
|
TCC_bits = int(math.log(options.num_tccs, 2))
|
||||||
|
|
||||||
|
#
|
||||||
|
# Must create the individual controllers before the network to ensure the
|
||||||
|
# controller constructors are called before the network constructor
|
||||||
|
#
|
||||||
|
|
||||||
|
# For an odd number of CPUs, still create the right number of controllers
|
||||||
|
crossbar_bw = 16 * options.num_compute_units #Assuming a 2GHz clock
|
||||||
|
    cpuCluster = Cluster(extBW = crossbar_bw, intBW = crossbar_bw)
    for i in xrange((options.num_cpus + 1) / 2):

        cp_cntrl = CPCntrl()
        cp_cntrl.create(options, ruby_system, system)

        rb_cntrl = RBCntrl()
        rb_cntrl.create(options, ruby_system, system)
        rb_cntrl.number_of_TBEs = 256
        rb_cntrl.isOnCPU = True

        cp_cntrl.regionBufferNum = rb_cntrl.version

        exec("system.cp_cntrl%d = cp_cntrl" % i)
        exec("system.rb_cntrl%d = rb_cntrl" % i)
        #
        # Add controllers and sequencers to the appropriate lists
        #
        cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1])

        # Connect the CP controllers and the network
        cp_cntrl.requestFromCore = MessageBuffer()
        cp_cntrl.requestFromCore.master = ruby_system.network.slave

        cp_cntrl.responseFromCore = MessageBuffer()
        cp_cntrl.responseFromCore.master = ruby_system.network.slave

        cp_cntrl.unblockFromCore = MessageBuffer()
        cp_cntrl.unblockFromCore.master = ruby_system.network.slave

        cp_cntrl.probeToCore = MessageBuffer()
        cp_cntrl.probeToCore.slave = ruby_system.network.master

        cp_cntrl.responseToCore = MessageBuffer()
        cp_cntrl.responseToCore.slave = ruby_system.network.master

        cp_cntrl.mandatoryQueue = MessageBuffer()
        cp_cntrl.triggerQueue = MessageBuffer(ordered = True)

        # Connect the RB controllers to the ruby network
        rb_cntrl.requestFromCore = MessageBuffer(ordered = True)
        rb_cntrl.requestFromCore.slave = ruby_system.network.master

        rb_cntrl.responseFromCore = MessageBuffer()
        rb_cntrl.responseFromCore.slave = ruby_system.network.master

        rb_cntrl.requestToNetwork = MessageBuffer()
        rb_cntrl.requestToNetwork.master = ruby_system.network.slave

        rb_cntrl.notifyFromRegionDir = MessageBuffer()
        rb_cntrl.notifyFromRegionDir.slave = ruby_system.network.master

        rb_cntrl.probeFromRegionDir = MessageBuffer()
        rb_cntrl.probeFromRegionDir.slave = ruby_system.network.master

        rb_cntrl.unblockFromDir = MessageBuffer()
        rb_cntrl.unblockFromDir.slave = ruby_system.network.master

        rb_cntrl.responseToRegDir = MessageBuffer()
        rb_cntrl.responseToRegDir.master = ruby_system.network.slave

        rb_cntrl.triggerQueue = MessageBuffer(ordered = True)

        cpuCluster.add(cp_cntrl)
        cpuCluster.add(rb_cntrl)

    gpuCluster = Cluster(extBW = crossbar_bw, intBW = crossbar_bw)
    for i in xrange(options.num_compute_units):

        tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
                             issue_latency = 1,
                             number_of_TBEs = 2560)
        # TBEs set to max outstanding requests
        tcp_cntrl.create(options, ruby_system, system)
        tcp_cntrl.WB = options.WB_L1
        tcp_cntrl.disableL1 = False

        exec("system.tcp_cntrl%d = tcp_cntrl" % i)
        #
        # Add controllers and sequencers to the appropriate lists
        #
        cpu_sequencers.append(tcp_cntrl.coalescer)

        # Connect the TCP controllers to the ruby network
        tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True)
        tcp_cntrl.requestFromTCP.master = ruby_system.network.slave

        tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True)
        tcp_cntrl.responseFromTCP.master = ruby_system.network.slave

        tcp_cntrl.unblockFromCore = MessageBuffer()
        tcp_cntrl.unblockFromCore.master = ruby_system.network.slave

        tcp_cntrl.probeToTCP = MessageBuffer(ordered = True)
        tcp_cntrl.probeToTCP.slave = ruby_system.network.master

        tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
        tcp_cntrl.responseToTCP.slave = ruby_system.network.master

        tcp_cntrl.mandatoryQueue = MessageBuffer()

        gpuCluster.add(tcp_cntrl)

    for i in xrange(options.num_sqc):

        sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
        sqc_cntrl.create(options, ruby_system, system)

        exec("system.sqc_cntrl%d = sqc_cntrl" % i)
        #
        # Add controllers and sequencers to the appropriate lists
        #
        cpu_sequencers.append(sqc_cntrl.sequencer)

        # Connect the SQC controller to the ruby network
        sqc_cntrl.requestFromSQC = MessageBuffer(ordered = True)
        sqc_cntrl.requestFromSQC.master = ruby_system.network.slave

        sqc_cntrl.probeToSQC = MessageBuffer(ordered = True)
        sqc_cntrl.probeToSQC.slave = ruby_system.network.master

        sqc_cntrl.responseToSQC = MessageBuffer(ordered = True)
        sqc_cntrl.responseToSQC.slave = ruby_system.network.master

        sqc_cntrl.mandatoryQueue = MessageBuffer()

        # SQC also in GPU cluster
        gpuCluster.add(sqc_cntrl)

    numa_bit = 6

    for i in xrange(options.num_tccs):

        tcc_cntrl = TCCCntrl()
        tcc_cntrl.create(options, ruby_system, system)
        tcc_cntrl.l2_request_latency = 1
        tcc_cntrl.l2_response_latency = options.TCC_latency
        tcc_cntrl.WB = options.WB_L2
        tcc_cntrl.number_of_TBEs = 2560 * options.num_compute_units

        # Connect the TCC controllers to the ruby network
        tcc_cntrl.requestFromTCP = MessageBuffer(ordered = True)
        tcc_cntrl.requestFromTCP.slave = ruby_system.network.master

        tcc_cntrl.responseToCore = MessageBuffer(ordered = True)
        tcc_cntrl.responseToCore.master = ruby_system.network.slave

        tcc_cntrl.probeFromNB = MessageBuffer()
        tcc_cntrl.probeFromNB.slave = ruby_system.network.master

        tcc_cntrl.responseFromNB = MessageBuffer()
        tcc_cntrl.responseFromNB.slave = ruby_system.network.master

        tcc_cntrl.requestToNB = MessageBuffer(ordered = True)
        tcc_cntrl.requestToNB.master = ruby_system.network.slave

        tcc_cntrl.responseToNB = MessageBuffer()
        tcc_cntrl.responseToNB.master = ruby_system.network.slave

        tcc_cntrl.unblockToNB = MessageBuffer()
        tcc_cntrl.unblockToNB.master = ruby_system.network.slave

        tcc_cntrl.triggerQueue = MessageBuffer(ordered = True)

        rb_cntrl = RBCntrl()
        rb_cntrl.create(options, ruby_system, system)
        rb_cntrl.number_of_TBEs = 2560 * options.num_compute_units
        rb_cntrl.isOnCPU = False

        # Connect the RB controllers to the ruby network
        rb_cntrl.requestFromCore = MessageBuffer(ordered = True)
        rb_cntrl.requestFromCore.slave = ruby_system.network.master

        rb_cntrl.responseFromCore = MessageBuffer()
        rb_cntrl.responseFromCore.slave = ruby_system.network.master

        rb_cntrl.requestToNetwork = MessageBuffer()
        rb_cntrl.requestToNetwork.master = ruby_system.network.slave

        rb_cntrl.notifyFromRegionDir = MessageBuffer()
        rb_cntrl.notifyFromRegionDir.slave = ruby_system.network.master

        rb_cntrl.probeFromRegionDir = MessageBuffer()
        rb_cntrl.probeFromRegionDir.slave = ruby_system.network.master

        rb_cntrl.unblockFromDir = MessageBuffer()
        rb_cntrl.unblockFromDir.slave = ruby_system.network.master

        rb_cntrl.responseToRegDir = MessageBuffer()
        rb_cntrl.responseToRegDir.master = ruby_system.network.slave

        rb_cntrl.triggerQueue = MessageBuffer(ordered = True)

        tcc_cntrl.regionBufferNum = rb_cntrl.version

        exec("system.tcc_cntrl%d = tcc_cntrl" % i)
        exec("system.tcc_rb_cntrl%d = rb_cntrl" % i)

        # TCC cntrls added to the GPU cluster
        gpuCluster.add(tcc_cntrl)
        gpuCluster.add(rb_cntrl)

    # Because of wire buffers, num_l3caches must equal num_dirs
    # Region coherence only works with 1 dir
    assert(options.num_l3caches == options.num_dirs == 1)

    # This is the base crossbar that connects the L3s, Dirs, and cpu/gpu
    # Clusters
    mainCluster = Cluster(intBW = crossbar_bw)

    dir_cntrl = DirCntrl()
    dir_cntrl.create(options, ruby_system, system)
    dir_cntrl.number_of_TBEs = 2560 * options.num_compute_units
    dir_cntrl.useL3OnWT = options.use_L3_on_WT

    # Connect the Directory controller to the ruby network
    dir_cntrl.requestFromCores = MessageBuffer()
    dir_cntrl.requestFromCores.slave = ruby_system.network.master

    dir_cntrl.responseFromCores = MessageBuffer()
    dir_cntrl.responseFromCores.slave = ruby_system.network.master

    dir_cntrl.unblockFromCores = MessageBuffer()
    dir_cntrl.unblockFromCores.slave = ruby_system.network.master

    dir_cntrl.probeToCore = MessageBuffer()
    dir_cntrl.probeToCore.master = ruby_system.network.slave

    dir_cntrl.responseToCore = MessageBuffer()
    dir_cntrl.responseToCore.master = ruby_system.network.slave

    dir_cntrl.reqFromRegBuf = MessageBuffer()
    dir_cntrl.reqFromRegBuf.slave = ruby_system.network.master

    dir_cntrl.reqToRegDir = MessageBuffer(ordered = True)
    dir_cntrl.reqToRegDir.master = ruby_system.network.slave

    dir_cntrl.reqFromRegDir = MessageBuffer(ordered = True)
    dir_cntrl.reqFromRegDir.slave = ruby_system.network.master

    dir_cntrl.unblockToRegDir = MessageBuffer()
    dir_cntrl.unblockToRegDir.master = ruby_system.network.slave

    dir_cntrl.triggerQueue = MessageBuffer(ordered = True)
    dir_cntrl.L3triggerQueue = MessageBuffer(ordered = True)
    dir_cntrl.responseFromMemory = MessageBuffer()

    exec("system.dir_cntrl%d = dir_cntrl" % i)
    dir_cntrl_nodes.append(dir_cntrl)

    mainCluster.add(dir_cntrl)

    reg_cntrl = RegionCntrl(noTCCdir = True, TCC_select_num_bits = TCC_bits)
    reg_cntrl.create(options, ruby_system, system)
    reg_cntrl.number_of_TBEs = options.num_tbes
    reg_cntrl.cpuRegionBufferNum = system.rb_cntrl0.version
    reg_cntrl.gpuRegionBufferNum = system.tcc_rb_cntrl0.version

    # Connect the Region Dir controllers to the ruby network
    reg_cntrl.requestToDir = MessageBuffer(ordered = True)
    reg_cntrl.requestToDir.master = ruby_system.network.slave

    reg_cntrl.notifyToRBuffer = MessageBuffer()
    reg_cntrl.notifyToRBuffer.master = ruby_system.network.slave

    reg_cntrl.probeToRBuffer = MessageBuffer()
    reg_cntrl.probeToRBuffer.master = ruby_system.network.slave

    reg_cntrl.responseFromRBuffer = MessageBuffer()
    reg_cntrl.responseFromRBuffer.slave = ruby_system.network.master

    reg_cntrl.requestFromRegBuf = MessageBuffer()
    reg_cntrl.requestFromRegBuf.slave = ruby_system.network.master

    reg_cntrl.triggerQueue = MessageBuffer(ordered = True)

    exec("system.reg_cntrl%d = reg_cntrl" % i)

    mainCluster.add(reg_cntrl)

    # Assuming no DMA devices
    assert(len(dma_devices) == 0)

    # Add cpu/gpu clusters to main cluster
    mainCluster.add(cpuCluster)
    mainCluster.add(gpuCluster)

    ruby_system.network.number_of_virtual_networks = 10

    return (cpu_sequencers, dir_cntrl_nodes, mainCluster)

326	configs/ruby/MOESI_AMD_Base.py	Normal file

@ -0,0 +1,326 @@
#
# Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Lisa Hsu
#

import math
import m5
from m5.objects import *
from m5.defines import buildEnv
from Ruby import create_topology
from Ruby import send_evicts

from Cluster import Cluster
from Crossbar import Crossbar

class CntrlBase:
    _seqs = 0
    @classmethod
    def seqCount(cls):
        # Use SeqCount not class since we need global count
        CntrlBase._seqs += 1
        return CntrlBase._seqs - 1

    _cntrls = 0
    @classmethod
    def cntrlCount(cls):
        # Use CntlCount not class since we need global count
        CntrlBase._cntrls += 1
        return CntrlBase._cntrls - 1

    _version = 0
    @classmethod
    def versionCount(cls):
        cls._version += 1 # Use count for this particular type
        return cls._version - 1

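# Note on the counters above: seqCount() and cntrlCount() increment
# attributes on CntrlBase itself, so those IDs are globally unique across
# every controller type, while versionCount() increments cls._version and
# therefore numbers each subclass's instances independently from 0. E.g.
# the first CPCntrl and the first TCCCntrl both get version 0, but their
# sequencers receive distinct seqCount() values.
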
class L1DCache(RubyCache):
    resourceStalls = False
    def create(self, options):
        self.size = MemorySize(options.l1d_size)
        self.assoc = options.l1d_assoc
        self.replacement_policy = PseudoLRUReplacementPolicy()

class L1ICache(RubyCache):
    resourceStalls = False
    def create(self, options):
        self.size = MemorySize(options.l1i_size)
        self.assoc = options.l1i_assoc
        self.replacement_policy = PseudoLRUReplacementPolicy()

class L2Cache(RubyCache):
    resourceStalls = False
    def create(self, options):
        self.size = MemorySize(options.l2_size)
        self.assoc = options.l2_assoc
        self.replacement_policy = PseudoLRUReplacementPolicy()

class CPCntrl(CorePair_Controller, CntrlBase):

    def create(self, options, ruby_system, system):
        self.version = self.versionCount()

        self.L1Icache = L1ICache()
        self.L1Icache.create(options)
        self.L1D0cache = L1DCache()
        self.L1D0cache.create(options)
        self.L1D1cache = L1DCache()
        self.L1D1cache.create(options)
        self.L2cache = L2Cache()
        self.L2cache.create(options)

        self.sequencer = RubySequencer()
        self.sequencer.icache_hit_latency = 2
        self.sequencer.dcache_hit_latency = 2
        self.sequencer.version = self.seqCount()
        self.sequencer.icache = self.L1Icache
        self.sequencer.dcache = self.L1D0cache
        self.sequencer.ruby_system = ruby_system
        self.sequencer.coreid = 0
        self.sequencer.is_cpu_sequencer = True

        self.sequencer1 = RubySequencer()
        self.sequencer1.version = self.seqCount()
        self.sequencer1.icache = self.L1Icache
        self.sequencer1.dcache = self.L1D1cache
        self.sequencer1.icache_hit_latency = 2
        self.sequencer1.dcache_hit_latency = 2
        self.sequencer1.ruby_system = ruby_system
        self.sequencer1.coreid = 1
        self.sequencer1.is_cpu_sequencer = True

        self.issue_latency = options.cpu_to_dir_latency
        self.send_evictions = send_evicts(options)

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

class L3Cache(RubyCache):
    assoc = 8
    dataArrayBanks = 256
    tagArrayBanks = 256

    def create(self, options, ruby_system, system):
        # Split the aggregate L3 capacity and banks evenly across the
        # directories.
        self.size = MemorySize(options.l3_size)
        self.size.value /= options.num_dirs
        self.dataArrayBanks /= options.num_dirs
        self.tagArrayBanks /= options.num_dirs
        self.dataAccessLatency = options.l3_data_latency
        self.tagAccessLatency = options.l3_tag_latency
        self.resourceStalls = options.no_resource_stalls
        self.replacement_policy = PseudoLRUReplacementPolicy()

class L3Cntrl(L3Cache_Controller, CntrlBase):
    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.L3cache = L3Cache()
        self.L3cache.create(options, ruby_system, system)

        self.l3_response_latency = max(self.L3cache.dataAccessLatency,
                                       self.L3cache.tagAccessLatency)
        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

    def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
                           req_to_l3, probe_to_l3, resp_to_l3):
        self.reqToDir = req_to_dir
        self.respToDir = resp_to_dir
        self.l3UnblockToDir = l3_unblock_to_dir
        self.reqToL3 = req_to_l3
        self.probeToL3 = probe_to_l3
        self.respToL3 = resp_to_l3

class DirMem(RubyDirectoryMemory, CntrlBase):
    def create(self, options, ruby_system, system):
        self.version = self.versionCount()

        phys_mem_size = AddrRange(options.mem_size).size()
        mem_module_size = phys_mem_size / options.num_dirs
        dir_size = MemorySize('0B')
        dir_size.value = mem_module_size
        self.size = dir_size

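# Example: --mem-size=512MB with --num-dirs=2 gives each DirMem a
# mem_module_size of 256 MB, i.e. every directory tracks half of the
# physical address space.
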
class DirCntrl(Directory_Controller, CntrlBase):
    def create(self, options, ruby_system, system):
        self.version = self.versionCount()

        self.response_latency = 30

        self.directory = DirMem()
        self.directory.create(options, ruby_system, system)

        self.L3CacheMemory = L3Cache()
        self.L3CacheMemory.create(options, ruby_system, system)

        self.l3_hit_latency = max(self.L3CacheMemory.dataAccessLatency,
                                  self.L3CacheMemory.tagAccessLatency)

        self.number_of_TBEs = options.num_tbes

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

        self.CPUonly = True

    def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
                           req_to_l3, probe_to_l3, resp_to_l3):
        self.reqToDir = req_to_dir
        self.respToDir = resp_to_dir
        self.l3UnblockToDir = l3_unblock_to_dir
        self.reqToL3 = req_to_l3
        self.probeToL3 = probe_to_l3
        self.respToL3 = resp_to_l3

def define_options(parser):
    parser.add_option("--num-subcaches", type="int", default=4)
    parser.add_option("--l3-data-latency", type="int", default=20)
    parser.add_option("--l3-tag-latency", type="int", default=15)
    parser.add_option("--cpu-to-dir-latency", type="int", default=15)
    parser.add_option("--no-resource-stalls", action="store_false",
                      default=True)
    parser.add_option("--num-tbes", type="int", default=256)
    parser.add_option("--l2-latency", type="int", default=50)  # load to use

def create_system(options, full_system, system, dma_devices, ruby_system):
    if buildEnv['PROTOCOL'] != 'MOESI_AMD_Base':
        panic("This script requires the MOESI_AMD_Base protocol.")

    cpu_sequencers = []

    #
    # The ruby network creation expects the list of nodes in the system to
    # be consistent with the NetDest list. Therefore the l1 controller
    # nodes must be listed before the directory nodes and directory nodes
    # before dma nodes, etc.
    #
    l1_cntrl_nodes = []
    l3_cntrl_nodes = []
    dir_cntrl_nodes = []

    control_count = 0

    #
    # Must create the individual controllers before the network to ensure
    # the controller constructors are called before the network constructor
    #

    # This is the base crossbar that connects the L3s, Dirs, and cpu
    # Cluster
    mainCluster = Cluster(extBW = 512, intBW = 512) # 1 TB/s
    for i in xrange(options.num_dirs):

        dir_cntrl = DirCntrl(TCC_select_num_bits = 0)
        dir_cntrl.create(options, ruby_system, system)

        # Connect the Directory controller to the ruby network
        dir_cntrl.requestFromCores = MessageBuffer(ordered = True)
        dir_cntrl.requestFromCores.slave = ruby_system.network.master

        dir_cntrl.responseFromCores = MessageBuffer()
        dir_cntrl.responseFromCores.slave = ruby_system.network.master

        dir_cntrl.unblockFromCores = MessageBuffer()
        dir_cntrl.unblockFromCores.slave = ruby_system.network.master

        dir_cntrl.probeToCore = MessageBuffer()
        dir_cntrl.probeToCore.master = ruby_system.network.slave

        dir_cntrl.responseToCore = MessageBuffer()
        dir_cntrl.responseToCore.master = ruby_system.network.slave

        dir_cntrl.triggerQueue = MessageBuffer(ordered = True)
        dir_cntrl.L3triggerQueue = MessageBuffer(ordered = True)
        dir_cntrl.responseFromMemory = MessageBuffer()

        exec("system.dir_cntrl%d = dir_cntrl" % i)
        dir_cntrl_nodes.append(dir_cntrl)

        mainCluster.add(dir_cntrl)

    # Technically this config can support an odd number of cpus, but the top
    # level config files, such as the ruby_random_tester, will get confused if
    # the number of cpus does not equal the number of sequencers. Thus make
    # sure that an even number of cpus is specified.
    assert((options.num_cpus % 2) == 0)

    # For an odd number of CPUs, still create the right number of controllers
    cpuCluster = Cluster(extBW = 512, intBW = 512) # 1 TB/s
    for i in xrange((options.num_cpus + 1) / 2):

        cp_cntrl = CPCntrl()
        cp_cntrl.create(options, ruby_system, system)

        exec("system.cp_cntrl%d = cp_cntrl" % i)
        #
        # Add controllers and sequencers to the appropriate lists
        #
        cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1])

        # Connect the CP controllers and the network
        cp_cntrl.requestFromCore = MessageBuffer()
        cp_cntrl.requestFromCore.master = ruby_system.network.slave

        cp_cntrl.responseFromCore = MessageBuffer()
        cp_cntrl.responseFromCore.master = ruby_system.network.slave

        cp_cntrl.unblockFromCore = MessageBuffer()
        cp_cntrl.unblockFromCore.master = ruby_system.network.slave

        cp_cntrl.probeToCore = MessageBuffer()
        cp_cntrl.probeToCore.slave = ruby_system.network.master

        cp_cntrl.responseToCore = MessageBuffer()
        cp_cntrl.responseToCore.slave = ruby_system.network.master

        cp_cntrl.mandatoryQueue = MessageBuffer()
        cp_cntrl.triggerQueue = MessageBuffer(ordered = True)

        cpuCluster.add(cp_cntrl)

    # Assuming no DMA devices
    assert(len(dma_devices) == 0)

    # Add the cpu cluster to the main cluster
    mainCluster.add(cpuCluster)

    ruby_system.network.number_of_virtual_networks = 10

    return (cpu_sequencers, dir_cntrl_nodes, mainCluster)

@ -78,7 +78,7 @@ class SourceMeta(type):
    def __init__(cls, name, bases, dict):
        super(SourceMeta, cls).__init__(name, bases, dict)
        cls.all = []

    def get(cls, **guards):
        '''Find all files that match the specified guards. If a source
        file does not specify a flag, the default is False'''

@ -367,9 +367,9 @@ def makeTheISA(source, target, env):
    target_isa = env['TARGET_ISA']
    def define(isa):
        return isa.upper() + '_ISA'

    def namespace(isa):
        return isa[0].upper() + isa[1:].lower() + 'ISA'


    code = code_formatter()

@ -407,6 +407,51 @@ def makeTheISA(source, target, env):
env.Command('config/the_isa.hh', map(Value, all_isa_list),
            MakeAction(makeTheISA, Transform("CFG ISA", 0)))

def makeTheGPUISA(source, target, env):
    isas = [ src.get_contents() for src in source ]
    target_gpu_isa = env['TARGET_GPU_ISA']
    def define(isa):
        return isa.upper() + '_ISA'

    def namespace(isa):
        return isa[0].upper() + isa[1:].lower() + 'ISA'


    code = code_formatter()
    code('''\
#ifndef __CONFIG_THE_GPU_ISA_HH__
#define __CONFIG_THE_GPU_ISA_HH__

''')

    # create defines for the preprocessing and compile-time determination
    for i,isa in enumerate(isas):
        code('#define $0 $1', define(isa), i + 1)
    code()

    # create an enum for any run-time determination of the ISA, we
    # reuse the same name as the namespaces
    code('enum class GPUArch {')
    for i,isa in enumerate(isas):
        if i + 1 == len(isas):
            code('  $0 = $1', namespace(isa), define(isa))
        else:
            code('  $0 = $1,', namespace(isa), define(isa))
    code('};')

    code('''

#define THE_GPU_ISA ${{define(target_gpu_isa)}}
#define TheGpuISA ${{namespace(target_gpu_isa)}}
#define THE_GPU_ISA_STR "${{target_gpu_isa}}"

#endif // __CONFIG_THE_GPU_ISA_HH__''')

    code.write(str(target[0]))

env.Command('config/the_gpu_isa.hh', map(Value, all_gpu_isa_list),
            MakeAction(makeTheGPUISA, Transform("CFG ISA", 0)))

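# For an hsail build (the only GPU ISA registered by this patch's
# SConsopts), the generated config/the_gpu_isa.hh comes out roughly as:
#
#   #ifndef __CONFIG_THE_GPU_ISA_HH__
#   #define __CONFIG_THE_GPU_ISA_HH__
#
#   #define HSAIL_ISA 1
#
#   enum class GPUArch {
#     HsailISA = HSAIL_ISA
#   };
#
#   #define THE_GPU_ISA HSAIL_ISA
#   #define TheGpuISA HsailISA
#   #define THE_GPU_ISA_STR "hsail"
#
#   #endif // __CONFIG_THE_GPU_ISA_HH__
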
########################################################################
#
# Prevent any SimObjects from being added after this point, they

@ -784,7 +829,7 @@ extern "C" {
EmbeddedSwig embed_swig_${module}(init_${module});
''')
    code.write(str(target[0]))

# Build all swig modules
for swig in SwigSource.all:
    env.Command([swig.cc_source.tnode, swig.py_source.tnode], swig.tnode,

@ -959,7 +1004,7 @@ const uint8_t data_${sym}[] = {
        x = array.array('B', data[i:i+step])
        code(''.join('%d,' % d for d in x))
    code.dedent()

    code('''};

EmbeddedPython embedded_${sym}(

@ -68,6 +68,14 @@ isa_switch_hdrs = Split('''
# Set up this directory to support switching headers
make_switching_dir('arch', isa_switch_hdrs, env)

if env['BUILD_GPU']:
    gpu_isa_switch_hdrs = Split('''
        gpu_decoder.hh
        gpu_types.hh
        ''')

    make_gpu_switching_dir('arch', gpu_isa_switch_hdrs, env)

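# make_gpu_switching_dir presumably mirrors make_switching_dir for CPU
# ISAs: it generates per-build wrapper headers, so the generated
# arch/gpu_decoder.hh is expected to simply forward to the selected
# ISA's header, roughly:
#
#   #include "arch/hsail/gpu_decoder.hh"
#
# letting ISA-independent code include "arch/gpu_decoder.hh" unchanged.
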
#################################################################
#
# Include architecture-specific files.

67	src/arch/hsail/Brig.h	Normal file

@ -0,0 +1,67 @@
// University of Illinois/NCSA
// Open Source License
//
// Copyright (c) 2013, Advanced Micro Devices, Inc.
// All rights reserved.
//
// Developed by:
//
//     HSA Team
//
//     Advanced Micro Devices, Inc
//
//     www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of
// this software and associated documentation files (the "Software"), to deal with
// the Software without restriction, including without limitation the rights to
// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
// of the Software, and to permit persons to whom the Software is furnished to do
// so, subject to the following conditions:
//
//     * Redistributions of source code must retain the above copyright notice,
//       this list of conditions and the following disclaimers.
//
//     * Redistributions in binary form must reproduce the above copyright notice,
//       this list of conditions and the following disclaimers in the
//       documentation and/or other materials provided with the distribution.
//
//     * Neither the names of the LLVM Team, University of Illinois at
//       Urbana-Champaign, nor the names of its contributors may be used to
//       endorse or promote products derived from this Software without specific
//       prior written permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
// SOFTWARE.

#ifndef INTERNAL_BRIG_H
#define INTERNAL_BRIG_H

#include <stdint.h>

namespace Brig {
#include "Brig_new.hpp"

// These typedefs provide some backward compatibility with earlier versions
// of Brig.h, reducing the number of code changes. The distinct names also
// increase legibility by showing the code's intent.
typedef BrigBase BrigDirective;
typedef BrigBase BrigOperand;

enum BrigMemoryFenceSegments { // for internal use only
    //.mnemo={ s/^BRIG_MEMORY_FENCE_SEGMENT_//;lc }
    //.mnemo_token=_EMMemoryFenceSegments
    //.mnemo_context=EInstModifierInstFenceContext
    BRIG_MEMORY_FENCE_SEGMENT_GLOBAL = 0,
    BRIG_MEMORY_FENCE_SEGMENT_GROUP = 1,
    BRIG_MEMORY_FENCE_SEGMENT_IMAGE = 2,
    BRIG_MEMORY_FENCE_SEGMENT_LAST = 3 //.skip
};

}

#endif // defined(INTERNAL_BRIG_H)
1587	src/arch/hsail/Brig_new.hpp	Normal file
(File diff suppressed because it is too large.)

54	src/arch/hsail/SConscript	Normal file

@ -0,0 +1,54 @@
# -*- mode:python -*-

# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Anthony Gutierrez
#

Import('*')

if not env['BUILD_GPU']:
    Return()

if env['TARGET_GPU_ISA'] == 'hsail':
    env.Command(['insts/gen_decl.hh', 'gpu_decoder.cc', 'insts/gen_exec.cc'],
                'gen.py', '$SOURCE $TARGETS')

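    # After scons variable expansion, the rule above runs roughly:
    #   python gen.py insts/gen_decl.hh gpu_decoder.cc insts/gen_exec.cc
    # i.e. gen.py receives the three generated file names as arguments,
    # matching the len(sys.argv) != 4 check in gen.py below.
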
    Source('generic_types.cc')
    Source('gpu_decoder.cc')
    Source('insts/branch.cc')
    Source('insts/gen_exec.cc')
    Source('insts/gpu_static_inst.cc')
    Source('insts/main.cc')
    Source('insts/pseudo_inst.cc')
    Source('insts/mem.cc')
    Source('operand.cc')
40	src/arch/hsail/SConsopts	Normal file

@ -0,0 +1,40 @@
# -*- mode:python -*-

#
# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Anthony Gutierrez
#

Import('*')

all_gpu_isa_list.append('hsail')
806	src/arch/hsail/gen.py	Executable file

@ -0,0 +1,806 @@
#! /usr/bin/python

#
# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Steve Reinhardt
#

import sys, re

from m5.util import code_formatter

if len(sys.argv) != 4:
    print "Error: need 3 args (file names)"
    sys.exit(1)   # exit non-zero so the build system notices the failure

header_code = code_formatter()
decoder_code = code_formatter()
exec_code = code_formatter()

###############
#
# Generate file prologs (includes etc.)
#
###############

header_code('''
#include "arch/hsail/insts/decl.hh"
#include "base/bitfield.hh"
#include "gpu-compute/hsail_code.hh"
#include "gpu-compute/wavefront.hh"

namespace HsailISA
{
''')
header_code.indent()

decoder_code('''
#include "arch/hsail/gpu_decoder.hh"
#include "arch/hsail/insts/branch.hh"
#include "arch/hsail/insts/decl.hh"
#include "arch/hsail/insts/gen_decl.hh"
#include "arch/hsail/insts/mem.hh"
#include "arch/hsail/insts/mem_impl.hh"
#include "gpu-compute/brig_object.hh"

namespace HsailISA
{
    std::vector<GPUStaticInst*> Decoder::decodedInsts;

    GPUStaticInst*
    Decoder::decode(MachInst machInst)
    {
        using namespace Brig;

        const BrigInstBase *ib = machInst.brigInstBase;
        const BrigObject *obj = machInst.brigObj;

        switch(ib->opcode) {
''')
decoder_code.indent()
decoder_code.indent()

exec_code('''
#include "arch/hsail/insts/gen_decl.hh"
#include "base/intmath.hh"

namespace HsailISA
{
''')
exec_code.indent()

###############
#
# Define code templates for class declarations (for header file)
#
###############

# Basic header template for an instruction with no template parameters.
header_template_nodt = '''
class $class_name : public $base_class
{
  public:
    typedef $base_class Base;

    $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj)
       : Base(ib, obj, "$opcode")
    {
    }

    void execute(GPUDynInstPtr gpuDynInst);
};

'''

# Basic header template for an instruction with a single DataType
# template parameter.
header_template_1dt = '''
template<typename DataType>
class $class_name : public $base_class<DataType>
{
  public:
    typedef $base_class<DataType> Base;
    typedef typename DataType::CType CType;

    $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj)
       : Base(ib, obj, "$opcode")
    {
    }

    void execute(GPUDynInstPtr gpuDynInst);
};

'''

header_template_1dt_noexec = '''
template<typename DataType>
class $class_name : public $base_class<DataType>
{
  public:
    typedef $base_class<DataType> Base;
    typedef typename DataType::CType CType;

    $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj)
       : Base(ib, obj, "$opcode")
    {
    }
};

'''

# Same as header_template_1dt, except the base class has a second
# template parameter NumSrcOperands to allow a variable number of
# source operands. Note that since this is implemented with an array,
# it only works for instructions where all sources are of the same
# type (like most arithmetics).
header_template_1dt_varsrcs = '''
template<typename DataType>
class $class_name : public $base_class<DataType, $num_srcs>
{
  public:
    typedef $base_class<DataType, $num_srcs> Base;
    typedef typename DataType::CType CType;

    $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj)
       : Base(ib, obj, "$opcode")
    {
    }

    void execute(GPUDynInstPtr gpuDynInst);
};

'''

# Header template for instruction with two DataType template
# parameters, one for the dest and one for the source. This is used
# by compare and convert.
header_template_2dt = '''
template<typename DestDataType, class SrcDataType>
class $class_name : public $base_class<DestDataType, SrcDataType>
{
  public:
    typedef $base_class<DestDataType, SrcDataType> Base;
    typedef typename DestDataType::CType DestCType;
    typedef typename SrcDataType::CType SrcCType;

    $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj)
       : Base(ib, obj, "$opcode")
    {
    }

    void execute(GPUDynInstPtr gpuDynInst);
};

'''

header_templates = {
    'ArithInst': header_template_1dt_varsrcs,
    'CmovInst': header_template_1dt,
    'ClassInst': header_template_1dt,
    'ShiftInst': header_template_1dt,
    'ExtractInsertInst': header_template_1dt,
    'CmpInst': header_template_2dt,
    'CvtInst': header_template_2dt,
    'LdInst': '',
    'StInst': '',
    'SpecialInstNoSrc': header_template_nodt,
    'SpecialInst1Src': header_template_nodt,
    'SpecialInstNoSrcNoDest': '',
}

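# Illustration: with class_name='Abs', base_class='ArithInst', num_srcs=1
# and opcode='abs' (values of this shape are supplied by the gen() calls
# later in this file), substituting into header_template_1dt_varsrcs
# yields a declaration like:
#
#   template<typename DataType>
#   class Abs : public ArithInst<DataType, 1>
#   { ... void execute(GPUDynInstPtr gpuDynInst); };
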
###############
#
# Define code templates for exec functions
#
###############

# exec function body
exec_template_nodt_nosrc = '''
void
$class_name::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *w = gpuDynInst->wavefront();

    typedef Base::DestCType DestCType;

    const VectorMask &mask = w->get_pred();

    for (int lane = 0; lane < VSZ; ++lane) {
        if (mask[lane]) {
            DestCType dest_val = $expr;
            this->dest.set(w, lane, dest_val);
        }
    }
}

'''

exec_template_nodt_1src = '''
void
$class_name::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *w = gpuDynInst->wavefront();

    typedef Base::DestCType DestCType;
    typedef Base::SrcCType SrcCType;

    const VectorMask &mask = w->get_pred();

    for (int lane = 0; lane < VSZ; ++lane) {
        if (mask[lane]) {
            SrcCType src_val0 = this->src0.get<SrcCType>(w, lane);
            DestCType dest_val = $expr;

            this->dest.set(w, lane, dest_val);
        }
    }
}

'''

exec_template_1dt_varsrcs = '''
template<typename DataType>
void
$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *w = gpuDynInst->wavefront();

    const VectorMask &mask = w->get_pred();

    for (int lane = 0; lane < VSZ; ++lane) {
        if (mask[lane]) {
            CType dest_val;
            if ($dest_is_src_flag) {
                dest_val = this->dest.template get<CType>(w, lane);
            }

            CType src_val[$num_srcs];

            for (int i = 0; i < $num_srcs; ++i) {
                src_val[i] = this->src[i].template get<CType>(w, lane);
            }

            dest_val = (CType)($expr);

            this->dest.set(w, lane, dest_val);
        }
    }
}

'''

exec_template_1dt_3srcs = '''
template<typename DataType>
void
$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *w = gpuDynInst->wavefront();

    typedef typename Base::Src0CType Src0T;
    typedef typename Base::Src1CType Src1T;
    typedef typename Base::Src2CType Src2T;

    const VectorMask &mask = w->get_pred();

    for (int lane = 0; lane < VSZ; ++lane) {
        if (mask[lane]) {
            CType dest_val;

            if ($dest_is_src_flag) {
                dest_val = this->dest.template get<CType>(w, lane);
            }

            Src0T src_val0 = this->src0.template get<Src0T>(w, lane);
            Src1T src_val1 = this->src1.template get<Src1T>(w, lane);
            Src2T src_val2 = this->src2.template get<Src2T>(w, lane);

            dest_val = $expr;

            this->dest.set(w, lane, dest_val);
        }
    }
}

'''

exec_template_1dt_2src_1dest = '''
template<typename DataType>
void
$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *w = gpuDynInst->wavefront();

    typedef typename Base::DestCType DestT;
    typedef CType Src0T;
    typedef typename Base::Src1CType Src1T;

    const VectorMask &mask = w->get_pred();

    for (int lane = 0; lane < VSZ; ++lane) {
        if (mask[lane]) {
            DestT dest_val;
            if ($dest_is_src_flag) {
                dest_val = this->dest.template get<DestT>(w, lane);
            }
            Src0T src_val0 = this->src0.template get<Src0T>(w, lane);
            Src1T src_val1 = this->src1.template get<Src1T>(w, lane);

            dest_val = $expr;

            this->dest.set(w, lane, dest_val);
        }
    }
}

'''

exec_template_shift = '''
template<typename DataType>
void
$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *w = gpuDynInst->wavefront();

    const VectorMask &mask = w->get_pred();
    for (int lane = 0; lane < VSZ; ++lane) {
        if (mask[lane]) {
            CType dest_val;

            if ($dest_is_src_flag) {
                dest_val = this->dest.template get<CType>(w, lane);
            }

            CType src_val0 = this->src0.template get<CType>(w, lane);
            uint32_t src_val1 = this->src1.template get<uint32_t>(w, lane);

            dest_val = $expr;

            this->dest.set(w, lane, dest_val);
        }
    }
}

'''

exec_template_2dt = '''
template<typename DestDataType, class SrcDataType>
void
$class_name<DestDataType, SrcDataType>::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *w = gpuDynInst->wavefront();

    const VectorMask &mask = w->get_pred();

    for (int lane = 0; lane < VSZ; ++lane) {
        if (mask[lane]) {
            DestCType dest_val;
            SrcCType src_val[$num_srcs];

            for (int i = 0; i < $num_srcs; ++i) {
                src_val[i] = this->src[i].template get<SrcCType>(w, lane);
            }

            dest_val = $expr;

            this->dest.set(w, lane, dest_val);
        }
    }
}

'''

exec_templates = {
    'ArithInst': exec_template_1dt_varsrcs,
    'CmovInst': exec_template_1dt_3srcs,
    'ExtractInsertInst': exec_template_1dt_3srcs,
    'ClassInst': exec_template_1dt_2src_1dest,
    'CmpInst': exec_template_2dt,
    'CvtInst': exec_template_2dt,
    'LdInst': '',
    'StInst': '',
    'SpecialInstNoSrc': exec_template_nodt_nosrc,
    'SpecialInst1Src': exec_template_nodt_1src,
    'SpecialInstNoSrcNoDest': '',
}

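# All exec templates above share the same predicated-SIMD skeleton: loop
# over the VSZ lanes of the wavefront and only evaluate $expr and commit
# a result for lanes whose bit is set in w->get_pred(). A masked-off
# lane leaves its destination register untouched.
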
###############
#
# Define code templates for the decoder cases
#
###############

# decode template for nodt-opcode case
decode_nodt_template = '''
    case BRIG_OPCODE_$brig_opcode_upper: return $constructor(ib, obj);'''

decode_case_prolog_class_inst = '''
    case BRIG_OPCODE_$brig_opcode_upper:
    {
        //const BrigOperandBase *baseOp = obj->getOperand(ib->operands[1]);
        BrigType16_t type = ((BrigInstSourceType*)ib)->sourceType;
        //switch (baseOp->kind) {
        //  case BRIG_OPERAND_REG:
        //    type = ((const BrigOperandReg*)baseOp)->type;
        //    break;
        //  case BRIG_OPERAND_IMMED:
        //    type = ((const BrigOperandImmed*)baseOp)->type;
        //    break;
        //  default:
        //    fatal("CLASS unrecognized kind of operand %d\\n",
        //          baseOp->kind);
        //}
        switch (type) {'''

# common prolog for 1dt- or 2dt-opcode case: switch on data type
decode_case_prolog = '''
    case BRIG_OPCODE_$brig_opcode_upper:
    {
        switch (ib->type) {'''

# single-level decode case entry (for 1dt opcodes)
decode_case_entry = \
'        case BRIG_TYPE_$type_name: return $constructor(ib, obj);'

decode_store_prolog = \
'        case BRIG_TYPE_$type_name: {'

decode_store_case_epilog = '''
        }'''

decode_store_case_entry = \
'            return $constructor(ib, obj);'

# common epilog for type switch
decode_case_epilog = '''
        default: fatal("$brig_opcode_upper: unrecognized type %d\\n",
                       ib->type);
        }
    }
    break;'''

# Additional templates for nested decode on a second type field (for
# compare and convert). These are used in place of the
# decode_case_entry template to create a second-level switch on the
# second type field inside each case of the first-level type switch.
# Because the name and location of the second type can vary, the Brig
# instruction type must be provided in $brig_type, and the name of the
# second type field must be provided in $type2_field.
decode_case2_prolog = '''
        case BRIG_TYPE_$type_name:
            switch (((Brig$brig_type*)ib)->$type2_field) {'''

decode_case2_entry = \
'            case BRIG_TYPE_$type2_name: return $constructor(ib, obj);'

decode_case2_epilog = '''
            default: fatal("$brig_opcode_upper: unrecognized $type2_field %d\\n",
                           ((Brig$brig_type*)ib)->$type2_field);
            }
            break;'''

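Pieced together, the fragments above generate a decode switch of roughly this shape for a two-level (2dt) opcode such as Cmp (hand-assembled for illustration; BrigInstCmp follows the 'Inst%s' naming that gen() uses below):

    case BRIG_OPCODE_CMP:
    {
        switch (ib->type) {
        case BRIG_TYPE_B1:
            switch (((BrigInstCmp*)ib)->sourceType) {
            case BRIG_TYPE_S32: return new Cmp<B1,S32>(ib, obj);
            // ... one entry per second-level type ...
            default: fatal("CMP: unrecognized sourceType %d\n",
                           ((BrigInstCmp*)ib)->sourceType);
            }
            break;
        // ... one case per first-level type ...
        default: fatal("CMP: unrecognized type %d\n",
                       ib->type);
        }
    }
    break;
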
# Figure out how many source operands an expr needs by looking for the
# highest-numbered srcN value referenced. Since sources are numbered
# starting at 0, the return value is N+1.
def num_src_operands(expr):
    if expr.find('src2') != -1:
        return 3
    elif expr.find('src1') != -1:
        return 2
    elif expr.find('src0') != -1:
        return 1
    else:
        return 0

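A few examples against expressions that appear in the gen() calls below (added here for illustration):

    # num_src_operands('src0 * src1 + src2') -> 3   (e.g. Mad)
    # num_src_operands('std::abs(src0)')     -> 1   (e.g. Abs)
    # num_src_operands('w->dynwaveid')       -> 0   (e.g. WaveId)
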
###############
#
# Define final code generation methods
#
# The gen_nodt, gen_1dt, and gen_2dt methods are the interface for
# generating actual instructions.
#
###############

# Generate class declaration, exec function, and decode switch case
# for a brig_opcode with a single-level type switch. The 'types'
# parameter is a list or tuple of types for which the instruction
# should be instantiated.
def gen(brig_opcode, types=None, expr=None, base_class='ArithInst',
        type2_info=None, constructor_prefix='new ', is_store=False):
    brig_opcode_upper = brig_opcode.upper()
    class_name = brig_opcode
    opcode = class_name.lower()

    if base_class == 'ArithInst':
        # note that expr must be provided with ArithInst so we can
        # derive num_srcs for the template
        assert expr

    if expr:
        # Derive several bits of info from expr. If expr is not used,
        # this info will be irrelevant.
        num_srcs = num_src_operands(expr)
        # if the RHS expression includes 'dest', then we're doing an RMW
        # on the reg and we need to treat it like a source
        dest_is_src = expr.find('dest') != -1
        dest_is_src_flag = str(dest_is_src).lower() # for C++
        if base_class in ['ShiftInst']:
            expr = re.sub(r'\bsrc(\d)\b', r'src_val\1', expr)
        elif base_class in ['ArithInst', 'CmpInst', 'CvtInst']:
            expr = re.sub(r'\bsrc(\d)\b', r'src_val[\1]', expr)
        else:
            expr = re.sub(r'\bsrc(\d)\b', r'src_val\1', expr)
        expr = re.sub(r'\bdest\b', r'dest_val', expr)

    # Strip template arguments off of base class before looking up
    # appropriate templates
    base_class_base = re.sub(r'<.*>$', '', base_class)
    header_code(header_templates[base_class_base])

    if base_class.startswith('SpecialInst'):
        exec_code(exec_templates[base_class_base])
    elif base_class.startswith('ShiftInst'):
        header_code(exec_template_shift)
    else:
        header_code(exec_templates[base_class_base])

    if not types or isinstance(types, str):
        # Just a single type
        constructor = constructor_prefix + class_name
        decoder_code(decode_nodt_template)
    else:
        # multiple types, need at least one level of decode
        if brig_opcode == 'Class':
            decoder_code(decode_case_prolog_class_inst)
        else:
            decoder_code(decode_case_prolog)
        if not type2_info:
            if not is_store:
                # single list of types, so do a basic one-level decode
                for type_name in types:
                    full_class_name = '%s<%s>' % (class_name, type_name.upper())
                    constructor = constructor_prefix + full_class_name
                    decoder_code(decode_case_entry)
            else:
                # stores encode a second (source register) type in the
                # class as well, so each case gets its own block
                for type_name in types:
                    decoder_code(decode_store_prolog)
                    type_size = int(re.findall(r'[0-9]+', type_name)[0])
                    src_size = 32
                    type_type = type_name[0]
                    full_class_name = '%s<%s,%s>' % (class_name,
                                                     type_name.upper(),
                                                     '%s%d' %
                                                     (type_type.upper(),
                                                      type_size))
                    constructor = constructor_prefix + full_class_name
                    decoder_code(decode_store_case_entry)
                    decoder_code(decode_store_case_epilog)
        else:
            # need secondary type switch (convert, compare)
            # unpack extra info on second switch
            (type2_field, types2) = type2_info
            brig_type = 'Inst%s' % brig_opcode
            for type_name in types:
                decoder_code(decode_case2_prolog)
                fmt = '%s<%s,%%s>' % (class_name, type_name.upper())
                for type2_name in types2:
                    full_class_name = fmt % type2_name.upper()
                    constructor = constructor_prefix + full_class_name
                    decoder_code(decode_case2_entry)

                decoder_code(decode_case2_epilog)

        decoder_code(decode_case_epilog)

###############
#
# Generate instructions
#
###############

# handy abbreviations for common sets of types

# arithmetic ops are typically defined only on 32- and 64-bit sizes
arith_int_types = ('S32', 'U32', 'S64', 'U64')
arith_float_types = ('F32', 'F64')
arith_types = arith_int_types + arith_float_types

bit_types = ('B1', 'B32', 'B64')

all_int_types = ('S8', 'U8', 'S16', 'U16') + arith_int_types

# I think you might be able to do 'f16' memory ops too, but we'll
# ignore them for now.
mem_types = all_int_types + arith_float_types
mem_atom_types = all_int_types + ('B32', 'B64')

##### Arithmetic & logical operations
gen('Add', arith_types, 'src0 + src1')
gen('Sub', arith_types, 'src0 - src1')
gen('Mul', arith_types, 'src0 * src1')
gen('Div', arith_types, 'src0 / src1')
gen('Min', arith_types, 'std::min(src0, src1)')
gen('Max', arith_types, 'std::max(src0, src1)')
gen('Gcnmin', arith_types, 'std::min(src0, src1)')

gen('CopySign', arith_float_types,
    'src1 < 0 ? -std::abs(src0) : std::abs(src0)')
gen('Sqrt', arith_float_types, 'sqrt(src0)')
gen('Floor', arith_float_types, 'floor(src0)')

# "fast" sqrt... same as slow for us
gen('Nsqrt', arith_float_types, 'sqrt(src0)')
gen('Nrsqrt', arith_float_types, '1.0/sqrt(src0)')
gen('Nrcp', arith_float_types, '1.0/src0')
gen('Fract', arith_float_types,
    '(src0 >= 0.0)?(src0-floor(src0)):(floor(src0)-src0)')

gen('Ncos', arith_float_types, 'cos(src0)')
gen('Nsin', arith_float_types, 'sin(src0)')

gen('And', bit_types, 'src0 & src1')
gen('Or', bit_types, 'src0 | src1')
gen('Xor', bit_types, 'src0 ^ src1')

gen('Bitselect', bit_types, '(src1 & src0) | (src2 & ~src0)')
gen('Firstbit', bit_types, 'firstbit(src0)')
gen('Popcount', ('B32', 'B64'), '__builtin_popcount(src0)')

gen('Shl', arith_int_types, 'src0 << (unsigned)src1', 'ShiftInst')
gen('Shr', arith_int_types, 'src0 >> (unsigned)src1', 'ShiftInst')

# gen('Mul_hi', types=('s32','u32', '??'))
# gen('Mul24', types=('s32','u32', '??'))
gen('Rem', arith_int_types, 'src0 - ((src0 / src1) * src1)')

gen('Abs', arith_types, 'std::abs(src0)')
gen('Neg', arith_types, '-src0')

gen('Mov', bit_types, 'src0')
gen('Not', bit_types, 'heynot(src0)')

# mad and fma differ only in rounding behavior, which we don't emulate
# also there's an integer form of mad, but not of fma
gen('Mad', arith_types, 'src0 * src1 + src2')
gen('Fma', arith_float_types, 'src0 * src1 + src2')

# native floating point operations
gen('Nfma', arith_float_types, 'src0 * src1 + src2')

gen('Cmov', bit_types, 'src0 ? src1 : src2', 'CmovInst')
gen('BitAlign', bit_types, '(src0 << src2)|(src1 >> (32 - src2))')
gen('ByteAlign', bit_types, '(src0 << 8 * src2)|(src1 >> (32 - 8 * src2))')

# see base/bitfield.hh
gen('BitExtract', arith_int_types, 'bits(src0, src1, src1 + src2 - 1)',
    'ExtractInsertInst')

gen('BitInsert', arith_int_types, 'insertBits(dest, src1, src2, src0)',
    'ExtractInsertInst')

##### Compare
gen('Cmp', ('B1', 'S32', 'U32', 'F32'), 'compare(src0, src1, this->cmpOp)',
    'CmpInst', ('sourceType', arith_types + bit_types))
gen('Class', arith_float_types, 'fpclassify(src0,src1)', 'ClassInst')

##### Conversion

# Conversion operations are only defined on B1, not B32 or B64
cvt_types = ('B1',) + mem_types

gen('Cvt', cvt_types, 'src0', 'CvtInst', ('sourceType', cvt_types))


##### Load & Store
gen('Lda', mem_types, base_class='LdInst', constructor_prefix='decode')
gen('Ld', mem_types, base_class='LdInst', constructor_prefix='decode')
gen('St', mem_types, base_class='StInst', constructor_prefix='decode',
    is_store=True)
gen('Atomic', mem_atom_types, base_class='StInst', constructor_prefix='decode')
gen('AtomicNoRet', mem_atom_types, base_class='StInst',
    constructor_prefix='decode')

gen('Cbr', base_class='LdInst', constructor_prefix='decode')
gen('Br', base_class='LdInst', constructor_prefix='decode')

##### Special operations
def gen_special(brig_opcode, expr, dest_type='U32'):
    num_srcs = num_src_operands(expr)
    if num_srcs == 0:
        base_class = 'SpecialInstNoSrc<%s>' % dest_type
    elif num_srcs == 1:
        base_class = 'SpecialInst1Src<%s>' % dest_type
    else:
        assert False

    gen(brig_opcode, None, expr, base_class)

gen_special('WorkItemId', 'w->workitemid[src0][lane]')
gen_special('WorkItemAbsId',
    'w->workitemid[src0][lane] + (w->workgroupid[src0] * w->workgroupsz[src0])')
gen_special('WorkGroupId', 'w->workgroupid[src0]')
gen_special('WorkGroupSize', 'w->workgroupsz[src0]')
gen_special('CurrentWorkGroupSize', 'w->workgroupsz[src0]')
gen_special('GridSize', 'w->gridsz[src0]')
gen_special('GridGroups',
    'divCeil(w->gridsz[src0],w->workgroupsz[src0])')
gen_special('LaneId', 'lane')
gen_special('WaveId', 'w->dynwaveid')
gen_special('Clock', 'w->computeUnit->shader->tick_cnt', 'U64')

# gen_special('CU'', ')

gen('Ret', base_class='SpecialInstNoSrcNoDest')
gen('Barrier', base_class='SpecialInstNoSrcNoDest')
gen('MemFence', base_class='SpecialInstNoSrcNoDest')

# Map magic instructions to the BrigSyscall opcode
# Magic instructions are defined in magic.hh
#
# In the future, real HSA kernel system calls can be implemented and coexist
# with magic instructions.
gen('Call', base_class='SpecialInstNoSrcNoDest')

###############
#
# Generate file epilogs
#
###############
header_code.dedent()
header_code('''
} // namespace HsailISA
''')

# close off main decode switch
decoder_code.dedent()
decoder_code.dedent()
decoder_code('''
    default: fatal("unrecognized Brig opcode %d\\n", ib->opcode);
    } // end switch(ib->opcode)
} // end decode()
} // namespace HsailISA
''')

exec_code.dedent()
exec_code('''
} // namespace HsailISA
''')

###############
#
# Output accumulated code to files
#
###############
header_code.write(sys.argv[1])
decoder_code.write(sys.argv[2])
exec_code.write(sys.argv[3])
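The three write() calls imply the script is run with three output paths, presumably supplied by the SCons build; a hypothetical invocation (file names invented for illustration):

    # python gen.py hsail_decl.hh hsail_decoder.cc hsail_exec.cc
    #   argv[1]: class declarations     (header_code)
    #   argv[2]: the decode switch      (decoder_code)
    #   argv[3]: execute() definitions  (exec_code)
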
47
src/arch/hsail/generic_types.cc
Normal file
@@ -0,0 +1,47 @@
#include "arch/hsail/generic_types.hh"
#include "base/misc.hh"

using namespace Brig;

namespace HsailISA
{
    Enums::GenericMemoryOrder
    getGenericMemoryOrder(BrigMemoryOrder brig_memory_order)
    {
        switch(brig_memory_order) {
          case BRIG_MEMORY_ORDER_NONE:
            return Enums::MEMORY_ORDER_NONE;
          case BRIG_MEMORY_ORDER_RELAXED:
            return Enums::MEMORY_ORDER_RELAXED;
          case BRIG_MEMORY_ORDER_SC_ACQUIRE:
            return Enums::MEMORY_ORDER_SC_ACQUIRE;
          case BRIG_MEMORY_ORDER_SC_RELEASE:
            return Enums::MEMORY_ORDER_SC_RELEASE;
          case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
            return Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE;
          default:
            fatal("HsailISA::MemInst::getGenericMemoryOrder -> "
                  "bad BrigMemoryOrder\n");
        }
    }

    Enums::GenericMemoryScope
    getGenericMemoryScope(BrigMemoryScope brig_memory_scope)
    {
        switch(brig_memory_scope) {
          case BRIG_MEMORY_SCOPE_NONE:
            return Enums::MEMORY_SCOPE_NONE;
          case BRIG_MEMORY_SCOPE_WORKITEM:
            return Enums::MEMORY_SCOPE_WORKITEM;
          case BRIG_MEMORY_SCOPE_WORKGROUP:
            return Enums::MEMORY_SCOPE_WORKGROUP;
          case BRIG_MEMORY_SCOPE_AGENT:
            return Enums::MEMORY_SCOPE_DEVICE;
          case BRIG_MEMORY_SCOPE_SYSTEM:
            return Enums::MEMORY_SCOPE_SYSTEM;
          default:
            fatal("HsailISA::MemInst::getGenericMemoryScope -> "
                  "bad BrigMemoryScope\n");
        }
    }
} // namespace HsailISA
16
src/arch/hsail/generic_types.hh
Normal file
@@ -0,0 +1,16 @@
#ifndef __ARCH_HSAIL_GENERIC_TYPES_HH__
#define __ARCH_HSAIL_GENERIC_TYPES_HH__

#include "arch/hsail/Brig.h"
#include "enums/GenericMemoryOrder.hh"
#include "enums/GenericMemoryScope.hh"

namespace HsailISA
{
    Enums::GenericMemoryOrder
    getGenericMemoryOrder(Brig::BrigMemoryOrder brig_memory_order);
    Enums::GenericMemoryScope
    getGenericMemoryScope(Brig::BrigMemoryScope brig_memory_scope);
} // namespace HsailISA

#endif // __ARCH_HSAIL_GENERIC_TYPES_HH__
77
src/arch/hsail/gpu_decoder.hh
Normal file
@@ -0,0 +1,77 @@
/*
 * Copyright (c) 2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Anthony Gutierrez
 */

#ifndef __ARCH_HSAIL_GPU_DECODER_HH__
#define __ARCH_HSAIL_GPU_DECODER_HH__

#include <vector>

#include "arch/hsail/gpu_types.hh"

class BrigObject;
class GPUStaticInst;

namespace Brig
{
    class BrigInstBase;
}

namespace HsailISA
{
    class Decoder
    {
      public:
        GPUStaticInst* decode(MachInst machInst);

        GPUStaticInst*
        decode(RawMachInst inst)
        {
            return inst < decodedInsts.size() ? decodedInsts.at(inst) : nullptr;
        }

        RawMachInst
        saveInst(GPUStaticInst *decodedInst)
        {
            decodedInsts.push_back(decodedInst);

            return decodedInsts.size() - 1;
        }

      private:
        static std::vector<GPUStaticInst*> decodedInsts;
    };
} // namespace HsailISA

#endif // __ARCH_HSAIL_GPU_DECODER_HH__
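The class implies a save/replay contract: saveInst() hands back an index (a RawMachInst, defined in gpu_types.hh below) that the lightweight decode(RawMachInst) overload maps straight back to the cached instruction. A sketch, assuming a decoder and a machInst are in scope:

    HsailISA::Decoder decoder;
    GPUStaticInst *inst = decoder.decode(machInst);      // full BRIG decode
    HsailISA::RawMachInst idx = decoder.saveInst(inst);  // remember it
    assert(decoder.decode(idx) == inst);                 // cheap replay by index
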
69
src/arch/hsail/gpu_types.hh
Normal file
@@ -0,0 +1,69 @@
/*
 * Copyright (c) 2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * [license text identical to the notice reproduced in full in
 *  gpu_decoder.hh above]
 *
 * Author: Anthony Gutierrez
 */

#ifndef __ARCH_HSAIL_GPU_TYPES_HH__
#define __ARCH_HSAIL_GPU_TYPES_HH__

#include <cstdint>

namespace Brig
{
    class BrigInstBase;
}

class BrigObject;

namespace HsailISA
{
    // A raw machine instruction represents the raw bits that
    // our model uses to represent an actual instruction. In
    // the case of HSAIL this is just an index into a list of
    // instruction objects.
    typedef uint64_t RawMachInst;

    // The MachInst is a representation of an instruction
    // that has more information than just the machine code.
    // For HSAIL the actual machine code is a BrigInstBase
    // and the BrigObject contains more pertinent
    // information related to operands, etc.

    struct MachInst
    {
        const Brig::BrigInstBase *brigInstBase;
        const BrigObject *brigObj;
    };
}

#endif // __ARCH_HSAIL_GPU_TYPES_HH__
86
src/arch/hsail/insts/branch.cc
Normal file
@@ -0,0 +1,86 @@
/*
 * Copyright (c) 2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * [license text identical to the notice reproduced in full in
 *  gpu_decoder.hh above]
 *
 * Author: Anthony Gutierrez
 */

#include "arch/hsail/insts/branch.hh"

#include "gpu-compute/hsail_code.hh"

namespace HsailISA
{
    GPUStaticInst*
    decodeBrn(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        // Detect direct vs indirect branch by seeing whether we have a
        // register operand.
        unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
        const Brig::BrigOperand *reg = obj->getOperand(op_offs);

        if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
            return new BrnIndirectInst(ib, obj);
        } else {
            return new BrnDirectInst(ib, obj);
        }
    }

    GPUStaticInst*
    decodeCbr(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        // Detect direct vs indirect branch by seeing whether we have a
        // second register operand (after the condition).
        unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
        const Brig::BrigOperand *reg = obj->getOperand(op_offs);

        if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
            return new CbrIndirectInst(ib, obj);
        } else {
            return new CbrDirectInst(ib, obj);
        }
    }

    GPUStaticInst*
    decodeBr(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        // Detect direct vs indirect branch by seeing whether we have a
        // second register operand (after the condition).
        unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
        const Brig::BrigOperand *reg = obj->getOperand(op_offs);

        if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
            return new BrIndirectInst(ib, obj);
        } else {
            return new BrDirectInst(ib, obj);
        }
    }
} // namespace HsailISA
442
src/arch/hsail/insts/branch.hh
Normal file
@@ -0,0 +1,442 @@
/*
 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * [license text identical to the notice reproduced in full in
 *  gpu_decoder.hh above]
 *
 * Author: Steve Reinhardt
 */

#ifndef __ARCH_HSAIL_INSTS_BRANCH_HH__
#define __ARCH_HSAIL_INSTS_BRANCH_HH__

#include "arch/hsail/insts/gpu_static_inst.hh"
#include "arch/hsail/operand.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/wavefront.hh"

namespace HsailISA
{

    // The main difference between a direct branch and an indirect branch
    // is whether the target is a register or a label, so we can share a
    // lot of code if we template the base implementation on that type.
    template<typename TargetType>
    class BrnInstBase : public HsailGPUStaticInst
    {
      public:
        void generateDisassembly();

        Brig::BrigWidth8_t width;
        TargetType target;

        BrnInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj)
            : HsailGPUStaticInst(obj, "brn")
        {
            o_type = Enums::OT_BRANCH;
            width = ((Brig::BrigInstBr*)ib)->width;
            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
            target.init(op_offs, obj);
        }

        uint32_t getTargetPc() override { return target.getTarget(0, 0); }

        bool unconditionalJumpInstruction() override { return true; }
        bool isVectorRegister(int operandIndex) {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return target.isVectorRegister();
        }
        bool isCondRegister(int operandIndex) {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return target.isCondRegister();
        }
        bool isScalarRegister(int operandIndex) {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return target.isScalarRegister();
        }

        bool isSrcOperand(int operandIndex) {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return true;
        }

        bool isDstOperand(int operandIndex) {
            return false;
        }

        int getOperandSize(int operandIndex) {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return target.opSize();
        }

        int getRegisterIndex(int operandIndex) {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return target.regIndex();
        }

        int getNumOperands() {
            return 1;
        }

        void execute(GPUDynInstPtr gpuDynInst);
    };

    template<typename TargetType>
    void
    BrnInstBase<TargetType>::generateDisassembly()
    {
        std::string widthClause;

        if (width != 1) {
            widthClause = csprintf("_width(%d)", width);
        }

        disassembly = csprintf("%s%s %s", opcode, widthClause,
                               target.disassemble());
    }

    template<typename TargetType>
    void
    BrnInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *w = gpuDynInst->wavefront();

        if (getTargetPc() == w->rpc()) {
            w->popFromReconvergenceStack();
        } else {
            // Rpc and execution mask remain the same
            w->pc(getTargetPc());
        }
        w->discardFetch();
    }

    class BrnDirectInst : public BrnInstBase<LabelOperand>
    {
      public:
        BrnDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
            : BrnInstBase<LabelOperand>(ib, obj)
        {
        }
        int numSrcRegOperands() { return 0; }
        int numDstRegOperands() { return 0; }
    };

    class BrnIndirectInst : public BrnInstBase<SRegOperand>
    {
      public:
        BrnIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
            : BrnInstBase<SRegOperand>(ib, obj)
        {
        }
        int numSrcRegOperands() { return target.isVectorRegister(); }
        int numDstRegOperands() { return 0; }
    };

    GPUStaticInst* decodeBrn(const Brig::BrigInstBase *ib,
                             const BrigObject *obj);

    template<typename TargetType>
    class CbrInstBase : public HsailGPUStaticInst
    {
      public:
        void generateDisassembly();

        Brig::BrigWidth8_t width;
        CRegOperand cond;
        TargetType target;

        CbrInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj)
            : HsailGPUStaticInst(obj, "cbr")
        {
            o_type = Enums::OT_BRANCH;
            width = ((Brig::BrigInstBr *)ib)->width;
            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
            cond.init(op_offs, obj);
            op_offs = obj->getOperandPtr(ib->operands, 1);
            target.init(op_offs, obj);
        }

        uint32_t getTargetPc() override { return target.getTarget(0, 0); }

        void execute(GPUDynInstPtr gpuDynInst);
        // Assumption: Target is operand 0, Condition Register is operand 1
        bool isVectorRegister(int operandIndex) {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            if (!operandIndex)
                return target.isVectorRegister();
            else
                return false;
        }
        bool isCondRegister(int operandIndex) {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            if (!operandIndex)
                return target.isCondRegister();
            else
                return true;
        }
        bool isScalarRegister(int operandIndex) {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (!operandIndex)
                return target.isScalarRegister();
            else
                return false;
        }
        bool isSrcOperand(int operandIndex) {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == 0)
                return true;
            return false;
        }
        // both Condition Register and Target are source operands
        bool isDstOperand(int operandIndex) {
            return false;
        }
        int getOperandSize(int operandIndex) {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            if (!operandIndex)
                return target.opSize();
            else
                return 1;
        }
        int getRegisterIndex(int operandIndex) {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            if (!operandIndex)
                return target.regIndex();
            else
                return -1;
        }

        // Operands = Target, Condition Register
        int getNumOperands() {
            return 2;
        }
    };

    template<typename TargetType>
    void
    CbrInstBase<TargetType>::generateDisassembly()
    {
        std::string widthClause;

        if (width != 1) {
            widthClause = csprintf("_width(%d)", width);
        }

        disassembly = csprintf("%s%s %s,%s", opcode, widthClause,
                               cond.disassemble(), target.disassemble());
    }

    template<typename TargetType>
    void
    CbrInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *w = gpuDynInst->wavefront();

        const uint32_t curr_pc = w->pc();
        const uint32_t curr_rpc = w->rpc();
        const VectorMask curr_mask = w->execMask();

        /**
         * TODO: can we move this pop outside the instruction, and
         * into the wavefront?
         */
        w->popFromReconvergenceStack();

        // immediate post-dominator instruction
        const uint32_t rpc = static_cast<uint32_t>(ipdInstNum());
        if (curr_rpc != rpc) {
            w->pushToReconvergenceStack(rpc, curr_rpc, curr_mask);
        }

        // taken branch
        const uint32_t true_pc = getTargetPc();
        VectorMask true_mask;
        for (unsigned int lane = 0; lane < VSZ; ++lane) {
            true_mask[lane] = cond.get<bool>(w, lane) & curr_mask[lane];
        }

        // not taken branch
        const uint32_t false_pc = curr_pc + 1;
        assert(true_pc != false_pc);
        if (false_pc != rpc && true_mask.count() < curr_mask.count()) {
            VectorMask false_mask = curr_mask & ~true_mask;
            w->pushToReconvergenceStack(false_pc, rpc, false_mask);
        }

        if (true_pc != rpc && true_mask.count()) {
            w->pushToReconvergenceStack(true_pc, rpc, true_mask);
        }
        assert(w->pc() != curr_pc);
        w->discardFetch();
    }

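A worked illustration of the mask bookkeeping above (invented numbers, wavefront narrowed to four lanes):

    // curr_mask = 1111, per-lane cond = 1010, so:
    //   true_mask  = 1010 -> pushed with true_pc (the branch target)
    //   false_mask = 0101 -> pushed with false_pc = curr_pc + 1
    // The false side is pushed before the true side, so the taken path
    // runs first; both paths eventually pop back at the reconvergence
    // point rpc, where the stacked entries restore the full mask.
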
    class CbrDirectInst : public CbrInstBase<LabelOperand>
    {
      public:
        CbrDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
            : CbrInstBase<LabelOperand>(ib, obj)
        {
        }
        // the source operand of a conditional branch is a Condition
        // Register which is not stored in the VRF
        // so we do not count it as a source-register operand
        // even though, formally, it is one.
        int numSrcRegOperands() { return 0; }
        int numDstRegOperands() { return 0; }
    };

    class CbrIndirectInst : public CbrInstBase<SRegOperand>
    {
      public:
        CbrIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
            : CbrInstBase<SRegOperand>(ib, obj)
        {
        }
        // one source operand of the conditional indirect branch is a
        // Condition Register which is not stored in the VRF so we do not
        // count it as a source-register operand even though, formally,
        // it is one.
        int numSrcRegOperands() { return target.isVectorRegister(); }
        int numDstRegOperands() { return 0; }
    };

    GPUStaticInst* decodeCbr(const Brig::BrigInstBase *ib,
                             const BrigObject *obj);

    template<typename TargetType>
    class BrInstBase : public HsailGPUStaticInst
    {
      public:
        void generateDisassembly();

        ImmOperand<uint32_t> width;
        TargetType target;

        BrInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj)
            : HsailGPUStaticInst(obj, "br")
        {
            o_type = Enums::OT_BRANCH;
            width.init(((Brig::BrigInstBr *)ib)->width, obj);
            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
            target.init(op_offs, obj);
        }

        uint32_t getTargetPc() override { return target.getTarget(0, 0); }

        bool unconditionalJumpInstruction() override { return true; }

        void execute(GPUDynInstPtr gpuDynInst);
        bool isVectorRegister(int operandIndex) {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return target.isVectorRegister();
        }
        bool isCondRegister(int operandIndex) {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return target.isCondRegister();
        }
        bool isScalarRegister(int operandIndex) {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return target.isScalarRegister();
        }
        bool isSrcOperand(int operandIndex) {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return true;
        }
        bool isDstOperand(int operandIndex) { return false; }
        int getOperandSize(int operandIndex) {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return target.opSize();
        }
        int getRegisterIndex(int operandIndex) {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return target.regIndex();
        }
        int getNumOperands() { return 1; }
    };

    template<typename TargetType>
    void
    BrInstBase<TargetType>::generateDisassembly()
    {
        std::string widthClause;

        if (width.bits != 1) {
            widthClause = csprintf("_width(%d)", width.bits);
        }

        disassembly = csprintf("%s%s %s", opcode, widthClause,
                               target.disassemble());
    }

    template<typename TargetType>
    void
    BrInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *w = gpuDynInst->wavefront();

        if (getTargetPc() == w->rpc()) {
            w->popFromReconvergenceStack();
        } else {
            // Rpc and execution mask remain the same
            w->pc(getTargetPc());
        }
        w->discardFetch();
    }

    class BrDirectInst : public BrInstBase<LabelOperand>
    {
      public:
        BrDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
            : BrInstBase<LabelOperand>(ib, obj)
        {
        }

        int numSrcRegOperands() { return 0; }
        int numDstRegOperands() { return 0; }
    };

    class BrIndirectInst : public BrInstBase<SRegOperand>
    {
      public:
        BrIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
            : BrInstBase<SRegOperand>(ib, obj)
        {
        }
        int numSrcRegOperands() { return target.isVectorRegister(); }
        int numDstRegOperands() { return 0; }
    };

    GPUStaticInst* decodeBr(const Brig::BrigInstBase *ib,
                            const BrigObject *obj);
} // namespace HsailISA

#endif // __ARCH_HSAIL_INSTS_BRANCH_HH__
1106
src/arch/hsail/insts/decl.hh
Normal file
File diff suppressed because it is too large
64
src/arch/hsail/insts/gpu_static_inst.cc
Normal file
@@ -0,0 +1,64 @@
/*
 * Copyright (c) 2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * [license text identical to the notice reproduced in full in
 *  gpu_decoder.hh above]
 *
 * Author: Anthony Gutierrez
 */

#include "arch/hsail/insts/gpu_static_inst.hh"

#include "gpu-compute/brig_object.hh"

namespace HsailISA
{
    HsailGPUStaticInst::HsailGPUStaticInst(const BrigObject *obj,
                                           const std::string &opcode)
        : GPUStaticInst(opcode), hsailCode(obj->currentCode)
    {
    }

    void
    HsailGPUStaticInst::generateDisassembly()
    {
        disassembly = opcode;
    }

    const std::string&
    HsailGPUStaticInst::disassemble()
    {
        if (disassembly.empty()) {
            generateDisassembly();
            assert(!disassembly.empty());
        }

        return disassembly;
    }
} // namespace HsailISA
65
src/arch/hsail/insts/gpu_static_inst.hh
Normal file
@@ -0,0 +1,65 @@
/*
 * Copyright (c) 2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * [license text identical to the notice reproduced in full in
 *  gpu_decoder.hh above]
 *
 * Author: Anthony Gutierrez
 */

#ifndef __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__
#define __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__

/*
 * @file gpu_static_inst.hh
 *
 * Defines the base class representing HSAIL GPU static instructions.
 */

#include "gpu-compute/gpu_static_inst.hh"

class BrigObject;
class HsailCode;

namespace HsailISA
{
    class HsailGPUStaticInst : public GPUStaticInst
    {
      public:
        HsailGPUStaticInst(const BrigObject *obj, const std::string &opcode);
        void generateDisassembly();
        const std::string &disassemble();
        uint32_t instSize() { return 4; }

      protected:
        HsailCode *hsailCode;
    };
} // namespace HsailISA

#endif // __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__
208
src/arch/hsail/insts/main.cc
Normal file
@@ -0,0 +1,208 @@
/*
 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * [license text identical to the notice reproduced in full in
 *  gpu_decoder.hh above]
 *
 * Author: Steve Reinhardt
 */

#include "arch/hsail/insts/decl.hh"
#include "debug/GPUExec.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/simple_pool_manager.hh"

namespace HsailISA
{
    template<> const char *B1::label = "b1";
    template<> const char *B8::label = "b8";
    template<> const char *B16::label = "b16";
    template<> const char *B32::label = "b32";
    template<> const char *B64::label = "b64";

    template<> const char *S8::label = "s8";
    template<> const char *S16::label = "s16";
    template<> const char *S32::label = "s32";
    template<> const char *S64::label = "s64";

    template<> const char *U8::label = "u8";
    template<> const char *U16::label = "u16";
    template<> const char *U32::label = "u32";
    template<> const char *U64::label = "u64";

    template<> const char *F32::label = "f32";
    template<> const char *F64::label = "f64";

    const char*
    cmpOpToString(Brig::BrigCompareOperation cmpOp)
    {
        using namespace Brig;

        switch (cmpOp) {
          case BRIG_COMPARE_EQ:
            return "eq";
          case BRIG_COMPARE_NE:
            return "ne";
          case BRIG_COMPARE_LT:
            return "lt";
          case BRIG_COMPARE_LE:
            return "le";
          case BRIG_COMPARE_GT:
            return "gt";
          case BRIG_COMPARE_GE:
            return "ge";
          case BRIG_COMPARE_EQU:
            return "equ";
          case BRIG_COMPARE_NEU:
            return "neu";
          case BRIG_COMPARE_LTU:
            return "ltu";
          case BRIG_COMPARE_LEU:
            return "leu";
          case BRIG_COMPARE_GTU:
            return "gtu";
          case BRIG_COMPARE_GEU:
            return "geu";
          case BRIG_COMPARE_NUM:
            return "num";
          case BRIG_COMPARE_NAN:
            return "nan";
          case BRIG_COMPARE_SEQ:
            return "seq";
          case BRIG_COMPARE_SNE:
            return "sne";
          case BRIG_COMPARE_SLT:
            return "slt";
          case BRIG_COMPARE_SLE:
            return "sle";
          case BRIG_COMPARE_SGT:
            return "sgt";
          case BRIG_COMPARE_SGE:
            return "sge";
          case BRIG_COMPARE_SGEU:
            return "sgeu";
          case BRIG_COMPARE_SEQU:
            return "sequ";
          case BRIG_COMPARE_SNEU:
            return "sneu";
          case BRIG_COMPARE_SLTU:
            return "sltu";
          case BRIG_COMPARE_SLEU:
            return "sleu";
          case BRIG_COMPARE_SNUM:
            return "snum";
          case BRIG_COMPARE_SNAN:
            return "snan";
          case BRIG_COMPARE_SGTU:
            return "sgtu";
          default:
            return "unknown";
        }
    }

    void
    Ret::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *w = gpuDynInst->wavefront();

        const VectorMask &mask = w->get_pred();

        // mask off completed work-items
        for (int lane = 0; lane < VSZ; ++lane) {
            if (mask[lane]) {
                w->init_mask[lane] = 0;
            }
        }

        // delete extra instructions fetched for completed work-items
        w->instructionBuffer.erase(w->instructionBuffer.begin() + 1,
                                   w->instructionBuffer.end());
        if (w->pendingFetch) {
            w->dropFetch = true;
        }

        // if all work-items have completed, then wave-front is done
        if (w->init_mask.none()) {
            w->status = Wavefront::S_STOPPED;

            int32_t refCount = w->computeUnit->getLds().
                decreaseRefCounter(w->dispatchid, w->wg_id);

            DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n",
                    w->computeUnit->cu_id, w->wg_id, refCount);

            // free the vector registers of the completed wavefront
            w->computeUnit->vectorRegsReserved[w->simdId] -=
                w->reservedVectorRegs;

            assert(w->computeUnit->vectorRegsReserved[w->simdId] >= 0);

            uint32_t endIndex = (w->startVgprIndex +
                                 w->reservedVectorRegs - 1) %
                w->computeUnit->vrf[w->simdId]->numRegs();

            w->computeUnit->vrf[w->simdId]->manager->
                freeRegion(w->startVgprIndex, endIndex);

            w->reservedVectorRegs = 0;
            w->startVgprIndex = 0;
            w->computeUnit->completedWfs++;

            DPRINTF(GPUExec, "Doing return for CU%d: WF[%d][%d][%d]\n",
                    w->computeUnit->cu_id, w->simdId, w->wfSlotId, w->wfDynId);

            if (!refCount) {
                // Notify Memory System of Kernel Completion
                // Kernel End = isKernel + isRelease
                w->status = Wavefront::S_RETURNING;
                GPUDynInstPtr local_mempacket = gpuDynInst;
                local_mempacket->memoryOrder = Enums::MEMORY_ORDER_SC_RELEASE;
                local_mempacket->scope = Enums::MEMORY_SCOPE_SYSTEM;
                local_mempacket->useContinuation = false;
                local_mempacket->simdId = w->simdId;
                local_mempacket->wfSlotId = w->wfSlotId;
                local_mempacket->wfDynId = w->wfDynId;
                w->computeUnit->injectGlobalMemFence(local_mempacket, true);
            } else {
                w->computeUnit->shader->dispatcher->scheduleDispatch();
            }
        }
    }

    void
    Barrier::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *w = gpuDynInst->wavefront();

        assert(w->barrier_cnt == w->old_barrier_cnt);
        w->barrier_cnt = w->old_barrier_cnt + 1;
        w->stalledAtBarrier = true;
    }
} // namespace HsailISA
139
src/arch/hsail/insts/mem.cc
Normal file
139
src/arch/hsail/insts/mem.cc
Normal file
|
@ -0,0 +1,139 @@
/*
 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */

#include "arch/hsail/insts/mem.hh"

#include "arch/hsail/Brig.h"
#include "enums/OpType.hh"

using namespace Brig;

namespace HsailISA
{
    const char* atomicOpToString(BrigAtomicOperation brigOp);

    Enums::MemOpType
    brigAtomicToMemOpType(BrigOpcode brigOpCode, BrigAtomicOperation brigOp)
    {
        if (brigOpCode == Brig::BRIG_OPCODE_ATOMIC) {
            switch (brigOp) {
              case BRIG_ATOMIC_AND:
                return Enums::MO_AAND;
              case BRIG_ATOMIC_OR:
                return Enums::MO_AOR;
              case BRIG_ATOMIC_XOR:
                return Enums::MO_AXOR;
              case BRIG_ATOMIC_CAS:
                return Enums::MO_ACAS;
              case BRIG_ATOMIC_EXCH:
                return Enums::MO_AEXCH;
              case BRIG_ATOMIC_ADD:
                return Enums::MO_AADD;
              case BRIG_ATOMIC_WRAPINC:
                return Enums::MO_AINC;
              case BRIG_ATOMIC_WRAPDEC:
                return Enums::MO_ADEC;
              case BRIG_ATOMIC_MIN:
                return Enums::MO_AMIN;
              case BRIG_ATOMIC_MAX:
                return Enums::MO_AMAX;
              case BRIG_ATOMIC_SUB:
                return Enums::MO_ASUB;
              default:
                fatal("Bad BrigAtomicOperation code %d\n", brigOp);
            }
        } else if (brigOpCode == Brig::BRIG_OPCODE_ATOMICNORET) {
            switch (brigOp) {
              case BRIG_ATOMIC_AND:
                return Enums::MO_ANRAND;
              case BRIG_ATOMIC_OR:
                return Enums::MO_ANROR;
              case BRIG_ATOMIC_XOR:
                return Enums::MO_ANRXOR;
              case BRIG_ATOMIC_CAS:
                return Enums::MO_ANRCAS;
              case BRIG_ATOMIC_EXCH:
                return Enums::MO_ANREXCH;
              case BRIG_ATOMIC_ADD:
                return Enums::MO_ANRADD;
              case BRIG_ATOMIC_WRAPINC:
                return Enums::MO_ANRINC;
              case BRIG_ATOMIC_WRAPDEC:
                return Enums::MO_ANRDEC;
              case BRIG_ATOMIC_MIN:
                return Enums::MO_ANRMIN;
              case BRIG_ATOMIC_MAX:
                return Enums::MO_ANRMAX;
              case BRIG_ATOMIC_SUB:
                return Enums::MO_ANRSUB;
              default:
                fatal("Bad BrigAtomicOperation code %d\n", brigOp);
            }
        } else {
            fatal("Bad BrigAtomicOpcode %d\n", brigOpCode);
        }
    }
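
    // For example, an HSAIL no-return atomic add maps through
    // brigAtomicToMemOpType(BRIG_OPCODE_ATOMICNORET, BRIG_ATOMIC_ADD) to
    // Enums::MO_ANRADD; the magic-instruction helpers in pseudo_inst.cc
    // later in this commit build their no-return global adds the same way.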

    const char*
    atomicOpToString(BrigAtomicOperation brigOp)
    {
        switch (brigOp) {
          case BRIG_ATOMIC_AND:
            return "and";
          case BRIG_ATOMIC_OR:
            return "or";
          case BRIG_ATOMIC_XOR:
            return "xor";
          case BRIG_ATOMIC_CAS:
            return "cas";
          case BRIG_ATOMIC_EXCH:
            return "exch";
          case BRIG_ATOMIC_ADD:
            return "add";
          case BRIG_ATOMIC_WRAPINC:
            return "inc";
          case BRIG_ATOMIC_WRAPDEC:
            return "dec";
          case BRIG_ATOMIC_MIN:
            return "min";
          case BRIG_ATOMIC_MAX:
            return "max";
          case BRIG_ATOMIC_SUB:
            return "sub";
          default:
            return "unknown";
        }
    }
} // namespace HsailISA

1629
src/arch/hsail/insts/mem.hh
Normal file
File diff suppressed because it is too large

660
src/arch/hsail/insts/mem_impl.hh
Normal file

@@ -0,0 +1,660 @@
/*
 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */

#include "arch/hsail/generic_types.hh"
#include "gpu-compute/hsail_code.hh"

// defined in code.cc, but not worth sucking in all of code.h for this
// at this point
extern const char *segmentNames[];

namespace HsailISA
{
    template<typename DestDataType, typename AddrRegOperandType>
    void
    LdaInst<DestDataType, AddrRegOperandType>::generateDisassembly()
    {
        this->disassembly = csprintf("%s_%s %s,%s", this->opcode,
                                     DestDataType::label,
                                     this->dest.disassemble(),
                                     this->addr.disassemble());
    }

    template<typename DestDataType, typename AddrRegOperandType>
    void
    LdaInst<DestDataType, AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *w = gpuDynInst->wavefront();

        typedef typename DestDataType::CType CType M5_VAR_USED;
        const VectorMask &mask = w->get_pred();
        uint64_t addr_vec[VSZ];
        this->addr.calcVector(w, addr_vec);

        for (int lane = 0; lane < VSZ; ++lane) {
            if (mask[lane]) {
                this->dest.set(w, lane, addr_vec[lane]);
            }
        }
    }

    template<typename MemDataType, typename DestDataType,
             typename AddrRegOperandType>
    void
    LdInst<MemDataType, DestDataType, AddrRegOperandType>::generateDisassembly()
    {
        switch (num_dest_operands) {
          case 1:
            this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode,
                                         segmentNames[this->segment],
                                         MemDataType::label,
                                         this->dest.disassemble(),
                                         this->addr.disassemble());
            break;
          case 2:
            this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode,
                                         segmentNames[this->segment],
                                         MemDataType::label,
                                         this->dest_vect[0].disassemble(),
                                         this->dest_vect[1].disassemble(),
                                         this->addr.disassemble());
            break;
          case 4:
            this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s",
                                         this->opcode,
                                         segmentNames[this->segment],
                                         MemDataType::label,
                                         this->dest_vect[0].disassemble(),
                                         this->dest_vect[1].disassemble(),
                                         this->dest_vect[2].disassemble(),
                                         this->dest_vect[3].disassemble(),
                                         this->addr.disassemble());
            break;
          default:
            fatal("Bad ld register dest operand, num vector operands: %d\n",
                  num_dest_operands);
            break;
        }
    }

    static Addr
    calcPrivAddr(Addr addr, Wavefront *w, int lane, GPUStaticInst *i)
    {
        // What is the size of the object we are accessing?
        // NOTE: the compiler doesn't generate enough information to tell,
        // so for now we just line up all the private work-item spaces
        // back to back.
        /*
        StorageElement* se =
            i->parent->findSymbol(Brig::BrigPrivateSpace, addr);
        assert(se);

        return w->wfSlotId * w->privSizePerItem * VSZ +
            se->offset * VSZ +
            lane * se->size;
        */

        // addressing strategy: interleave the private spaces of
        // work-items in a wave-front on 8 byte granularity.
        // this won't be perfect coalescing like the spill space
        // strategy, but it's better than nothing. The spill space
        // strategy won't work with private because the same address
        // may be accessed by different sized loads/stores.

        // Note: we assume that the largest load/store to private
        // is 8 bytes; if it is larger, the stride will have to increase

        Addr addr_div8 = addr / 8;
        Addr addr_mod8 = addr % 8;

        Addr ret = addr_div8 * 8 * VSZ + lane * 8 + addr_mod8 + w->privBase;
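
        // Worked example (assuming the usual VSZ = 64 wavefront width):
        // per-item private address 12 in lane 3 gives addr_div8 = 1 and
        // addr_mod8 = 4, so ret = 1*8*64 + 3*8 + 4 + privBase
        // = 540 + privBase. Byte 12 of every lane's private space thus
        // lands in the same 512-byte stripe, 8 bytes per lane.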

        assert(ret < w->privBase + (w->privSizePerItem * VSZ));

        return ret;
    }

    template<typename MemDataType, typename DestDataType,
             typename AddrRegOperandType>
    void
    LdInst<MemDataType, DestDataType,
           AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *w = gpuDynInst->wavefront();

        typedef typename MemDataType::CType MemCType;
        const VectorMask &mask = w->get_pred();

        // Kernarg references are handled uniquely for now (no Memory
        // Request is used), so special-case them up front. Someday we
        // should make this more realistic, at which point we should get
        // rid of this block and fold this case into the switch below.
        if (this->segment == Brig::BRIG_SEGMENT_KERNARG) {
            MemCType val;

            // I assume no vector ld for kernargs
            assert(num_dest_operands == 1);

            // assuming for the moment that we'll never do register
            // offsets into kernarg space... just to make life simpler
            uint64_t address = this->addr.calcUniform();

            val = *(MemCType*)&w->kernelArgs[address];

            DPRINTF(HSAIL, "ld_kernarg [%d] -> %d\n", address, val);

            for (int lane = 0; lane < VSZ; ++lane) {
                if (mask[lane]) {
                    this->dest.set(w, lane, val);
                }
            }

            return;
        } else if (this->segment == Brig::BRIG_SEGMENT_ARG) {
            uint64_t address = this->addr.calcUniform();
            for (int lane = 0; lane < VSZ; ++lane) {
                if (mask[lane]) {
                    MemCType val = w->readCallArgMem<MemCType>(lane, address);

                    DPRINTF(HSAIL, "ld_arg [%d] -> %llu\n", address,
                            (unsigned long long)val);

                    this->dest.set(w, lane, val);
                }
            }

            return;
        }

        GPUDynInstPtr m = gpuDynInst;

        this->addr.calcVector(w, m->addr);

        m->m_op = Enums::MO_LD;
        m->m_type = MemDataType::memType;
        m->v_type = DestDataType::vgprType;

        m->exec_mask = w->execMask();
        m->statusBitVector = 0;
        m->equiv = this->equivClass;
        m->memoryOrder = getGenericMemoryOrder(this->memoryOrder);

        m->scope = getGenericMemoryScope(this->memoryScope);

        if (num_dest_operands == 1) {
            m->dst_reg = this->dest.regIndex();
            m->n_reg = 1;
        } else {
            m->n_reg = num_dest_operands;
            for (int i = 0; i < num_dest_operands; ++i) {
                m->dst_reg_vec[i] = this->dest_vect[i].regIndex();
            }
        }

        m->simdId = w->simdId;
        m->wfSlotId = w->wfSlotId;
        m->wfDynId = w->wfDynId;
        m->kern_id = w->kern_id;
        m->cu_id = w->computeUnit->cu_id;
        m->latency.init(&w->computeUnit->shader->tick_cnt);

        switch (this->segment) {
          case Brig::BRIG_SEGMENT_GLOBAL:
            m->s_type = SEG_GLOBAL;
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));

            // this is a complete hack to get around a compiler bug
            // (the compiler currently generates global accesses for
            // private addresses, starting from 0, so we need to add the
            // private offset)
            for (int lane = 0; lane < VSZ; ++lane) {
                if (m->addr[lane] < w->privSizePerItem) {
                    if (mask[lane]) {
                        // what is the size of the object we are accessing?
                        // find the base for this wavefront

                        // calcPrivAddr will fail if accesses are unaligned
                        assert(!((sizeof(MemCType) - 1) & m->addr[lane]));

                        Addr privAddr = calcPrivAddr(m->addr[lane], w, lane,
                                                     this);

                        m->addr[lane] = privAddr;
                    }
                }
            }

            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
            w->outstanding_reqs_rd_gm++;
            w->rd_gm_reqs_in_pipe--;
            break;

          case Brig::BRIG_SEGMENT_SPILL:
            assert(num_dest_operands == 1);
            m->s_type = SEG_SPILL;
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));
            {
                for (int lane = 0; lane < VSZ; ++lane) {
                    // note: this calculation will NOT WORK if the compiler
                    // ever generates loads/stores to the same address with
                    // different widths (e.g., a ld_u32 addr and a ld_u16 addr)
                    if (mask[lane]) {
                        assert(m->addr[lane] < w->spillSizePerItem);

                        m->addr[lane] = m->addr[lane] * w->spillWidth +
                                        lane * sizeof(MemCType) + w->spillBase;

                        w->last_addr[lane] = m->addr[lane];
                    }
                }
            }

            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
            w->outstanding_reqs_rd_gm++;
            w->rd_gm_reqs_in_pipe--;
            break;

          case Brig::BRIG_SEGMENT_GROUP:
            m->s_type = SEG_SHARED;
            m->pipeId = LDSMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(24));
            w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
            w->outstanding_reqs_rd_lm++;
            w->rd_lm_reqs_in_pipe--;
            break;

          case Brig::BRIG_SEGMENT_READONLY:
            m->s_type = SEG_READONLY;
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));

            for (int lane = 0; lane < VSZ; ++lane) {
                if (mask[lane]) {
                    assert(m->addr[lane] + sizeof(MemCType) <= w->roSize);
                    m->addr[lane] += w->roBase;
                }
            }

            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
            w->outstanding_reqs_rd_gm++;
            w->rd_gm_reqs_in_pipe--;
            break;

          case Brig::BRIG_SEGMENT_PRIVATE:
            m->s_type = SEG_PRIVATE;
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));
            {
                for (int lane = 0; lane < VSZ; ++lane) {
                    if (mask[lane]) {
                        assert(m->addr[lane] < w->privSizePerItem);

                        m->addr[lane] = m->addr[lane] +
                            lane * sizeof(MemCType) + w->privBase;
                    }
                }
            }
            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
            w->outstanding_reqs_rd_gm++;
            w->rd_gm_reqs_in_pipe--;
            break;

          default:
            fatal("Load to unsupported segment %d %llx\n", this->segment,
                  m->addr[0]);
        }

        w->outstanding_reqs++;
        w->mem_reqs_in_pipe--;
    }

    template<typename OperationType, typename SrcDataType,
             typename AddrRegOperandType>
    void
    StInst<OperationType, SrcDataType,
           AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *w = gpuDynInst->wavefront();

        typedef typename OperationType::CType CType;

        const VectorMask &mask = w->get_pred();

        // arg references are handled uniquely for now (no Memory Request
        // is used), so special-case them up front. Someday we should
        // make this more realistic, at which point we should get rid of
        // this block and fold this case into the switch below.
        if (this->segment == Brig::BRIG_SEGMENT_ARG) {
            uint64_t address = this->addr.calcUniform();

            for (int lane = 0; lane < VSZ; ++lane) {
                if (mask[lane]) {
                    CType data = this->src.template get<CType>(w, lane);
                    DPRINTF(HSAIL, "st_arg [%d] <- %d\n", address, data);
                    w->writeCallArgMem<CType>(lane, address, data);
                }
            }

            return;
        }

        GPUDynInstPtr m = gpuDynInst;

        m->exec_mask = w->execMask();

        this->addr.calcVector(w, m->addr);

        if (num_src_operands == 1) {
            for (int lane = 0; lane < VSZ; ++lane) {
                if (mask[lane]) {
                    ((CType*)m->d_data)[lane] =
                        this->src.template get<CType>(w, lane);
                }
            }
        } else {
            for (int k = 0; k < num_src_operands; ++k) {
                for (int lane = 0; lane < VSZ; ++lane) {
                    if (mask[lane]) {
                        ((CType*)m->d_data)[k * VSZ + lane] =
                            this->src_vect[k].template get<CType>(w, lane);
                    }
                }
            }
        }

        m->m_op = Enums::MO_ST;
        m->m_type = OperationType::memType;
        m->v_type = OperationType::vgprType;

        m->statusBitVector = 0;
        m->equiv = this->equivClass;

        if (num_src_operands == 1) {
            m->n_reg = 1;
        } else {
            m->n_reg = num_src_operands;
        }

        m->memoryOrder = getGenericMemoryOrder(this->memoryOrder);

        m->scope = getGenericMemoryScope(this->memoryScope);

        m->simdId = w->simdId;
        m->wfSlotId = w->wfSlotId;
        m->wfDynId = w->wfDynId;
        m->kern_id = w->kern_id;
        m->cu_id = w->computeUnit->cu_id;
        m->latency.init(&w->computeUnit->shader->tick_cnt);

        switch (this->segment) {
          case Brig::BRIG_SEGMENT_GLOBAL:
            m->s_type = SEG_GLOBAL;
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));

            // this is a complete hack to get around a compiler bug
            // (the compiler currently generates global accesses for
            // private addresses, starting from 0, so we need to add the
            // private offset)
            for (int lane = 0; lane < VSZ; ++lane) {
                if (mask[lane]) {
                    if (m->addr[lane] < w->privSizePerItem) {

                        // calcPrivAddr will fail if accesses are unaligned
                        assert(!((sizeof(CType) - 1) & m->addr[lane]));

                        Addr privAddr = calcPrivAddr(m->addr[lane], w, lane,
                                                     this);

                        m->addr[lane] = privAddr;
                    }
                }
            }

            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
            w->outstanding_reqs_wr_gm++;
            w->wr_gm_reqs_in_pipe--;
            break;

          case Brig::BRIG_SEGMENT_SPILL:
            assert(num_src_operands == 1);
            m->s_type = SEG_SPILL;
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));
            {
                for (int lane = 0; lane < VSZ; ++lane) {
                    if (mask[lane]) {
                        assert(m->addr[lane] < w->spillSizePerItem);

                        m->addr[lane] = m->addr[lane] * w->spillWidth +
                                        lane * sizeof(CType) + w->spillBase;
                    }
                }
            }

            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
            w->outstanding_reqs_wr_gm++;
            w->wr_gm_reqs_in_pipe--;
            break;

          case Brig::BRIG_SEGMENT_GROUP:
            m->s_type = SEG_SHARED;
            m->pipeId = LDSMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(24));
            w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
            w->outstanding_reqs_wr_lm++;
            w->wr_lm_reqs_in_pipe--;
            break;

          case Brig::BRIG_SEGMENT_PRIVATE:
            m->s_type = SEG_PRIVATE;
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));
            {
                for (int lane = 0; lane < VSZ; ++lane) {
                    if (mask[lane]) {
                        assert(m->addr[lane] < w->privSizePerItem);
                        m->addr[lane] = m->addr[lane] +
                            lane * sizeof(CType) + w->privBase;
                    }
                }
            }

            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
            w->outstanding_reqs_wr_gm++;
            w->wr_gm_reqs_in_pipe--;
            break;

          default:
            fatal("Store to unsupported segment %d\n", this->segment);
        }

        w->outstanding_reqs++;
        w->mem_reqs_in_pipe--;
    }

    template<typename OperationType, typename SrcDataType,
             typename AddrRegOperandType>
    void
    StInst<OperationType, SrcDataType,
           AddrRegOperandType>::generateDisassembly()
    {
        switch (num_src_operands) {
          case 1:
            this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode,
                                         segmentNames[this->segment],
                                         OperationType::label,
                                         this->src.disassemble(),
                                         this->addr.disassemble());
            break;
          case 2:
            this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode,
                                         segmentNames[this->segment],
                                         OperationType::label,
                                         this->src_vect[0].disassemble(),
                                         this->src_vect[1].disassemble(),
                                         this->addr.disassemble());
            break;
          case 4:
            this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s",
                                         this->opcode,
                                         segmentNames[this->segment],
                                         OperationType::label,
                                         this->src_vect[0].disassemble(),
                                         this->src_vect[1].disassemble(),
                                         this->src_vect[2].disassemble(),
                                         this->src_vect[3].disassemble(),
                                         this->addr.disassemble());
            break;
          default:
            fatal("Bad st register src operand, num vector operands: "
                  "%d\n", num_src_operands);
            break;
        }
    }

    template<typename DataType, typename AddrRegOperandType, int NumSrcOperands,
             bool HasDst>
    void
    AtomicInst<DataType, AddrRegOperandType, NumSrcOperands,
               HasDst>::execute(GPUDynInstPtr gpuDynInst)
    {
        typedef typename DataType::CType CType;

        Wavefront *w = gpuDynInst->wavefront();

        GPUDynInstPtr m = gpuDynInst;

        this->addr.calcVector(w, m->addr);

        for (int lane = 0; lane < VSZ; ++lane) {
            ((CType *)m->a_data)[lane] =
                this->src[0].template get<CType>(w, lane);
        }

        // load second source operand for CAS
        if (NumSrcOperands > 1) {
            for (int lane = 0; lane < VSZ; ++lane) {
                ((CType*)m->x_data)[lane] =
                    this->src[1].template get<CType>(w, lane);
            }
        }

        assert(NumSrcOperands <= 2);

        m->m_op = this->opType;
        m->m_type = DataType::memType;
        m->v_type = DataType::vgprType;

        m->exec_mask = w->execMask();
        m->statusBitVector = 0;
        m->equiv = 0;  // atomics don't have an equivalence class operand
        m->n_reg = 1;
        m->memoryOrder = getGenericMemoryOrder(this->memoryOrder);

        m->scope = getGenericMemoryScope(this->memoryScope);

        if (HasDst) {
            m->dst_reg = this->dest.regIndex();
        }

        m->simdId = w->simdId;
        m->wfSlotId = w->wfSlotId;
        m->wfDynId = w->wfDynId;
        m->kern_id = w->kern_id;
        m->cu_id = w->computeUnit->cu_id;
        m->latency.init(&w->computeUnit->shader->tick_cnt);

        switch (this->segment) {
          case Brig::BRIG_SEGMENT_GLOBAL:
            m->s_type = SEG_GLOBAL;
            m->latency.set(w->computeUnit->shader->ticks(64));
            m->pipeId = GLBMEM_PIPE;

            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
            w->outstanding_reqs_wr_gm++;
            w->wr_gm_reqs_in_pipe--;
            w->outstanding_reqs_rd_gm++;
            w->rd_gm_reqs_in_pipe--;
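
            // an atomic is charged as both a read and a write, which is
            // why both the rd_gm and wr_gm counters move above; the group
            // (LDS) case below does the same with the _lm counters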

            break;

          case Brig::BRIG_SEGMENT_GROUP:
            m->s_type = SEG_SHARED;
            m->pipeId = LDSMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(24));
            w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
            w->outstanding_reqs_wr_lm++;
            w->wr_lm_reqs_in_pipe--;
            w->outstanding_reqs_rd_lm++;
            w->rd_lm_reqs_in_pipe--;
            break;

          default:
            fatal("Atomic op to unsupported segment %d\n",
                  this->segment);
        }

        w->outstanding_reqs++;
        w->mem_reqs_in_pipe--;
    }

    const char* atomicOpToString(Brig::BrigAtomicOperation atomicOp);

    template<typename DataType, typename AddrRegOperandType, int NumSrcOperands,
             bool HasDst>
    void
    AtomicInst<DataType, AddrRegOperandType, NumSrcOperands,
               HasDst>::generateDisassembly()
    {
        if (HasDst) {
            this->disassembly =
                csprintf("%s_%s_%s_%s %s,%s", this->opcode,
                         atomicOpToString(this->atomicOperation),
                         segmentNames[this->segment],
                         DataType::label, this->dest.disassemble(),
                         this->addr.disassemble());
        } else {
            this->disassembly =
                csprintf("%s_%s_%s_%s %s", this->opcode,
                         atomicOpToString(this->atomicOperation),
                         segmentNames[this->segment],
                         DataType::label, this->addr.disassemble());
        }

        for (int i = 0; i < NumSrcOperands; ++i) {
            this->disassembly += ",";
            this->disassembly += this->src[i].disassemble();
        }
    }
} // namespace HsailISA

787
src/arch/hsail/insts/pseudo_inst.cc
Normal file

@@ -0,0 +1,787 @@
/*
 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Marc Orr
 */

#include <csignal>

#include "arch/hsail/insts/decl.hh"
#include "arch/hsail/insts/mem.hh"

namespace HsailISA
{
    // Pseudo (or magic) instructions are overloaded on the HSAIL call
    // instruction, because of its flexible parameter signature.

    // To add a new magic instruction:
    // 1. Add an entry to the enum.
    // 2. Implement it in the switch statement below (Call::exec).
    // 3. Add a utility function to hsa/hsail-gpu-compute/util/magicinst.h,
    //    so it's easy to call from an OpenCL kernel.

    // This enum should be identical to the enum in
    // hsa/hsail-gpu-compute/util/magicinst.h
    enum
    {
        MAGIC_PRINT_WF_32 = 0,
        MAGIC_PRINT_WF_64,
        MAGIC_PRINT_LANE,
        MAGIC_PRINT_LANE_64,
        MAGIC_PRINT_WF_FLOAT,
        MAGIC_SIM_BREAK,
        MAGIC_PREF_SUM,
        MAGIC_REDUCTION,
        MAGIC_MASKLANE_LOWER,
        MAGIC_MASKLANE_UPPER,
        MAGIC_JOIN_WF_BAR,
        MAGIC_WAIT_WF_BAR,
        MAGIC_PANIC,
        MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG,
        MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG,
        MAGIC_LOAD_GLOBAL_U32_REG,
        MAGIC_XACT_CAS_LD,
        MAGIC_MOST_SIG_THD,
        MAGIC_MOST_SIG_BROADCAST,
        MAGIC_PRINT_WFID_32,
        MAGIC_PRINT_WFID_64
    };
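
    // Calling convention, as decoded in Call::execPseudoInst() below:
    // element 0 of the call's input operand (src1) carries one of the
    // opcodes above, elements 1..3 carry the per-lane arguments, and any
    // result is written back through the call's output operand (dest).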

    void
    Call::execPseudoInst(Wavefront *w, GPUDynInstPtr gpuDynInst)
    {
        const VectorMask &mask = w->get_pred();

        int op = 0;
        bool got_op = false;

        for (int lane = 0; lane < VSZ; ++lane) {
            if (mask[lane]) {
                int src_val0 = src1.get<int>(w, lane, 0);
                if (got_op) {
                    if (src_val0 != op) {
                        fatal("Multiple magic instructions per PC not "
                              "supported\n");
                    }
                } else {
                    op = src_val0;
                    got_op = true;
                }
            }
        }

        switch (op) {
          case MAGIC_PRINT_WF_32:
            MagicPrintWF32(w);
            break;
          case MAGIC_PRINT_WF_64:
            MagicPrintWF64(w);
            break;
          case MAGIC_PRINT_LANE:
            MagicPrintLane(w);
            break;
          case MAGIC_PRINT_LANE_64:
            MagicPrintLane64(w);
            break;
          case MAGIC_PRINT_WF_FLOAT:
            MagicPrintWFFloat(w);
            break;
          case MAGIC_SIM_BREAK:
            MagicSimBreak(w);
            break;
          case MAGIC_PREF_SUM:
            MagicPrefixSum(w);
            break;
          case MAGIC_REDUCTION:
            MagicReduction(w);
            break;
          case MAGIC_MASKLANE_LOWER:
            MagicMaskLower(w);
            break;
          case MAGIC_MASKLANE_UPPER:
            MagicMaskUpper(w);
            break;
          case MAGIC_JOIN_WF_BAR:
            MagicJoinWFBar(w);
            break;
          case MAGIC_WAIT_WF_BAR:
            MagicWaitWFBar(w);
            break;
          case MAGIC_PANIC:
            MagicPanic(w);
            break;

          // atomic instructions
          case MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG:
            MagicAtomicNRAddGlobalU32Reg(w, gpuDynInst);
            break;

          case MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG:
            MagicAtomicNRAddGroupU32Reg(w, gpuDynInst);
            break;

          case MAGIC_LOAD_GLOBAL_U32_REG:
            MagicLoadGlobalU32Reg(w, gpuDynInst);
            break;

          case MAGIC_XACT_CAS_LD:
            MagicXactCasLd(w);
            break;

          case MAGIC_MOST_SIG_THD:
            MagicMostSigThread(w);
            break;

          case MAGIC_MOST_SIG_BROADCAST:
            MagicMostSigBroadcast(w);
            break;

          case MAGIC_PRINT_WFID_32:
            MagicPrintWF32ID(w);
            break;

          case MAGIC_PRINT_WFID_64:
            MagicPrintWFID64(w);
            break;

          default:
            fatal("unrecognized magic instruction: %d\n", op);
        }
    }

    void
    Call::MagicPrintLane(Wavefront *w)
    {
#if TRACING_ON
        const VectorMask &mask = w->get_pred();
        for (int lane = 0; lane < VSZ; ++lane) {
            if (mask[lane]) {
                int src_val1 = src1.get<int>(w, lane, 1);
                int src_val2 = src1.get<int>(w, lane, 2);
                if (src_val2) {
                    DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n",
                             disassemble(), w->computeUnit->cu_id, w->simdId,
                             w->wfSlotId, lane, src_val1);
                } else {
                    DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n",
                             disassemble(), w->computeUnit->cu_id, w->simdId,
                             w->wfSlotId, lane, src_val1);
                }
            }
        }
#endif
    }

    void
    Call::MagicPrintLane64(Wavefront *w)
    {
#if TRACING_ON
        const VectorMask &mask = w->get_pred();
        for (int lane = 0; lane < VSZ; ++lane) {
            if (mask[lane]) {
                int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
                int src_val2 = src1.get<int>(w, lane, 2);
                if (src_val2) {
                    DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n",
                             disassemble(), w->computeUnit->cu_id, w->simdId,
                             w->wfSlotId, lane, src_val1);
                } else {
                    DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n",
                             disassemble(), w->computeUnit->cu_id, w->simdId,
                             w->wfSlotId, lane, src_val1);
                }
            }
        }
#endif
    }

    void
    Call::MagicPrintWF32(Wavefront *w)
    {
#if TRACING_ON
        const VectorMask &mask = w->get_pred();
        std::string res_str;
        res_str = csprintf("krl_prt (%s)\n", disassemble());

        for (int lane = 0; lane < VSZ; ++lane) {
            if (!(lane & 7)) {
                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
            }

            if (mask[lane]) {
                int src_val1 = src1.get<int>(w, lane, 1);
                int src_val2 = src1.get<int>(w, lane, 2);

                if (src_val2) {
                    res_str += csprintf("%08x", src_val1);
                } else {
                    res_str += csprintf("%08d", src_val1);
                }
            } else {
                res_str += csprintf("xxxxxxxx");
            }

            if ((lane & 7) == 7) {
                res_str += csprintf("\n");
            } else {
                res_str += csprintf(" ");
            }
        }

        res_str += "\n\n";
        DPRINTFN(res_str.c_str());
#endif
    }

    void
    Call::MagicPrintWF32ID(Wavefront *w)
    {
#if TRACING_ON
        const VectorMask &mask = w->get_pred();
        std::string res_str;
        int src_val3 = -1;
        res_str = csprintf("krl_prt (%s)\n", disassemble());

        for (int lane = 0; lane < VSZ; ++lane) {
            if (!(lane & 7)) {
                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
            }

            if (mask[lane]) {
                int src_val1 = src1.get<int>(w, lane, 1);
                int src_val2 = src1.get<int>(w, lane, 2);
                src_val3 = src1.get<int>(w, lane, 3);

                if (src_val2) {
                    res_str += csprintf("%08x", src_val1);
                } else {
                    res_str += csprintf("%08d", src_val1);
                }
            } else {
                res_str += csprintf("xxxxxxxx");
            }

            if ((lane & 7) == 7) {
                res_str += csprintf("\n");
            } else {
                res_str += csprintf(" ");
            }
        }

        res_str += "\n\n";
        if (w->wfDynId == src_val3) {
            DPRINTFN(res_str.c_str());
        }
#endif
    }

    void
    Call::MagicPrintWF64(Wavefront *w)
    {
#if TRACING_ON
        const VectorMask &mask = w->get_pred();
        std::string res_str;
        res_str = csprintf("krl_prt (%s)\n", disassemble());

        for (int lane = 0; lane < VSZ; ++lane) {
            if (!(lane & 3)) {
                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
            }

            if (mask[lane]) {
                int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
                int src_val2 = src1.get<int>(w, lane, 2);

                if (src_val2) {
                    res_str += csprintf("%016x", src_val1);
                } else {
                    res_str += csprintf("%016d", src_val1);
                }
            } else {
                res_str += csprintf("xxxxxxxxxxxxxxxx");
            }

            if ((lane & 3) == 3) {
                res_str += csprintf("\n");
            } else {
                res_str += csprintf(" ");
            }
        }

        res_str += "\n\n";
        DPRINTFN(res_str.c_str());
#endif
    }

    void
    Call::MagicPrintWFID64(Wavefront *w)
    {
#if TRACING_ON
        const VectorMask &mask = w->get_pred();
        std::string res_str;
        int src_val3 = -1;
        res_str = csprintf("krl_prt (%s)\n", disassemble());

        for (int lane = 0; lane < VSZ; ++lane) {
            if (!(lane & 3)) {
                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
            }

            if (mask[lane]) {
                int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
                int src_val2 = src1.get<int>(w, lane, 2);
                src_val3 = src1.get<int>(w, lane, 3);

                if (src_val2) {
                    res_str += csprintf("%016x", src_val1);
                } else {
                    res_str += csprintf("%016d", src_val1);
                }
            } else {
                res_str += csprintf("xxxxxxxxxxxxxxxx");
            }

            if ((lane & 3) == 3) {
                res_str += csprintf("\n");
            } else {
                res_str += csprintf(" ");
            }
        }

        res_str += "\n\n";
        if (w->wfDynId == src_val3) {
            DPRINTFN(res_str.c_str());
        }
#endif
    }

    void
    Call::MagicPrintWFFloat(Wavefront *w)
    {
#if TRACING_ON
        const VectorMask &mask = w->get_pred();
        std::string res_str;
        res_str = csprintf("krl_prt (%s)\n", disassemble());

        for (int lane = 0; lane < VSZ; ++lane) {
            if (!(lane & 7)) {
                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
            }

            if (mask[lane]) {
                float src_val1 = src1.get<float>(w, lane, 1);
                res_str += csprintf("%08f", src_val1);
            } else {
                res_str += csprintf("xxxxxxxx");
            }

            if ((lane & 7) == 7) {
                res_str += csprintf("\n");
            } else {
                res_str += csprintf(" ");
            }
        }

        res_str += "\n\n";
        DPRINTFN(res_str.c_str());
#endif
    }

    // raises a signal that GDB will catch;
    // when done with the break, type "signal 0" in gdb to continue
    void
    Call::MagicSimBreak(Wavefront *w)
    {
        std::string res_str;
        // print out state for this wavefront and then break
        res_str = csprintf("Breakpoint encountered for wavefront %i\n",
                           w->wfSlotId);

        res_str += csprintf("  Kern ID: %i\n", w->kern_id);
        res_str += csprintf("  Phase ID: %i\n", w->simdId);
        res_str += csprintf("  Executing on CU #%i\n", w->computeUnit->cu_id);
        res_str += csprintf("  Exec mask: ");

        for (int i = VSZ - 1; i >= 0; --i) {
            if (w->execMask(i))
                res_str += "1";
            else
                res_str += "0";

            if ((i & 7) == 7)
                res_str += " ";
        }

        res_str += csprintf("(0x%016llx)\n", w->execMask().to_ullong());

        res_str += "\nHelpful debugging hints:\n";
        res_str += "   Check out w->s_reg / w->d_reg for register state\n";

        res_str += "\n\n";
        DPRINTFN(res_str.c_str());
        fflush(stdout);

        raise(SIGTRAP);
    }

    void
    Call::MagicPrefixSum(Wavefront *w)
    {
        const VectorMask &mask = w->get_pred();
        int res = 0;

        for (int lane = 0; lane < VSZ; ++lane) {
            if (mask[lane]) {
                int src_val1 = src1.get<int>(w, lane, 1);
                dest.set<int>(w, lane, res);
                res += src_val1;
            }
        }
    }
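
    // Example: with all lanes active and per-lane inputs {3, 1, 4, 1, ...},
    // the lanes receive the exclusive prefix sums {0, 3, 4, 8, ...}: each
    // lane gets the sum of the inputs of all lower-numbered active lanes.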

    void
    Call::MagicReduction(Wavefront *w)
    {
        // reduction magic instruction
        // The reduction instruction takes up to 64 inputs (one from
        // each thread in a WF) and sums them. It returns the sum to
        // each thread in the WF.
        const VectorMask &mask = w->get_pred();
        int res = 0;

        for (int lane = 0; lane < VSZ; ++lane) {
            if (mask[lane]) {
                int src_val1 = src1.get<int>(w, lane, 1);
                res += src_val1;
            }
        }

        for (int lane = 0; lane < VSZ; ++lane) {
            if (mask[lane]) {
                dest.set<int>(w, lane, res);
            }
        }
    }

    void
    Call::MagicMaskLower(Wavefront *w)
    {
        const VectorMask &mask = w->get_pred();
        int res = 0;

        for (int lane = 0; lane < VSZ; ++lane) {
            if (mask[lane]) {
                int src_val1 = src1.get<int>(w, lane, 1);

                if (src_val1) {
                    if (lane < (VSZ/2)) {
                        res = res | ((uint32_t)(1) << lane);
                    }
                }
            }
        }

        for (int lane = 0; lane < VSZ; ++lane) {
            if (mask[lane]) {
                dest.set<int>(w, lane, res);
            }
        }
    }
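
    // Example (with VSZ = 64): if only lanes 0 and 3 pass a nonzero flag,
    // every active lane receives res = (1 << 0) | (1 << 3) = 0x9; flags
    // from the upper 32 lanes are reported by MagicMaskUpper instead.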

    void
    Call::MagicMaskUpper(Wavefront *w)
    {
        const VectorMask &mask = w->get_pred();
        int res = 0;
        for (int lane = 0; lane < VSZ; ++lane) {
            if (mask[lane]) {
                int src_val1 = src1.get<int>(w, lane, 1);

                if (src_val1) {
                    if (lane >= (VSZ/2)) {
                        res = res | ((uint32_t)(1) << (lane - (VSZ/2)));
                    }
                }
            }
        }

        for (int lane = 0; lane < VSZ; ++lane) {
            if (mask[lane]) {
                dest.set<int>(w, lane, res);
            }
        }
    }

    void
    Call::MagicJoinWFBar(Wavefront *w)
    {
        const VectorMask &mask = w->get_pred();
        int max_cnt = 0;

        for (int lane = 0; lane < VSZ; ++lane) {
            if (mask[lane]) {
                w->bar_cnt[lane]++;

                if (w->bar_cnt[lane] > max_cnt) {
                    max_cnt = w->bar_cnt[lane];
                }
            }
        }

        if (max_cnt > w->max_bar_cnt) {
            w->max_bar_cnt = max_cnt;
        }
    }

    void
    Call::MagicWaitWFBar(Wavefront *w)
    {
        const VectorMask &mask = w->get_pred();
        int max_cnt = 0;

        for (int lane = 0; lane < VSZ; ++lane) {
            if (mask[lane]) {
                w->bar_cnt[lane]--;
            }

            if (w->bar_cnt[lane] > max_cnt) {
                max_cnt = w->bar_cnt[lane];
            }
        }

        if (max_cnt < w->max_bar_cnt) {
            w->max_bar_cnt = max_cnt;
        }

        w->instructionBuffer.erase(w->instructionBuffer.begin() + 1,
                                   w->instructionBuffer.end());
        if (w->pendingFetch)
            w->dropFetch = true;
    }
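
    // Like Ret::execute() earlier in this commit, waiting at a wavefront
    // barrier squashes everything speculatively fetched behind the barrier
    // (all instructionBuffer entries past the head) and drops any fetch
    // still in flight.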

    void
    Call::MagicPanic(Wavefront *w)
    {
        const VectorMask &mask = w->get_pred();

        for (int lane = 0; lane < VSZ; ++lane) {
            if (mask[lane]) {
                int src_val1 = src1.get<int>(w, lane, 1);
                panic("OpenCL Code failed assertion #%d. Triggered by lane %d",
                      src_val1, lane);
            }
        }
    }

    void
    Call::calcAddr(Wavefront *w, GPUDynInstPtr m)
    {
        // the address is in src1 | src2
        for (int lane = 0; lane < VSZ; ++lane) {
            int src_val1 = src1.get<int>(w, lane, 1);
            int src_val2 = src1.get<int>(w, lane, 2);
            Addr addr = (((Addr) src_val1) << 32) | ((Addr) src_val2);

            m->addr[lane] = addr;
        }
    }
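
    // Example: src_val1 = 0x12 and src_val2 = 0x345678 give
    // addr = (0x12 << 32) | 0x345678 = 0x1200345678. Note that the (Addr)
    // cast sign-extends src_val2, so a low word with its top bit set would
    // smear ones into the high word; the convention assumes callers pass
    // low words that are non-negative as ints.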

    void
    Call::MagicAtomicNRAddGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
    {
        GPUDynInstPtr m = gpuDynInst;

        calcAddr(w, m);

        for (int lane = 0; lane < VSZ; ++lane) {
            ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 3);
        }

        m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET,
                                        Brig::BRIG_ATOMIC_ADD);
        m->m_type = U32::memType;
        m->v_type = U32::vgprType;

        m->exec_mask = w->execMask();
        m->statusBitVector = 0;
        m->equiv = 0;  // atomics don't have an equivalence class operand
        m->n_reg = 1;
        m->memoryOrder = Enums::MEMORY_ORDER_NONE;
        m->scope = Enums::MEMORY_SCOPE_NONE;

        m->simdId = w->simdId;
        m->wfSlotId = w->wfSlotId;
        m->wfDynId = w->wfDynId;
        m->latency.init(&w->computeUnit->shader->tick_cnt);

        m->s_type = SEG_GLOBAL;
        m->pipeId = GLBMEM_PIPE;
        m->latency.set(w->computeUnit->shader->ticks(64));
        w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
        w->outstanding_reqs_wr_gm++;
        w->wr_gm_reqs_in_pipe--;
        w->outstanding_reqs_rd_gm++;
        w->rd_gm_reqs_in_pipe--;
        w->outstanding_reqs++;
        w->mem_reqs_in_pipe--;
    }

    void
    Call::MagicAtomicNRAddGroupU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
    {
        GPUDynInstPtr m = gpuDynInst;
        calcAddr(w, m);

        for (int lane = 0; lane < VSZ; ++lane) {
            ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 1);
        }

        m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET,
                                        Brig::BRIG_ATOMIC_ADD);
        m->m_type = U32::memType;
        m->v_type = U32::vgprType;

        m->exec_mask = w->execMask();
        m->statusBitVector = 0;
        m->equiv = 0;  // atomics don't have an equivalence class operand
        m->n_reg = 1;
        m->memoryOrder = Enums::MEMORY_ORDER_NONE;
        m->scope = Enums::MEMORY_SCOPE_NONE;

        m->simdId = w->simdId;
        m->wfSlotId = w->wfSlotId;
        m->wfDynId = w->wfDynId;
        m->latency.init(&w->computeUnit->shader->tick_cnt);

        m->s_type = SEG_GLOBAL;
        m->pipeId = GLBMEM_PIPE;
        m->latency.set(w->computeUnit->shader->ticks(64));
        w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
        w->outstanding_reqs_wr_gm++;
        w->wr_gm_reqs_in_pipe--;
        w->outstanding_reqs_rd_gm++;
        w->rd_gm_reqs_in_pipe--;
        w->outstanding_reqs++;
        w->mem_reqs_in_pipe--;
    }

    void
    Call::MagicLoadGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
    {
        GPUDynInstPtr m = gpuDynInst;
        // calculate the address
        calcAddr(w, m);

        m->m_op = Enums::MO_LD;
        m->m_type = U32::memType;   //MemDataType::memType;
        m->v_type = U32::vgprType;  //DestDataType::vgprType;

        m->exec_mask = w->execMask();
        m->statusBitVector = 0;
        m->equiv = 0;
        m->n_reg = 1;
        m->memoryOrder = Enums::MEMORY_ORDER_NONE;
        m->scope = Enums::MEMORY_SCOPE_NONE;

        // FIXME
        //m->dst_reg = this->dest.regIndex();

        m->simdId = w->simdId;
        m->wfSlotId = w->wfSlotId;
        m->wfDynId = w->wfDynId;
        m->latency.init(&w->computeUnit->shader->tick_cnt);

        m->s_type = SEG_GLOBAL;
        m->pipeId = GLBMEM_PIPE;
        m->latency.set(w->computeUnit->shader->ticks(1));
        w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
        w->outstanding_reqs_rd_gm++;
        w->rd_gm_reqs_in_pipe--;
        w->outstanding_reqs++;
        w->mem_reqs_in_pipe--;
    }

    void
    Call::MagicXactCasLd(Wavefront *w)
    {
        const VectorMask &mask = w->get_pred();
        int src_val1 = 0;

        for (int lane = 0; lane < VSZ; ++lane) {
            if (mask[lane]) {
                src_val1 = src1.get<int>(w, lane, 1);
                break;
            }
        }

        if (!w->computeUnit->xactCasLoadMap.count(src_val1)) {
            w->computeUnit->xactCasLoadMap[src_val1] = ComputeUnit::waveQueue();
            w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue.clear();
        }

        w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue
            .push_back(ComputeUnit::waveIdentifier(w->simdId, w->wfSlotId));
    }

    void
    Call::MagicMostSigThread(Wavefront *w)
    {
        const VectorMask &mask = w->get_pred();
        unsigned mst = true;

        for (int lane = VSZ - 1; lane >= 0; --lane) {
            if (mask[lane]) {
                dest.set<int>(w, lane, mst);
                mst = false;
            }
        }
    }
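
    // Example: if lanes 5 and 9 are the only active lanes, lane 9 (the
    // most significant active lane) receives 1 and lane 5 receives 0.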
|
||||||
|
|
||||||
|
void
|
||||||
|
Call::MagicMostSigBroadcast(Wavefront *w)
|
||||||
|
{
|
||||||
|
const VectorMask &mask = w->get_pred();
|
||||||
|
int res = 0;
|
||||||
|
bool got_res = false;
|
||||||
|
|
||||||
|
for (int lane = VSZ - 1; lane >= 0; --lane) {
|
||||||
|
if (mask[lane]) {
|
||||||
|
if (!got_res) {
|
||||||
|
res = src1.get<int>(w, lane, 1);
|
||||||
|
got_res = true;
|
||||||
|
}
|
||||||
|
dest.set<int>(w, lane, res);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace HsailISA
449 src/arch/hsail/operand.cc Normal file
@@ -0,0 +1,449 @@
/*
 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */

#include "arch/hsail/operand.hh"

using namespace Brig;

bool
BaseRegOperand::init(unsigned opOffset, const BrigObject *obj,
                     unsigned &maxRegIdx, char _regFileChar)
{
    regFileChar = _regFileChar;
    const BrigOperand *brigOp = obj->getOperand(opOffset);

    if (brigOp->kind != BRIG_KIND_OPERAND_REGISTER)
        return false;

    const BrigOperandRegister *brigRegOp = (const BrigOperandRegister*)brigOp;

    regIdx = brigRegOp->regNum;

    DPRINTF(GPUReg, "Operand: regNum: %d, kind: %d\n", regIdx,
            brigRegOp->regKind);

    maxRegIdx = std::max(maxRegIdx, regIdx);

    return true;
}

void
ListOperand::init(unsigned opOffset, const BrigObject *obj)
{
    const BrigOperand *brigOp = (const BrigOperand*)obj->getOperand(opOffset);

    switch (brigOp->kind) {
      case BRIG_KIND_OPERAND_CODE_LIST:
        {
            const BrigOperandCodeList *opList =
                (const BrigOperandCodeList*)brigOp;

            const Brig::BrigData *oprnd_data =
                obj->getBrigBaseData(opList->elements);

            // Note: for calls, the dest list of operands may have size 0.
            elementCount = oprnd_data->byteCount / 4;

            DPRINTF(GPUReg, "Operand Code List: # elements: %d\n",
                    elementCount);

            for (int i = 0; i < elementCount; ++i) {
                unsigned *data_offset =
                    (unsigned*)obj->getData(opList->elements + 4 * (i + 1));

                const BrigDirectiveVariable *p =
                    (const BrigDirectiveVariable*)obj->
                    getCodeSectionEntry(*data_offset);

                StorageElement *se = obj->currentCode->storageMap->
                    findSymbol(BRIG_SEGMENT_ARG, p);

                assert(se);
                callArgs.push_back(se);
            }
        }
        break;
      default:
        fatal("ListOperand: bad operand kind %d\n", brigOp->kind);
    }
}

std::string
ListOperand::disassemble()
{
    std::string res_str("");

    for (auto it : callArgs) {
        res_str += csprintf("%s ", it->name.c_str());
    }

    return res_str;
}

void
FunctionRefOperand::init(unsigned opOffset, const BrigObject *obj)
{
    const BrigOperand *baseOp = obj->getOperand(opOffset);

    if (baseOp->kind != BRIG_KIND_OPERAND_CODE_REF) {
        fatal("FunctionRefOperand: bad operand kind %d\n", baseOp->kind);
    }

    const BrigOperandCodeRef *brigOp = (const BrigOperandCodeRef*)baseOp;

    const BrigDirectiveExecutable *p =
        (const BrigDirectiveExecutable*)obj->getCodeSectionEntry(brigOp->ref);

    func_name = obj->getString(p->name);
}

std::string
FunctionRefOperand::disassemble()
{
    DPRINTF(GPUReg, "Operand Func-ref name: %s\n", func_name);

    return csprintf("%s", func_name);
}

bool
BaseRegOperand::init_from_vect(unsigned opOffset, const BrigObject *obj,
                               int at, unsigned &maxRegIdx, char _regFileChar)
{
    regFileChar = _regFileChar;
    const BrigOperand *brigOp = obj->getOperand(opOffset);

    if (brigOp->kind != BRIG_KIND_OPERAND_OPERAND_LIST)
        return false;

    const Brig::BrigOperandOperandList *brigRegVecOp =
        (const Brig::BrigOperandOperandList*)brigOp;

    unsigned *data_offset =
        (unsigned*)obj->getData(brigRegVecOp->elements + 4 * (at + 1));

    const BrigOperand *p =
        (const BrigOperand*)obj->getOperand(*data_offset);
    if (p->kind != BRIG_KIND_OPERAND_REGISTER) {
        return false;
    }

    const BrigOperandRegister *brigRegOp = (const BrigOperandRegister*)p;

    regIdx = brigRegOp->regNum;

    DPRINTF(GPUReg, "Operand: regNum: %d, kind: %d\n", regIdx,
            brigRegOp->regKind);

    maxRegIdx = std::max(maxRegIdx, regIdx);

    return true;
}

void
BaseRegOperand::initWithStrOffset(unsigned strOffset, const BrigObject *obj,
                                  unsigned &maxRegIdx, char _regFileChar)
{
    const char *name = obj->getString(strOffset);
    char *endptr;
    regIdx = strtoul(name + 2, &endptr, 10);

    if (name[0] != '$' || name[1] != _regFileChar) {
        fatal("register operand parse error on \"%s\"\n", name);
    }

    maxRegIdx = std::max(maxRegIdx, regIdx);
}

unsigned SRegOperand::maxRegIdx;
unsigned DRegOperand::maxRegIdx;
unsigned CRegOperand::maxRegIdx;

std::string
SRegOperand::disassemble()
{
    return csprintf("$s%d", regIdx);
}

std::string
DRegOperand::disassemble()
{
    return csprintf("$d%d", regIdx);
}

std::string
CRegOperand::disassemble()
{
    return csprintf("$c%d", regIdx);
}

BrigRegOperandInfo
findRegDataType(unsigned opOffset, const BrigObject *obj)
{
    const BrigOperand *baseOp = obj->getOperand(opOffset);

    switch (baseOp->kind) {
      case BRIG_KIND_OPERAND_REGISTER:
        {
            const BrigOperandRegister *op = (BrigOperandRegister*)baseOp;

            return BrigRegOperandInfo((BrigKind16_t)baseOp->kind,
                                      (BrigRegisterKind)op->regKind);
        }
        break;

      case BRIG_KIND_OPERAND_OPERAND_LIST:
        {
            const BrigOperandOperandList *op =
                (BrigOperandOperandList*)baseOp;
            const BrigData *data_p = (BrigData*)obj->getData(op->elements);

            int num_operands = 0;
            BrigRegisterKind reg_kind = (BrigRegisterKind)0;
            for (int offset = 0; offset < data_p->byteCount; offset += 4) {
                const BrigOperand *op_p = (const BrigOperand *)
                    obj->getOperand(((int *)data_p->bytes)[offset/4]);

                if (op_p->kind == BRIG_KIND_OPERAND_REGISTER) {
                    const BrigOperandRegister *brigRegOp =
                        (const BrigOperandRegister*)op_p;
                    reg_kind = (BrigRegisterKind)brigRegOp->regKind;
                } else if (op_p->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) {
                    uint16_t num_bytes =
                        ((Brig::BrigOperandConstantBytes*)op_p)->base.byteCount
                        - sizeof(BrigBase);
                    if (num_bytes == sizeof(uint32_t)) {
                        reg_kind = BRIG_REGISTER_KIND_SINGLE;
                    } else if (num_bytes == sizeof(uint64_t)) {
                        reg_kind = BRIG_REGISTER_KIND_DOUBLE;
                    } else {
                        fatal("OperandList: bad operand size %d\n", num_bytes);
                    }
                } else {
                    fatal("OperandList: bad operand kind %d\n", op_p->kind);
                }

                num_operands++;
            }
            assert(baseOp->kind == BRIG_KIND_OPERAND_OPERAND_LIST);

            return BrigRegOperandInfo((BrigKind16_t)baseOp->kind, reg_kind);
        }
        break;

      case BRIG_KIND_OPERAND_ADDRESS:
        {
            const BrigOperandAddress *op = (BrigOperandAddress*)baseOp;

            if (!op->reg) {
                BrigType type = BRIG_TYPE_NONE;

                if (op->symbol) {
                    const BrigDirective *dir = (BrigDirective*)
                        obj->getCodeSectionEntry(op->symbol);

                    assert(dir->kind == BRIG_KIND_DIRECTIVE_VARIABLE);

                    const BrigDirectiveVariable *sym =
                        (const BrigDirectiveVariable*)dir;

                    type = (BrigType)sym->type;
                }
                return BrigRegOperandInfo(BRIG_KIND_OPERAND_ADDRESS,
                                          (BrigType)type);
            } else {
                const BrigOperandAddress *b = (const BrigOperandAddress*)baseOp;
                const BrigOperand *reg = obj->getOperand(b->reg);
                const BrigOperandRegister *rop = (BrigOperandRegister*)reg;

                return BrigRegOperandInfo(BRIG_KIND_OPERAND_REGISTER,
                                          (BrigRegisterKind)rop->regKind);
            }
        }
        break;

      default:
        fatal("AddrOperand: bad operand kind %d\n", baseOp->kind);
        break;
    }
}

void
AddrOperandBase::parseAddr(const BrigOperandAddress *op, const BrigObject *obj)
{
    assert(op->base.kind == BRIG_KIND_OPERAND_ADDRESS);

    const BrigDirective *d =
        (BrigDirective*)obj->getCodeSectionEntry(op->symbol);

    assert(d->kind == BRIG_KIND_DIRECTIVE_VARIABLE);
    const BrigDirectiveVariable *sym = (BrigDirectiveVariable*)d;
    name = obj->getString(sym->name);

    if (sym->segment != BRIG_SEGMENT_ARG) {
        storageElement =
            obj->currentCode->storageMap->findSymbol(sym->segment, name);
        assert(storageElement);
        offset = 0;
    } else {
        // sym->name does not work for BRIG_SEGMENT_ARG in the following case:
        //
        //     void foo(int a);
        //     void bar(double a);
        //
        //     foo(...) --> arg_u32 %param_p0;
        //                  st_arg_u32 $s0, [%param_p0];
        //                  call &foo (%param_p0);
        //     bar(...) --> arg_f64 %param_p0;
        //                  st_arg_u64 $d0, [%param_p0];
        //                  call &bar (%param_p0);
        //
        // Both functions use the same variable name (param_p0)!
        //
        // Maybe this is a bug in the compiler (I don't know).
        //
        // Solution:
        // Use the directive pointer (BrigDirectiveVariable) to differentiate
        // the two versions of param_p0.
        //
        // Note this solution is kind of stupid, because we are pulling stuff
        // out of the brig binary via the directive pointer and putting it into
        // the symbol table, but now we are indexing the symbol table by the
        // brig directive pointer! It makes the symbol table sort of pointless.
        // But I don't want to mess with the rest of the infrastructure, so
        // let's go with this for now.
        //
        // When we update the compiler again, we should see if this problem
        // goes away. If so, we can fold some of this functionality into the
        // code for kernel arguments. If not, maybe we can index the symbol
        // name on a hash of the variable AND function name.
        storageElement = obj->currentCode->
            storageMap->findSymbol((Brig::BrigSegment)sym->segment, sym);

        assert(storageElement);
    }
}

uint64_t
AddrOperandBase::calcUniformBase()
{
    // start with offset, will be 0 if not specified
    uint64_t address = offset;

    // add in symbol value if specified
    if (storageElement) {
        address += storageElement->offset;
    }

    return address;
}

std::string
AddrOperandBase::disassemble(std::string reg_disassembly)
{
    std::string disasm;

    if (offset || reg_disassembly != "") {
        disasm += "[";

        if (reg_disassembly != "") {
            disasm += reg_disassembly;

            if (offset > 0) {
                disasm += "+";
            }
        }

        if (offset) {
            disasm += csprintf("%d", offset);
        }

        disasm += "]";
    } else if (name) {
        disasm += csprintf("[%s]", name);
    }

    return disasm;
}

void
NoRegAddrOperand::init(unsigned opOffset, const BrigObject *obj)
{
    const BrigOperand *baseOp = obj->getOperand(opOffset);

    if (baseOp->kind == BRIG_KIND_OPERAND_ADDRESS) {
        BrigOperandAddress *addrOp = (BrigOperandAddress*)baseOp;
        parseAddr(addrOp, obj);
        offset = (uint64_t(addrOp->offset.hi) << 32) |
                  uint64_t(addrOp->offset.lo);
    } else {
        fatal("NoRegAddrOperand: bad operand kind %d\n", baseOp->kind);
    }
}

std::string
NoRegAddrOperand::disassemble()
{
    return AddrOperandBase::disassemble(std::string(""));
}

void
LabelOperand::init(unsigned opOffset, const BrigObject *obj)
{
    const BrigOperandCodeRef *op =
        (const BrigOperandCodeRef*)obj->getOperand(opOffset);

    assert(op->base.kind == BRIG_KIND_OPERAND_CODE_REF);

    const BrigDirective *dir =
        (const BrigDirective*)obj->getCodeSectionEntry(op->ref);

    assert(dir->kind == BRIG_KIND_DIRECTIVE_LABEL);
    label = obj->currentCode->refLabel((BrigDirectiveLabel*)dir, obj);
}

uint32_t
LabelOperand::getTarget(Wavefront *w, int lane)
{
    return label->get();
}

std::string
LabelOperand::disassemble()
{
    return label->name;
}
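BaseRegOperand::initWithStrOffset above recovers a register index from a textual name such as "$s12". A small Python sketch of the same parse (the function name is illustrative):

    def parse_reg(name, reg_file_char):
        # "$s12" with reg_file_char 's' -> 12; mirrors strtoul(name + 2, ...)
        if name[0] != '$' or name[1] != reg_file_char:
            raise ValueError('register operand parse error on "%s"' % name)
        return int(name[2:], 10)

    assert parse_reg('$s12', 's') == 12
    assert parse_reg('$d3', 'd') == 3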
768 src/arch/hsail/operand.hh Normal file
@@ -0,0 +1,768 @@
/*
 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */

#ifndef __ARCH_HSAIL_OPERAND_HH__
#define __ARCH_HSAIL_OPERAND_HH__

/**
 * @file operand.hh
 *
 * Defines classes encapsulating HSAIL instruction operands.
 */

#include <string>

#include "arch/hsail/Brig.h"
#include "base/trace.hh"
#include "base/types.hh"
#include "debug/GPUReg.hh"
#include "enums/RegisterType.hh"
#include "gpu-compute/brig_object.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/hsail_code.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"

class Label;
class StorageElement;

class BaseOperand
{
  public:
    Enums::RegisterType registerType;
    uint32_t regOperandSize;
    BaseOperand() { registerType = Enums::RT_NONE; regOperandSize = 0; }
    bool isVectorRegister() { return registerType == Enums::RT_VECTOR; }
    bool isScalarRegister() { return registerType == Enums::RT_SCALAR; }
    bool isCondRegister() { return registerType == Enums::RT_CONDITION; }
    unsigned int regIndex() { return 0; }
    uint32_t opSize() { return regOperandSize; }
    virtual ~BaseOperand() { }
};

class BrigRegOperandInfo
{
  public:
    Brig::BrigKind16_t kind;
    Brig::BrigType type;
    Brig::BrigRegisterKind regKind;

    BrigRegOperandInfo(Brig::BrigKind16_t _kind,
                       Brig::BrigRegisterKind _regKind)
        : kind(_kind), regKind(_regKind)
    {
    }

    BrigRegOperandInfo(Brig::BrigKind16_t _kind, Brig::BrigType _type)
        : kind(_kind), type(_type)
    {
    }

    BrigRegOperandInfo() : kind(Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES),
                           type(Brig::BRIG_TYPE_NONE)
    {
    }
};

BrigRegOperandInfo findRegDataType(unsigned opOffset, const BrigObject *obj);

class BaseRegOperand : public BaseOperand
{
  public:
    unsigned regIdx;
    char regFileChar;

    bool init(unsigned opOffset, const BrigObject *obj,
              unsigned &maxRegIdx, char _regFileChar);

    bool init_from_vect(unsigned opOffset, const BrigObject *obj, int at,
                        unsigned &maxRegIdx, char _regFileChar);

    void initWithStrOffset(unsigned strOffset, const BrigObject *obj,
                           unsigned &maxRegIdx, char _regFileChar);
    unsigned int regIndex() { return regIdx; }
};

class SRegOperand : public BaseRegOperand
{
  public:
    static unsigned maxRegIdx;

    bool
    init(unsigned opOffset, const BrigObject *obj)
    {
        regOperandSize = sizeof(uint32_t);
        registerType = Enums::RT_VECTOR;

        return BaseRegOperand::init(opOffset, obj, maxRegIdx, 's');
    }

    bool
    init_from_vect(unsigned opOffset, const BrigObject *obj, int at)
    {
        regOperandSize = sizeof(uint32_t);
        registerType = Enums::RT_VECTOR;

        return BaseRegOperand::init_from_vect(opOffset, obj, at, maxRegIdx,
                                              's');
    }

    void
    initWithStrOffset(unsigned strOffset, const BrigObject *obj)
    {
        regOperandSize = sizeof(uint32_t);
        registerType = Enums::RT_VECTOR;

        return BaseRegOperand::initWithStrOffset(strOffset, obj, maxRegIdx,
                                                 's');
    }

    template<typename OperandType>
    OperandType
    get(Wavefront *w, int lane)
    {
        assert(sizeof(OperandType) <= sizeof(uint32_t));
        assert(regIdx < w->maxSpVgprs);
        // if OperandType is smaller than 32-bit, we truncate the value
        OperandType ret;
        uint32_t vgprIdx;

        switch (sizeof(OperandType)) {
          case 1: // 1 byte operand
            vgprIdx = w->remap(regIdx, 1, 1);
            ret = (w->computeUnit->vrf[w->simdId]->
                    read<uint32_t>(vgprIdx, lane)) & 0xff;
            break;
          case 2: // 2 byte operand
            vgprIdx = w->remap(regIdx, 2, 1);
            ret = (w->computeUnit->vrf[w->simdId]->
                    read<uint32_t>(vgprIdx, lane)) & 0xffff;
            break;
          case 4: // 4 byte operand
            vgprIdx = w->remap(regIdx, sizeof(OperandType), 1);
            ret = w->computeUnit->vrf[w->simdId]->
                read<OperandType>(vgprIdx, lane);
            break;
          default:
            panic("Bad OperandType\n");
            break;
        }

        return (OperandType)ret;
    }

    // special get method for compatibility with LabelOperand
    uint32_t
    getTarget(Wavefront *w, int lane)
    {
        return get<uint32_t>(w, lane);
    }

    template<typename OperandType>
    void set(Wavefront *w, int lane, OperandType &val);
    std::string disassemble();
};

template<typename OperandType>
void
SRegOperand::set(Wavefront *w, int lane, OperandType &val)
{
    DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $s%d <- %d\n",
            w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx, val);

    assert(sizeof(OperandType) == sizeof(uint32_t));
    assert(regIdx < w->maxSpVgprs);
    uint32_t vgprIdx = w->remap(regIdx, sizeof(OperandType), 1);
    w->computeUnit->vrf[w->simdId]->write<OperandType>(vgprIdx, val, lane);
}

template<>
inline void
SRegOperand::set(Wavefront *w, int lane, uint64_t &val)
{
    DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $s%d <- %d\n",
            w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx, val);

    assert(regIdx < w->maxSpVgprs);
    uint32_t vgprIdx = w->remap(regIdx, sizeof(uint32_t), 1);
    w->computeUnit->vrf[w->simdId]->write<uint32_t>(vgprIdx, val, lane);
}
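SRegOperand::get reads sub-word operands from a full 32-bit VGPR and masks them down to the requested width. The same truncation rule in a few lines of Python (purely illustrative):

    def read_narrow(vgpr_value, nbytes):
        # values narrower than 32 bits keep only the low-order bytes
        masks = {1: 0xff, 2: 0xffff, 4: 0xffffffff}
        return vgpr_value & masks[nbytes]

    assert read_narrow(0x12345678, 1) == 0x78
    assert read_narrow(0x12345678, 2) == 0x5678
    assert read_narrow(0x12345678, 4) == 0x12345678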
class DRegOperand : public BaseRegOperand
{
  public:
    static unsigned maxRegIdx;

    bool
    init(unsigned opOffset, const BrigObject *obj)
    {
        regOperandSize = sizeof(uint64_t);
        registerType = Enums::RT_VECTOR;

        return BaseRegOperand::init(opOffset, obj, maxRegIdx, 'd');
    }

    bool
    init_from_vect(unsigned opOffset, const BrigObject *obj, int at)
    {
        regOperandSize = sizeof(uint64_t);
        registerType = Enums::RT_VECTOR;

        return BaseRegOperand::init_from_vect(opOffset, obj, at, maxRegIdx,
                                              'd');
    }

    void
    initWithStrOffset(unsigned strOffset, const BrigObject *obj)
    {
        regOperandSize = sizeof(uint64_t);
        registerType = Enums::RT_VECTOR;

        return BaseRegOperand::initWithStrOffset(strOffset, obj, maxRegIdx,
                                                 'd');
    }

    template<typename OperandType>
    OperandType
    get(Wavefront *w, int lane)
    {
        assert(sizeof(OperandType) <= sizeof(uint64_t));
        // TODO: this check is valid only for HSAIL
        assert(regIdx < w->maxDpVgprs);
        uint32_t vgprIdx = w->remap(regIdx, sizeof(OperandType), 1);

        return w->computeUnit->vrf[w->simdId]->read<OperandType>(vgprIdx,
                                                                 lane);
    }

    template<typename OperandType>
    void
    set(Wavefront *w, int lane, OperandType &val)
    {
        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $d%d <- %d\n",
                w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx,
                val);

        assert(sizeof(OperandType) <= sizeof(uint64_t));
        // TODO: this check is valid only for HSAIL
        assert(regIdx < w->maxDpVgprs);
        uint32_t vgprIdx = w->remap(regIdx, sizeof(OperandType), 1);
        w->computeUnit->vrf[w->simdId]->write<OperandType>(vgprIdx, val, lane);
    }

    std::string disassemble();
};

class CRegOperand : public BaseRegOperand
{
  public:
    static unsigned maxRegIdx;

    bool
    init(unsigned opOffset, const BrigObject *obj)
    {
        regOperandSize = sizeof(uint8_t);
        registerType = Enums::RT_CONDITION;

        return BaseRegOperand::init(opOffset, obj, maxRegIdx, 'c');
    }

    bool
    init_from_vect(unsigned opOffset, const BrigObject *obj, int at)
    {
        regOperandSize = sizeof(uint8_t);
        registerType = Enums::RT_CONDITION;

        return BaseRegOperand::init_from_vect(opOffset, obj, at, maxRegIdx,
                                              'c');
    }

    void
    initWithStrOffset(unsigned strOffset, const BrigObject *obj)
    {
        regOperandSize = sizeof(uint8_t);
        registerType = Enums::RT_CONDITION;

        return BaseRegOperand::initWithStrOffset(strOffset, obj, maxRegIdx,
                                                 'c');
    }

    template<typename OperandType>
    OperandType
    get(Wavefront *w, int lane)
    {
        assert(regIdx < w->condRegState->numRegs());

        return w->condRegState->read<OperandType>((int)regIdx, lane);
    }

    template<typename OperandType>
    void
    set(Wavefront *w, int lane, OperandType &val)
    {
        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $c%d <- %d\n",
                w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx,
                val);

        assert(regIdx < w->condRegState->numRegs());
        w->condRegState->write<OperandType>(regIdx, lane, val);
    }

    std::string disassemble();
};

template<typename T>
class ImmOperand : public BaseOperand
{
  public:
    T bits;

    bool init(unsigned opOffset, const BrigObject *obj);
    bool init_from_vect(unsigned opOffset, const BrigObject *obj, int at);
    std::string disassemble();

    template<typename OperandType>
    OperandType
    get()
    {
        assert(sizeof(OperandType) <= sizeof(T));

        return *(OperandType*)&bits;
    }

    // This version of get() takes a WF* and a lane id for
    // compatibility with the register-based get() methods.
    template<typename OperandType>
    OperandType
    get(Wavefront *w, int lane)
    {
        return get<OperandType>();
    }
};

template<typename T>
bool
ImmOperand<T>::init(unsigned opOffset, const BrigObject *obj)
{
    const Brig::BrigOperand *brigOp = obj->getOperand(opOffset);

    switch (brigOp->kind) {
      // this is an immediate operand
      case Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES:
        {
            DPRINTF(GPUReg, "sizeof(T): %lu, byteCount: %d\n", sizeof(T),
                    brigOp->byteCount);

            auto cbptr = (Brig::BrigOperandConstantBytes*)brigOp;

            bits = *((T*)(obj->getData(cbptr->bytes + 4)));

            return true;
        }
        break;

      case Brig::BRIG_KIND_OPERAND_WAVESIZE:
        bits = VSZ;
        return true;

      default:
        return false;
    }
}

template <typename T>
bool
ImmOperand<T>::init_from_vect(unsigned opOffset, const BrigObject *obj, int at)
{
    const Brig::BrigOperand *brigOp = obj->getOperand(opOffset);

    if (brigOp->kind != Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
        return false;
    }

    const Brig::BrigOperandOperandList *brigVecOp =
        (const Brig::BrigOperandOperandList *)brigOp;

    unsigned *data_offset =
        (unsigned *)obj->getData(brigVecOp->elements + 4 * (at + 1));

    const Brig::BrigOperand *p =
        (const Brig::BrigOperand *)obj->getOperand(*data_offset);

    if (p->kind != Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) {
        return false;
    }

    return init(*data_offset, obj);
}

template<typename T>
std::string
ImmOperand<T>::disassemble()
{
    return csprintf("0x%08x", bits);
}
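ImmOperand<T>::init copies sizeof(T) raw bytes out of the BRIG data section, skipping a 4-byte length prefix. A Python sketch of that decode under the assumption of little-endian data, using struct (the function name and buffer layout are illustrative):

    import struct

    def decode_imm(brig_data, fmt='<I'):
        # brig_data: 4-byte byteCount prefix followed by the constant's raw
        # bytes, as stored in a BRIG_KIND_OPERAND_CONSTANT_BYTES entry
        (byte_count,) = struct.unpack_from('<I', brig_data, 0)
        assert struct.calcsize(fmt) <= byte_count
        return struct.unpack_from(fmt, brig_data, 4)[0]

    # a 32-bit immediate with value 0x2a
    assert decode_imm(struct.pack('<II', 4, 0x2a)) == 0x2a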
template<typename RegOperand, typename T>
class RegOrImmOperand : public BaseOperand
{
  private:
    bool is_imm;

  public:
    void setImm(const bool value) { is_imm = value; }

    ImmOperand<T> imm_op;
    RegOperand reg_op;

    RegOrImmOperand() { is_imm = false; }
    void init(unsigned opOffset, const BrigObject *obj);
    void init_from_vect(unsigned opOffset, const BrigObject *obj, int at);
    std::string disassemble();

    template<typename OperandType>
    OperandType
    get(Wavefront *w, int lane)
    {
        return is_imm ? imm_op.template get<OperandType>() :
                        reg_op.template get<OperandType>(w, lane);
    }

    uint32_t
    opSize()
    {
        if (!is_imm) {
            return reg_op.opSize();
        }

        return 0;
    }

    bool
    isVectorRegister()
    {
        if (!is_imm) {
            return reg_op.registerType == Enums::RT_VECTOR;
        }
        return false;
    }

    bool
    isCondRegister()
    {
        if (!is_imm) {
            return reg_op.registerType == Enums::RT_CONDITION;
        }

        return false;
    }

    bool
    isScalarRegister()
    {
        if (!is_imm) {
            return reg_op.registerType == Enums::RT_SCALAR;
        }

        return false;
    }

    unsigned int
    regIndex()
    {
        if (!is_imm) {
            return reg_op.regIndex();
        }
        return 0;
    }
};

template<typename RegOperand, typename T>
void
RegOrImmOperand<RegOperand, T>::init(unsigned opOffset, const BrigObject *obj)
{
    is_imm = false;

    if (reg_op.init(opOffset, obj)) {
        return;
    }

    if (imm_op.init(opOffset, obj)) {
        is_imm = true;
        return;
    }

    fatal("RegOrImmOperand::init(): bad operand kind %d\n",
          obj->getOperand(opOffset)->kind);
}

template<typename RegOperand, typename T>
void
RegOrImmOperand<RegOperand, T>::init_from_vect(unsigned opOffset,
                                               const BrigObject *obj, int at)
{
    if (reg_op.init_from_vect(opOffset, obj, at)) {
        is_imm = false;

        return;
    }

    if (imm_op.init_from_vect(opOffset, obj, at)) {
        is_imm = true;

        return;
    }

    fatal("RegOrImmOperand::init_from_vect(): bad operand kind %d\n",
          obj->getOperand(opOffset)->kind);
}

template<typename RegOperand, typename T>
std::string
RegOrImmOperand<RegOperand, T>::disassemble()
{
    return is_imm ? imm_op.disassemble() : reg_op.disassemble();
}

typedef RegOrImmOperand<SRegOperand, uint32_t> SRegOrImmOperand;
typedef RegOrImmOperand<DRegOperand, uint64_t> DRegOrImmOperand;
typedef RegOrImmOperand<CRegOperand, bool> CRegOrImmOperand;
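RegOrImmOperand resolves an operand by trying the register form first and falling back to the immediate form. The control flow, sketched in Python (illustrative names, not the simulator's API):

    def init_reg_or_imm(reg_op, imm_op, op_offset, obj):
        # returns is_imm; mirrors RegOrImmOperand::init's register-first order
        if reg_op.init(op_offset, obj):
            return False
        if imm_op.init(op_offset, obj):
            return True
        raise ValueError('bad operand kind')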
class AddrOperandBase : public BaseOperand
{
  protected:
    // helper function for init()
    void parseAddr(const Brig::BrigOperandAddress *op, const BrigObject *obj);

    // helper function for disassemble()
    std::string disassemble(std::string reg_disassembly);
    uint64_t calcUniformBase();

  public:
    virtual void calcVector(Wavefront *w, uint64_t *addrVec) = 0;
    virtual uint64_t calcLane(Wavefront *w, int lane=0) = 0;

    uint64_t offset;
    const char *name = nullptr;
    StorageElement *storageElement;
};

template<typename RegOperandType>
class RegAddrOperand : public AddrOperandBase
{
  public:
    RegOperandType reg;
    void init(unsigned opOffset, const BrigObject *obj);
    uint64_t calcUniform();
    void calcVector(Wavefront *w, uint64_t *addrVec);
    uint64_t calcLane(Wavefront *w, int lane=0);
    uint32_t opSize() { return reg.opSize(); }
    bool isVectorRegister() { return reg.registerType == Enums::RT_VECTOR; }
    bool isCondRegister() { return reg.registerType == Enums::RT_CONDITION; }
    bool isScalarRegister() { return reg.registerType == Enums::RT_SCALAR; }
    unsigned int regIndex() { return reg.regIndex(); }
    std::string disassemble();
};

template<typename RegOperandType>
void
RegAddrOperand<RegOperandType>::init(unsigned opOffset, const BrigObject *obj)
{
    using namespace Brig;

    const BrigOperand *baseOp = obj->getOperand(opOffset);

    switch (baseOp->kind) {
      case BRIG_KIND_OPERAND_ADDRESS:
        {
            const BrigOperandAddress *op = (BrigOperandAddress*)baseOp;
            storageElement = nullptr;

            offset = (uint64_t(op->offset.hi) << 32) |
                      uint64_t(op->offset.lo);
            reg.init(op->reg, obj);

            if (reg.regFileChar == 's') {
                reg.regOperandSize = sizeof(uint32_t);
                registerType = Enums::RT_VECTOR;
            } else if (reg.regFileChar == 'd') {
                reg.regOperandSize = sizeof(uint64_t);
                registerType = Enums::RT_VECTOR;
            }
        }
        break;

      default:
        fatal("RegAddrOperand: bad operand kind %d\n", baseOp->kind);
        break;
    }
}

template<typename RegOperandType>
uint64_t
RegAddrOperand<RegOperandType>::calcUniform()
{
    fatal("can't do calcUniform() on register-based address\n");

    return 0;
}

template<typename RegOperandType>
void
RegAddrOperand<RegOperandType>::calcVector(Wavefront *w, uint64_t *addrVec)
{
    Addr address = calcUniformBase();

    for (int lane = 0; lane < VSZ; ++lane) {
        if (w->execMask(lane)) {
            if (reg.regFileChar == 's') {
                addrVec[lane] = address + reg.template get<uint32_t>(w, lane);
            } else {
                addrVec[lane] = address + reg.template get<Addr>(w, lane);
            }
        }
    }
}

template<typename RegOperandType>
uint64_t
RegAddrOperand<RegOperandType>::calcLane(Wavefront *w, int lane)
{
    Addr address = calcUniformBase();

    return address + reg.template get<Addr>(w, lane);
}

template<typename RegOperandType>
std::string
RegAddrOperand<RegOperandType>::disassemble()
{
    return AddrOperandBase::disassemble(reg.disassemble());
}

typedef RegAddrOperand<SRegOperand> SRegAddrOperand;
typedef RegAddrOperand<DRegOperand> DRegAddrOperand;

class NoRegAddrOperand : public AddrOperandBase
{
  public:
    void init(unsigned opOffset, const BrigObject *obj);
    uint64_t calcUniform();
    void calcVector(Wavefront *w, uint64_t *addrVec);
    uint64_t calcLane(Wavefront *w, int lane=0);
    std::string disassemble();
};

inline uint64_t
NoRegAddrOperand::calcUniform()
{
    return AddrOperandBase::calcUniformBase();
}

inline uint64_t
NoRegAddrOperand::calcLane(Wavefront *w, int lane)
{
    return calcUniform();
}

inline void
NoRegAddrOperand::calcVector(Wavefront *w, uint64_t *addrVec)
{
    uint64_t address = calcUniformBase();

    for (int lane = 0; lane < VSZ; ++lane)
        addrVec[lane] = address;
}

class LabelOperand : public BaseOperand
{
  public:
    Label *label;

    void init(unsigned opOffset, const BrigObject *obj);
    std::string disassemble();

    // special get method for compatibility with SRegOperand
    uint32_t getTarget(Wavefront *w, int lane);
};

class ListOperand : public BaseOperand
{
  public:
    int elementCount;
    std::vector<StorageElement*> callArgs;

    int
    getSrcOperand(int idx)
    {
        DPRINTF(GPUReg, "getSrcOperand, idx: %d, sz_args: %d\n", idx,
                callArgs.size());

        return callArgs.at(idx)->offset;
    }

    void init(unsigned opOffset, const BrigObject *obj);

    std::string disassemble();

    template<typename OperandType>
    OperandType
    get(Wavefront *w, int lane, int arg_idx)
    {
        return w->readCallArgMem<OperandType>(lane, getSrcOperand(arg_idx));
    }

    template<typename OperandType>
    void
    set(Wavefront *w, int lane, OperandType val)
    {
        w->writeCallArgMem<OperandType>(lane, getSrcOperand(0), val);
        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: arg[%d] <- %d\n",
                w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane,
                getSrcOperand(0), val);
    }
};

class FunctionRefOperand : public BaseOperand
{
  public:
    const char *func_name;

    void init(unsigned opOffset, const BrigObject *obj);
    std::string disassemble();
};

#endif // __ARCH_HSAIL_OPERAND_HH__
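The address operands above split an effective address into a uniform part (the instruction's offset plus the symbol's offset) and an optional per-lane register part: NoRegAddrOperand broadcasts the uniform base, while RegAddrOperand adds each active lane's register value. In Python terms (a sketch, not the simulator's API):

    def calc_vector(offset, sym_offset, exec_mask, reg_vals=None):
        base = offset + sym_offset  # calcUniformBase()
        if reg_vals is None:        # NoRegAddrOperand: same address everywhere
            return [base] * len(exec_mask)
        # RegAddrOperand: only lanes with the exec bit set get an address
        return [base + r if m else None
                for m, r in zip(exec_mask, reg_vals)]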
310 src/gpu-compute/GPU.py Normal file
@@ -0,0 +1,310 @@
#
# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Steve Reinhardt
#

from ClockedObject import ClockedObject
from Device import DmaDevice
from m5.defines import buildEnv
from m5.params import *
from m5.proxy import *
from m5.SimObject import SimObject
from MemObject import MemObject
from Process import EmulatedDriver
from Bridge import Bridge
from LdsState import LdsState

class PrefetchType(Enum): vals = [
    'PF_CU',
    'PF_PHASE',
    'PF_WF',
    'PF_STRIDE',
    'PF_END',
    ]

class VectorRegisterFile(SimObject):
    type = 'VectorRegisterFile'
    cxx_class = 'VectorRegisterFile'
    cxx_header = 'gpu-compute/vector_register_file.hh'

    simd_id = Param.Int(0, 'SIMD ID associated with this VRF')
    num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD')
    min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF')

class Wavefront(SimObject):
    type = 'Wavefront'
    cxx_class = 'Wavefront'
    cxx_header = 'gpu-compute/wavefront.hh'

    simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)')
    wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)')

class ComputeUnit(MemObject):
    type = 'ComputeUnit'
    cxx_class = 'ComputeUnit'
    cxx_header = 'gpu-compute/compute_unit.hh'

    wavefronts = VectorParam.Wavefront('Number of wavefronts')
    wfSize = Param.Int(64, 'Wavefront size (in work items)')
    num_SIMDs = Param.Int(4, 'number of SIMD units per CU')

    spbypass_pipe_length = Param.Int(4, 'vector ALU Single Precision bypass '
                                     'latency')

    dpbypass_pipe_length = Param.Int(8, 'vector ALU Double Precision bypass '
                                     'latency')

    issue_period = Param.Int(4, 'number of cycles per issue period')
    num_global_mem_pipes = Param.Int(1, 'number of global memory pipes per CU')
    num_shared_mem_pipes = Param.Int(1, 'number of shared memory pipes per CU')
    n_wf = Param.Int(1, 'Number of wavefront slots per SIMD')
    mem_req_latency = Param.Int(9, "Latency for request from the cu to ruby. "
                                "Represents the pipeline to reach the TCP and "
                                "is specified in GPU clock cycles")
    mem_resp_latency = Param.Int(9, "Latency for responses from ruby to the "
                                 "cu. Represents the pipeline between the TCP "
                                 "and cu as well as TCP data array access. "
                                 "Specified in GPU clock cycles")
    system = Param.System(Parent.any, "system object")
    cu_id = Param.Int('CU id')
    vrf_to_coalescer_bus_width = Param.Int(32, "VRF->Coalescer data bus width "
                                           "in bytes")
    coalescer_to_vrf_bus_width = Param.Int(32, "Coalescer->VRF data bus width "
                                           "in bytes")

    memory_port = VectorMasterPort("Port to the memory system")
    translation_port = VectorMasterPort('Port to the TLB hierarchy')
    sqc_port = MasterPort("Port to the SQC (I-cache)")
    sqc_tlb_port = MasterPort("Port to the TLB for the SQC (I-cache)")
    perLaneTLB = Param.Bool(False, "enable per-lane TLB")
    prefetch_depth = Param.Int(0, "Number of prefetches triggered at a time "
                               "(0 turns off prefetching)")
    prefetch_stride = Param.Int(1, "Fixed Prefetch Stride (1 means next-page)")
    prefetch_prev_type = Param.PrefetchType('PF_PHASE', "Prefetch the stride "
                                            "from last mem req in lane of "
                                            "CU|Phase|Wavefront")
    execPolicy = Param.String("OLDEST-FIRST", "WF execution selection policy")
    xactCasMode = Param.Bool(False, "Behavior of xact_cas_load magic instr.")
    debugSegFault = Param.Bool(False, "enable debugging GPU seg faults")
    functionalTLB = Param.Bool(False, "Assume TLB causes no delay")

    localMemBarrier = Param.Bool(False, "Assume Barriers do not wait on "
                                 "kernel end")

    countPages = Param.Bool(False, "Generate per-CU file of all pages touched "
                            "and how many times")
    global_mem_queue_size = Param.Int(256, "Number of entries in the global "
                                      "memory pipeline's queues")
    local_mem_queue_size = Param.Int(256, "Number of entries in the local "
                                     "memory pipeline's queues")
    ldsBus = Bridge() # the bridge between the CU and its LDS
    ldsPort = MasterPort("The port that goes to the LDS")
    localDataStore = Param.LdsState("the LDS for this CU")

    vector_register_file = VectorParam.VectorRegisterFile("Vector register "
                                                          "file")
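A minimal configuration sketch (not part of this commit) showing how these parameters might be populated; the counts and the surrounding script context are assumptions:

    num_simds, n_wf = 4, 1
    cu = ComputeUnit(cu_id=0, num_SIMDs=num_simds, n_wf=n_wf)
    cu.wavefronts = [Wavefront(simdId=s, wf_slot_id=w)
                     for s in range(num_simds) for w in range(n_wf)]
    cu.vector_register_file = [VectorRegisterFile(simd_id=s)
                               for s in range(num_simds)]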
class Shader(ClockedObject):
    type = 'Shader'
    cxx_class = 'Shader'
    cxx_header = 'gpu-compute/shader.hh'

    CUs = VectorParam.ComputeUnit('Number of compute units')
    n_wf = Param.Int(1, 'Number of wavefront slots per SIMD')
    impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets into
                                         ruby at kernel boundaries""")
    separate_acquire_release = Param.Bool(False,
        """Do ld_acquire/st_release generate separate requests for the
        acquire and release?""")
    globalmem = Param.MemorySize('64kB', 'Memory size')
    timing = Param.Bool(False, 'timing memory accesses')

    cpu_pointer = Param.BaseCPU(NULL, "pointer to base CPU")
    translation = Param.Bool(False, "address translation")

class ClDriver(EmulatedDriver):
    type = 'ClDriver'
    cxx_header = 'gpu-compute/cl_driver.hh'
    codefile = VectorParam.String('code file name(s)')

class GpuDispatcher(DmaDevice):
    type = 'GpuDispatcher'
    cxx_header = 'gpu-compute/dispatcher.hh'
    # put at 8GB line for now
    pio_addr = Param.Addr(0x200000000, "Device Address")
    pio_latency = Param.Latency('1ns', "Programmed IO latency")
    shader_pointer = Param.Shader('pointer to shader')
    translation_port = MasterPort('Port to the dispatcher TLB')
    cpu = Param.BaseCPU("CPU to wake up on kernel completion")

    cl_driver = Param.ClDriver('pointer to driver')
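And a sketch of how these top-level pieces could be tied together; system.cpu, the driver file name, and the kernel binary name are placeholders, not values from this commit:

    driver = ClDriver(filename='hsa', codefile=['kernel.brig'])
    shader = Shader(CUs=[cu], n_wf=n_wf, timing=True)
    dispatcher = GpuDispatcher(shader_pointer=shader, cl_driver=driver,
                               cpu=system.cpu)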
class OpType(Enum): vals = [
    'OT_NULL',
    'OT_ALU',
    'OT_SPECIAL',
    'OT_GLOBAL_READ',
    'OT_GLOBAL_WRITE',
    'OT_GLOBAL_ATOMIC',
    'OT_GLOBAL_HIST',
    'OT_GLOBAL_LDAS',
    'OT_SHARED_READ',
    'OT_SHARED_WRITE',
    'OT_SHARED_ATOMIC',
    'OT_SHARED_HIST',
    'OT_SHARED_LDAS',
    'OT_PRIVATE_READ',
    'OT_PRIVATE_WRITE',
    'OT_PRIVATE_ATOMIC',
    'OT_PRIVATE_HIST',
    'OT_PRIVATE_LDAS',
    'OT_SPILL_READ',
    'OT_SPILL_WRITE',
    'OT_SPILL_ATOMIC',
    'OT_SPILL_HIST',
    'OT_SPILL_LDAS',
    'OT_READONLY_READ',
    'OT_READONLY_WRITE',
    'OT_READONLY_ATOMIC',
    'OT_READONLY_HIST',
    'OT_READONLY_LDAS',
    'OT_FLAT_READ',
    'OT_FLAT_WRITE',
    'OT_FLAT_ATOMIC',
    'OT_FLAT_HIST',
    'OT_FLAT_LDAS',
    'OT_KERN_READ',
    'OT_BRANCH',

    # note: only OT_BOTH_MEMFENCE seems to be supported in the 1.0F version
    # of the compiler
    'OT_SHARED_MEMFENCE',
    'OT_GLOBAL_MEMFENCE',
    'OT_BOTH_MEMFENCE',

    'OT_BARRIER',
    'OT_PRINT',
    'OT_RET',
    'OT_NOP',
    'OT_ARG'
    ]

class MemType(Enum): vals = [
    'M_U8',
    'M_U16',
    'M_U32',
    'M_U64',
    'M_S8',
    'M_S16',
    'M_S32',
    'M_S64',
    'M_F16',
    'M_F32',
    'M_F64',
    ]

class MemOpType(Enum): vals = [
    'MO_LD',
    'MO_ST',
    'MO_LDAS',
    'MO_LDA',
    'MO_AAND',
    'MO_AOR',
    'MO_AXOR',
    'MO_ACAS',
    'MO_AEXCH',
    'MO_AADD',
    'MO_ASUB',
    'MO_AINC',
    'MO_ADEC',
    'MO_AMAX',
    'MO_AMIN',
    'MO_ANRAND',
    'MO_ANROR',
    'MO_ANRXOR',
    'MO_ANRCAS',
    'MO_ANREXCH',
    'MO_ANRADD',
    'MO_ANRSUB',
    'MO_ANRINC',
    'MO_ANRDEC',
    'MO_ANRMAX',
    'MO_ANRMIN',
    'MO_HAND',
    'MO_HOR',
    'MO_HXOR',
    'MO_HCAS',
    'MO_HEXCH',
    'MO_HADD',
    'MO_HSUB',
    'MO_HINC',
    'MO_HDEC',
    'MO_HMAX',
    'MO_HMIN',
    'MO_UNDEF'
    ]

class StorageClassType(Enum): vals = [
    'SC_SPILL',
    'SC_GLOBAL',
    'SC_SHARED',
    'SC_PRIVATE',
    'SC_READONLY',
    'SC_KERNARG',
    'SC_NONE',
    ]

class RegisterType(Enum): vals = [
    'RT_VECTOR',
    'RT_SCALAR',
    'RT_CONDITION',
    'RT_HARDWARE',
    'RT_NONE',
    ]

class GenericMemoryOrder(Enum): vals = [
    'MEMORY_ORDER_NONE',
    'MEMORY_ORDER_RELAXED',
    'MEMORY_ORDER_SC_ACQUIRE',
    'MEMORY_ORDER_SC_RELEASE',
    'MEMORY_ORDER_SC_ACQUIRE_RELEASE',
    ]

class GenericMemoryScope(Enum): vals = [
    'MEMORY_SCOPE_NONE',
    'MEMORY_SCOPE_WORKITEM',
    'MEMORY_SCOPE_WAVEFRONT',
    'MEMORY_SCOPE_WORKGROUP',
    'MEMORY_SCOPE_DEVICE',
    'MEMORY_SCOPE_SYSTEM',
    ]
51 src/gpu-compute/LdsState.py Normal file
@@ -0,0 +1,51 @@
#
# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Joe Gross
#

from m5.defines import buildEnv
from m5.params import *
from m5.proxy import *

from MemObject import MemObject

class LdsState(MemObject):
    type = 'LdsState'
    cxx_class = 'LdsState'
    cxx_header = 'gpu-compute/lds_state.hh'
    size = Param.Int(65536, 'the size of the LDS')
    range = Param.AddrRange('64kB', "address space of the LDS")
    bankConflictPenalty = Param.Int(1, 'penalty per LDS bank conflict when '
                                    'accessing data')
    banks = Param.Int(32, 'Number of LDS banks')
    cuPort = SlavePort("port that goes to the compute unit")
|
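The banks and bankConflictPenalty params above model a banked LDS. A minimal sketch, not part of this patch, of the stall computation those two params imply (word-interleaved banks assumed):

#include <algorithm>
#include <cstdint>
#include <vector>

// Hypothetical helper: extra cycles an access pays when several lanes
// hit the same LDS bank in one cycle.
int
ldsConflictCycles(const std::vector<uint64_t> &laneAddrs, int banks,
                  int bankConflictPenalty)
{
    std::vector<int> refsPerBank(banks, 0);
    int worst = 0;

    for (uint64_t addr : laneAddrs) {
        int bank = (addr / 4) % banks;   // 4-byte word interleaving assumed
        worst = std::max(worst, ++refsPerBank[bank]);
    }

    // the first access to the most-contended bank is free; each extra
    // access to it serializes and pays the penalty
    return worst ? (worst - 1) * bankConflictPenalty : 0;
}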
99
src/gpu-compute/SConscript
Normal file
@@ -0,0 +1,99 @@
# -*- mode:python -*-

#
# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# (BSD-style license text identical to LdsState.py above)
#
# Author: Anthony Gutierrez
#

Import('*')

if not env['BUILD_GPU']:
    Return()

SimObject('GPU.py')
SimObject('LdsState.py')
SimObject('X86GPUTLB.py')

if env['TARGET_GPU_ISA'] == 'hsail':
    Source('brig_object.cc')
    Source('hsail_code.cc')

Source('cl_driver.cc')
Source('compute_unit.cc')
Source('condition_register_state.cc')
Source('dispatcher.cc')
Source('exec_stage.cc')
Source('fetch_stage.cc')
Source('fetch_unit.cc')
Source('global_memory_pipeline.cc')
Source('gpu_dyn_inst.cc')
Source('gpu_exec_context.cc')
Source('gpu_static_inst.cc')
Source('gpu_tlb.cc')
Source('hsa_object.cc')
Source('kernel_cfg.cc')
Source('lds_state.cc')
Source('local_memory_pipeline.cc')
Source('of_scheduling_policy.cc')
Source('pool_manager.cc')
Source('rr_scheduling_policy.cc')
Source('schedule_stage.cc')
Source('scheduler.cc')
Source('scoreboard_check_stage.cc')
Source('shader.cc')
Source('simple_pool_manager.cc')
Source('tlb_coalescer.cc')
Source('vector_register_file.cc')
Source('vector_register_state.cc')
Source('wavefront.cc')

DebugFlag('BRIG')
DebugFlag('GPUCoalescer')
DebugFlag('GPUDisp')
DebugFlag('GPUExec')
DebugFlag('GPUFetch')
DebugFlag('GPUHsailCFInfo')
DebugFlag('GPUMem')
DebugFlag('GPUPort')
DebugFlag('GPUPrefetch')
DebugFlag('GPUReg')
DebugFlag('GPUSync')
DebugFlag('GPUTLB')
DebugFlag('HSALoader')
DebugFlag('HSAIL')
DebugFlag('HSAILObject')
DebugFlag('Predictor')
DebugFlag('WavefrontStack')

CompoundFlag('GPUALL', ['GPUCoalescer', 'GPUDisp', 'GPUExec', 'GPUFetch',
                        'GPUMem', 'GPUPort', 'GPUSync', 'GPUTLB', 'HSAIL'])
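Each DebugFlag() above generates a debug/<Flag>.hh header that model code uses to guard its tracing. A representative (not verbatim) use:

#include "base/trace.hh"
#include "debug/GPUDisp.hh"

// Hypothetical call site; enabled at runtime with --debug-flags=GPUDisp
// (or GPUALL via the CompoundFlag above).
void
traceDispatch(int cuId, int wfSlotId)
{
    DPRINTF(GPUDisp, "CU%d: dispatching wavefront to slot %d\n",
            cuId, wfSlotId);
}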
77
src/gpu-compute/X86GPUTLB.py
Normal file
@@ -0,0 +1,77 @@
#
# Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# (BSD-style license text identical to LdsState.py above)
#
# Author: Lisa Hsu
#

from m5.defines import buildEnv
from m5.params import *
from m5.proxy import *

from m5.objects.MemObject import MemObject

if buildEnv['FULL_SYSTEM']:
    class X86PagetableWalker(MemObject):
        type = 'X86PagetableWalker'
        cxx_class = 'X86ISA::Walker'
        port = SlavePort("Port for the hardware table walker")
        system = Param.System(Parent.any, "system object")

class X86GPUTLB(MemObject):
    type = 'X86GPUTLB'
    cxx_class = 'X86ISA::GpuTLB'
    cxx_header = 'gpu-compute/gpu_tlb.hh'
    size = Param.Int(64, "TLB size (number of entries)")
    assoc = Param.Int(64, "TLB associativity")

    if buildEnv['FULL_SYSTEM']:
        walker = Param.X86PagetableWalker(X86PagetableWalker(),
                                          "page table walker")

    hitLatency = Param.Int(2, "Latency of a TLB hit")
    missLatency1 = Param.Int(5, "Latency #1 of a TLB miss")
    missLatency2 = Param.Int(100, "Latency #2 of a TLB miss")
    maxOutstandingReqs = Param.Int(64, "# of maximum outstanding requests")
    slave = VectorSlavePort("Port on side closer to CPU/CU")
    master = VectorMasterPort("Port on side closer to memory")
    allocationPolicy = Param.Bool(True, "Allocate on an access")
    accessDistance = Param.Bool(False, "print accessDistance stats")

class TLBCoalescer(MemObject):
    type = 'TLBCoalescer'
    cxx_class = 'TLBCoalescer'
    cxx_header = 'gpu-compute/tlb_coalescer.hh'
    probesPerCycle = Param.Int(2, "Number of TLB probes per cycle")
    coalescingWindow = Param.Int(1, "Permit coalescing across that many ticks")
    slave = VectorSlavePort("Port on side closer to CPU/CU")
    master = VectorMasterPort("Port on side closer to memory")
    disableCoalescing = Param.Bool(False, "Disable coalescing")
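With the defaults above (size = 64, assoc = 64) the GPU TLB is fully associative: one set of 64 ways. A minimal sketch, not part of this patch, of how set selection falls out of those two params (4kB pages assumed):

#include <cstdint>

// Hypothetical helper mirroring the size/assoc params.
uint32_t
tlbSetIndex(uint64_t vaddr, int size, int assoc)
{
    int numSets = size / assoc;   // 64 / 64 -> 1 set (fully associative)
    uint64_t vpn = vaddr >> 12;   // strip the 4kB page offset
    return vpn % numSets;         // always 0 with the default params
}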
474
src/gpu-compute/brig_object.cc
Normal file
@@ -0,0 +1,474 @@
/*
 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * (BSD-style license text identical to LdsState.py above)
 *
 * Author: Steve Reinhardt, Anthony Gutierrez
 */

#include "gpu-compute/brig_object.hh"

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <unistd.h>

#include <cassert>
#include <cstddef>
#include <cstdlib>

#include "arch/hsail/Brig.h"
#include "base/misc.hh"
#include "base/trace.hh"
#include "debug/BRIG.hh"
#include "debug/HSAILObject.hh"
#include "debug/HSALoader.hh"

using namespace Brig;

std::vector<std::function<HsaObject*(const std::string&, int, uint8_t*)>>
    HsaObject::tryFileFuncs = { BrigObject::tryFile };

extern int getBrigDataTypeBytes(BrigType16_t t);

const char *BrigObject::sectionNames[] =
{
    "hsa_data",
    "hsa_code",
    "hsa_operand",
    ".shstrtab"
};

const char *segmentNames[] =
{
    "none",
    "flat",
    "global",
    "readonly",
    "kernarg",
    "group",
    "private",
    "spill",
    "args"
};

const uint8_t*
BrigObject::getSectionOffset(enum SectionIndex sec, int offs) const
{
    // allow offs == size for dummy end pointers
    assert(offs <= sectionInfo[sec].size);

    return sectionInfo[sec].ptr + offs;
}

const char*
BrigObject::getString(int offs) const
{
    return (const char*)(getSectionOffset(DataSectionIndex, offs) + 4);
}

const BrigBase*
BrigObject::getCodeSectionEntry(int offs) const
{
    return (const BrigBase*)getSectionOffset(CodeSectionIndex, offs);
}

const BrigData*
BrigObject::getBrigBaseData(int offs) const
{
    return (Brig::BrigData*)(getSectionOffset(DataSectionIndex, offs));
}

const uint8_t*
BrigObject::getData(int offs) const
{
    return getSectionOffset(DataSectionIndex, offs);
}

const BrigOperand*
BrigObject::getOperand(int offs) const
{
    return (const BrigOperand*)getSectionOffset(OperandsSectionIndex, offs);
}

unsigned
BrigObject::getOperandPtr(int offs, int index) const
{
    unsigned *op_offs = (unsigned*)(getData(offs + 4 * (index + 1)));

    return *op_offs;
}

const BrigInstBase*
BrigObject::getInst(int offs) const
{
    return (const BrigInstBase*)getSectionOffset(CodeSectionIndex, offs);
}

HsaCode*
BrigObject::getKernel(const std::string &name) const
{
    return nullptr;
}

HsaCode*
BrigObject::getFunction(const std::string &name) const
{
    for (int i = 0; i < functions.size(); ++i) {
        if (functions[i]->name() == name) {
            return functions[i];
        }
    }

    return nullptr;
}

void
BrigObject::processDirectives(const BrigBase *dirPtr, const BrigBase *endPtr,
                              StorageMap *storageMap)
{
    while (dirPtr < endPtr) {
        if (!dirPtr->byteCount) {
            fatal("Bad directive size 0\n");
        }

        // calculate next pointer now so we can override it if needed
        const BrigBase *nextDirPtr = brigNext(dirPtr);

        DPRINTF(HSAILObject, "Code section entry kind: #%x, byte count: %d\n",
                dirPtr->kind, dirPtr->byteCount);

        switch (dirPtr->kind) {
          case BRIG_KIND_DIRECTIVE_FUNCTION:
            {
                const BrigDirectiveExecutable *p M5_VAR_USED =
                    reinterpret_cast<const BrigDirectiveExecutable*>(dirPtr);

                DPRINTF(HSAILObject, "DIRECTIVE_FUNCTION: %s offset: "
                        "%d next: %d\n", getString(p->name),
                        p->firstCodeBlockEntry, p->nextModuleEntry);

                if (p->firstCodeBlockEntry != p->nextModuleEntry) {
                    panic("Function calls are not fully supported yet!!: %s\n",
                          getString(p->name));

                    const char *name = getString(p->name);

                    HsailCode *code_obj = nullptr;

                    for (int i = 0; i < functions.size(); ++i) {
                        if (functions[i]->name() == name) {
                            code_obj = functions[i];
                            break;
                        }
                    }

                    if (!code_obj) {
                        // create new local storage map for kernel-local
                        // symbols
                        code_obj = new HsailCode(name, p, this,
                                                 new StorageMap(storageMap));
                        functions.push_back(code_obj);
                    } else {
                        panic("Multiple definition of Function!!: %s\n",
                              getString(p->name));
                    }

                }
                nextDirPtr = getCodeSectionEntry(p->nextModuleEntry);
            }
            break;

          case BRIG_KIND_DIRECTIVE_KERNEL:
            {
                const BrigDirectiveExecutable *p =
                    reinterpret_cast<const BrigDirectiveExecutable*>(dirPtr);

                DPRINTF(HSAILObject, "DIRECTIVE_KERNEL: %s offset: %d count: "
                        "next: %d\n", getString(p->name),
                        p->firstCodeBlockEntry, p->nextModuleEntry);

                const char *name = getString(p->name);

                if (name[0] == '&')
                    name++;

                std::string str = name;
                char *temp;
                int len = str.length();

                if (str[len - 1] >= 'a' && str[len - 1] <= 'z') {
                    temp = new char[str.size() + 1];
                    std::copy(str.begin(), str.end(), temp);
                    temp[str.size()] = '\0';
                } else {
                    temp = new char[str.size()];
                    std::copy(str.begin(), str.end() - 1, temp);
                    temp[str.size() - 1] = '\0';
                }

                std::string kernel_name = temp;
                delete[] temp;

                HsailCode *code_obj = nullptr;

                for (const auto &kernel : kernels) {
                    if (kernel->name() == kernel_name) {
                        code_obj = kernel;
                        break;
                    }
                }

                if (!code_obj) {
                    // create new local storage map for kernel-local symbols
                    code_obj = new HsailCode(kernel_name, p, this,
                                             new StorageMap(storageMap));

                    kernels.push_back(code_obj);
                }

                nextDirPtr = getCodeSectionEntry(p->nextModuleEntry);
            }
            break;

          case BRIG_KIND_DIRECTIVE_VARIABLE:
            {
                const BrigDirectiveVariable *p =
                    reinterpret_cast<const BrigDirectiveVariable*>(dirPtr);

                uint64_t readonlySize_old =
                    storageMap->getSize(BRIG_SEGMENT_READONLY);

                StorageElement* se = storageMap->addSymbol(p, this);

                DPRINTF(HSAILObject, "DIRECTIVE_VARIABLE, symbol %s\n",
                        getString(p->name));

                if (p->segment == BRIG_SEGMENT_READONLY) {
                    // readonly memory has initialization data
                    uint8_t* readonlyData_old = readonlyData;

                    readonlyData =
                        new uint8_t[storageMap->getSize(BRIG_SEGMENT_READONLY)];

                    if (p->init) {
                        if ((p->type == BRIG_TYPE_ROIMG) ||
                            (p->type == BRIG_TYPE_WOIMG) ||
                            (p->type == BRIG_TYPE_SAMP) ||
                            (p->type == BRIG_TYPE_SIG32) ||
                            (p->type == BRIG_TYPE_SIG64)) {
                            panic("Read only data type not supported: %s\n",
                                  getString(p->name));
                        }

                        const BrigOperand *brigOp = getOperand(p->init);
                        assert(brigOp->kind ==
                               BRIG_KIND_OPERAND_CONSTANT_BYTES);

                        const Brig::BrigData *operand_data M5_VAR_USED =
                            getBrigBaseData(((BrigOperandConstantBytes*)
                                            brigOp)->bytes);

                        assert((operand_data->byteCount / 4) > 0);

                        uint8_t *symbol_data =
                            (uint8_t*)getData(((BrigOperandConstantBytes*)
                                              brigOp)->bytes + 4);

                        // copy the old data and add the new data
                        if (readonlySize_old > 0) {
                            memcpy(readonlyData, readonlyData_old,
                                   readonlySize_old);
                        }

                        memcpy(readonlyData + se->offset, symbol_data,
                               se->size);

                        delete[] readonlyData_old;
                    }
                }
            }
            break;

          case BRIG_KIND_DIRECTIVE_LABEL:
            {
                const BrigDirectiveLabel M5_VAR_USED *p =
                    reinterpret_cast<const BrigDirectiveLabel*>(dirPtr);

                panic("Label directives cannot be at the module level: %s\n",
                      getString(p->name));

            }
            break;

          case BRIG_KIND_DIRECTIVE_COMMENT:
            {
                const BrigDirectiveComment M5_VAR_USED *p =
                    reinterpret_cast<const BrigDirectiveComment*>(dirPtr);

                DPRINTF(HSAILObject, "DIRECTIVE_COMMENT: %s\n",
                        getString(p->name));
            }
            break;

          case BRIG_KIND_DIRECTIVE_LOC:
            {
                DPRINTF(HSAILObject, "BRIG_DIRECTIVE_LOC\n");
            }
            break;

          case BRIG_KIND_DIRECTIVE_MODULE:
            {
                const BrigDirectiveModule M5_VAR_USED *p =
                    reinterpret_cast<const BrigDirectiveModule*>(dirPtr);

                DPRINTF(HSAILObject, "BRIG_DIRECTIVE_MODULE: %s\n",
                        getString(p->name));
            }
            break;

          case BRIG_KIND_DIRECTIVE_CONTROL:
            {
                DPRINTF(HSAILObject, "DIRECTIVE_CONTROL\n");
            }
            break;

          case BRIG_KIND_DIRECTIVE_PRAGMA:
            {
                DPRINTF(HSAILObject, "DIRECTIVE_PRAGMA\n");
            }
            break;

          case BRIG_KIND_DIRECTIVE_EXTENSION:
            {
                DPRINTF(HSAILObject, "DIRECTIVE_EXTENSION\n");
            }
            break;

          case BRIG_KIND_DIRECTIVE_ARG_BLOCK_START:
            {
                DPRINTF(HSAILObject, "DIRECTIVE_ARG_BLOCK_START\n");
            }
            break;

          case BRIG_KIND_DIRECTIVE_ARG_BLOCK_END:
            {
                DPRINTF(HSAILObject, "DIRECTIVE_ARG_BLOCK_END\n");
            }
            break;
          default:
            if (dirPtr->kind >= BRIG_KIND_INST_BEGIN &&
                dirPtr->kind <= BRIG_KIND_INST_END)
                break;

            if (dirPtr->kind >= BRIG_KIND_OPERAND_BEGIN &&
                dirPtr->kind <= BRIG_KIND_OPERAND_END)
                break;

            warn("Unknown Brig directive kind: %d\n", dirPtr->kind);
            break;
        }

        dirPtr = nextDirPtr;
    }
}

HsaObject*
BrigObject::tryFile(const std::string &fname, int len, uint8_t *fileData)
{
    const char *brig_ident = "HSA BRIG";

    if (memcmp(brig_ident, fileData, MODULE_IDENTIFICATION_LENGTH))
        return nullptr;

    return new BrigObject(fname, len, fileData);
}

BrigObject::BrigObject(const std::string &fname, int len, uint8_t *fileData)
    : HsaObject(fname), storageMap(new StorageMap())
{
    const char *brig_ident = "HSA BRIG";
    BrigModuleHeader *mod_hdr = (BrigModuleHeader*)fileData;

    fatal_if(memcmp(brig_ident, mod_hdr, MODULE_IDENTIFICATION_LENGTH),
             "%s is not a BRIG file\n", fname);

    if (mod_hdr->brigMajor != BRIG_VERSION_BRIG_MAJOR ||
        mod_hdr->brigMinor != BRIG_VERSION_BRIG_MINOR) {
        fatal("%s: BRIG version mismatch, %d.%d != %d.%d\n",
              fname, mod_hdr->brigMajor, mod_hdr->brigMinor,
              BRIG_VERSION_BRIG_MAJOR, BRIG_VERSION_BRIG_MINOR);
    }

    fatal_if(mod_hdr->sectionCount != NumSectionIndices, "%s: BRIG section "
             "count (%d) != expected value (%d)\n", fname,
             mod_hdr->sectionCount, NumSectionIndices);

    for (int i = 0; i < NumSectionIndices; ++i) {
        sectionInfo[i].ptr = nullptr;
    }

    uint64_t *sec_idx_table = (uint64_t*)(fileData + mod_hdr->sectionIndex);
    for (int sec_idx = 0; sec_idx < mod_hdr->sectionCount; ++sec_idx) {
        uint8_t *sec_hdr_byte_ptr = fileData + sec_idx_table[sec_idx];
        BrigSectionHeader *sec_hdr = (BrigSectionHeader*)sec_hdr_byte_ptr;

        // It doesn't look like cprintf supports string precision values,
        // but if this breaks, the right answer is to fix that
        DPRINTF(HSAILObject, "found section %.*s\n", sec_hdr->nameLength,
                sec_hdr->name);

        sectionInfo[sec_idx].ptr = new uint8_t[sec_hdr->byteCount];
        memcpy(sectionInfo[sec_idx].ptr, sec_hdr_byte_ptr, sec_hdr->byteCount);
        sectionInfo[sec_idx].size = sec_hdr->byteCount;
    }

    BrigSectionHeader *code_hdr =
        (BrigSectionHeader*)sectionInfo[CodeSectionIndex].ptr;

    DPRINTF(HSAILObject, "Code section hdr, count: %d, hdr count: %d, "
            "name len: %d\n", code_hdr->byteCount, code_hdr->headerByteCount,
            code_hdr->nameLength);

    // start at offset 4 to skip initial null entry (see Brig spec)
    processDirectives(getCodeSectionEntry(code_hdr->headerByteCount),
                      getCodeSectionEntry(sectionInfo[CodeSectionIndex].size),
                      storageMap);

    delete[] fileData;

    DPRINTF(HSALoader, "BRIG object %s loaded.\n", fname);
}

BrigObject::~BrigObject()
{
    for (int i = 0; i < NumSectionIndices; ++i)
        if (sectionInfo[i].ptr)
            delete[] sectionInfo[i].ptr;
}
134
src/gpu-compute/brig_object.hh
Normal file
@@ -0,0 +1,134 @@
/*
 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * (BSD-style license text identical to LdsState.py above)
 *
 * Author: Steve Reinhardt, Anthony Gutierrez
 */

#ifndef __BRIG_OBJECT_HH__
#define __BRIG_OBJECT_HH__

#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

#include "arch/hsail/Brig.h"
#include "gpu-compute/hsa_object.hh"
#include "gpu-compute/hsail_code.hh"

class LabelMap;
class StorageMap;

/* @class BrigObject
 * this class implements the BRIG loader object, and
 * is used when the simulator directly executes HSAIL.
 * this class is responsible for extracting all
 * information about kernels contained in BRIG format
 * and converts them to HsailCode objects that are
 * usable by the simulator and emulated runtime.
 */

class BrigObject final : public HsaObject
{
  public:
    enum SectionIndex
    {
        DataSectionIndex,
        CodeSectionIndex,
        OperandsSectionIndex,
        NumSectionIndices
    };

    static const char *sectionNames[];

    struct SectionInfo
    {
        uint8_t *ptr;
        int size;
    };

    static HsaObject* tryFile(const std::string &fname, int len,
                              uint8_t *fileData);

    SectionInfo sectionInfo[NumSectionIndices];
    const uint8_t *getSectionOffset(enum SectionIndex sec, int offs) const;

    std::vector<HsailCode*> kernels;
    std::vector<HsailCode*> functions;
    std::string kern_block_name;

    void processDirectives(const Brig::BrigBase *dirPtr,
                           const Brig::BrigBase *endPtr,
                           StorageMap *storageMap);

    BrigObject(const std::string &fname, int len, uint8_t *fileData);
    ~BrigObject();

    // eventually these will need to be per-kernel not per-object-file
    StorageMap *storageMap;
    LabelMap *labelMap;

    const char* getString(int offs) const;
    const Brig::BrigData* getBrigBaseData(int offs) const;
    const uint8_t* getData(int offs) const;
    const Brig::BrigBase* getCodeSectionEntry(int offs) const;
    const Brig::BrigOperand* getOperand(int offs) const;
    unsigned getOperandPtr(int offs, int index) const;
    const Brig::BrigInstBase* getInst(int offs) const;

    HsaCode* getKernel(const std::string &name) const override;
    HsaCode* getFunction(const std::string &name) const override;

    int numKernels() const override { return kernels.size(); }

    HsaCode* getKernel(int i) const override { return kernels[i]; }

    // pointer to the current kernel/function we're processing, so elements
    // under construction can reference it. kinda ugly, but easier
    // than passing it all over for the few places it's needed.
    mutable HsailCode *currentCode;
};

// Utility function to bump Brig item pointer to next element given
// item size in bytes. Really just an add but with lots of casting.
template<typename T>
T*
brigNext(T *ptr)
{
    Brig::BrigBase *base_ptr = (Brig::BrigBase*)ptr;
    int size = base_ptr->byteCount;
    assert(size);

    return (T*)((uint8_t*)ptr + size);
}

#endif // __BRIG_OBJECT_HH__
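brigNext() is the whole traversal mechanism: every BRIG entry starts with a {byteCount, kind} BrigBase, so walking a section is just repeated pointer bumps. A minimal sketch, not part of this patch, mirroring the loop in BrigObject::processDirectives():

#include <cassert>

#include "gpu-compute/brig_object.hh"

// Hypothetical walker over [entry, end), where both pointers come from
// getCodeSectionEntry() as in the BrigObject constructor.
void
walkCodeSection(const Brig::BrigBase *entry, const Brig::BrigBase *end)
{
    while (entry < end) {
        assert(entry->byteCount);   // a zero-sized entry would loop forever
        entry = brigNext(entry);    // hop byteCount bytes to the next entry
    }
}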
272
src/gpu-compute/cl_driver.cc
Normal file
@@ -0,0 +1,272 @@
/*
 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * (BSD-style license text identical to LdsState.py above)
 *
 * Author: Anthony Gutierrez
 */

#include "gpu-compute/cl_driver.hh"

#include "base/intmath.hh"
#include "cpu/thread_context.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/hsa_code.hh"
#include "gpu-compute/hsa_kernel_info.hh"
#include "gpu-compute/hsa_object.hh"
#include "params/ClDriver.hh"
#include "sim/process.hh"
#include "sim/syscall_emul_buf.hh"

ClDriver::ClDriver(ClDriverParams *p)
    : EmulatedDriver(p), hsaCode(0)
{
    for (const auto &codeFile : p->codefile)
        codeFiles.push_back(&codeFile);

    maxFuncArgsSize = 0;

    for (int i = 0; i < codeFiles.size(); ++i) {
        HsaObject *obj = HsaObject::createHsaObject(*codeFiles[i]);

        for (int k = 0; k < obj->numKernels(); ++k) {
            assert(obj->getKernel(k));
            kernels.push_back(obj->getKernel(k));
            kernels.back()->setReadonlyData((uint8_t*)obj->readonlyData);
            int kern_funcargs_size = kernels.back()->funcarg_size;
            maxFuncArgsSize = maxFuncArgsSize < kern_funcargs_size ?
                kern_funcargs_size : maxFuncArgsSize;
        }
    }

    int name_offs = 0;
    int code_offs = 0;

    for (int i = 0; i < kernels.size(); ++i) {
        kernelInfo.push_back(HsaKernelInfo());
        HsaCode *k = kernels[i];

        k->generateHsaKernelInfo(&kernelInfo[i]);

        kernelInfo[i].name_offs = name_offs;
        kernelInfo[i].code_offs = code_offs;

        name_offs += k->name().size() + 1;
        code_offs += k->numInsts() * sizeof(GPUStaticInst*);
    }
}

void
ClDriver::handshake(GpuDispatcher *_dispatcher)
{
    dispatcher = _dispatcher;
    dispatcher->setFuncargsSize(maxFuncArgsSize);
}

int
ClDriver::open(LiveProcess *p, ThreadContext *tc, int mode, int flags)
{
    int fd = p->allocFD(-1, filename, 0, 0, false);
    FDEntry *fde = p->getFDEntry(fd);
    fde->driver = this;

    return fd;
}

int
ClDriver::ioctl(LiveProcess *process, ThreadContext *tc, unsigned req)
{
    int index = 2;
    Addr buf_addr = process->getSyscallArg(tc, index);

    switch (req) {
      case HSA_GET_SIZES:
        {
            TypedBufferArg<HsaDriverSizes> sizes(buf_addr);
            sizes->num_kernels = kernels.size();
            sizes->string_table_size = 0;
            sizes->code_size = 0;
            sizes->readonly_size = 0;

            if (kernels.size() > 0) {
                // all kernels will share the same read-only memory
                sizes->readonly_size =
                    kernels[0]->getSize(HsaCode::MemorySegment::READONLY);
                // check our assumption
                for (int i = 1; i < kernels.size(); ++i) {
                    assert(sizes->readonly_size ==
                           kernels[i]->getSize(HsaCode::MemorySegment::READONLY));
                }
            }

            for (int i = 0; i < kernels.size(); ++i) {
                HsaCode *k = kernels[i];
                // add one for terminating '\0'
                sizes->string_table_size += k->name().size() + 1;
                sizes->code_size += k->numInsts() * sizeof(GPUStaticInst*);
            }

            sizes.copyOut(tc->getMemProxy());
        }
        break;

      case HSA_GET_KINFO:
        {
            TypedBufferArg<HsaKernelInfo>
                kinfo(buf_addr, sizeof(HsaKernelInfo) * kernels.size());

            for (int i = 0; i < kernels.size(); ++i) {
                HsaKernelInfo *ki = &kinfo[i];
                ki->name_offs = kernelInfo[i].name_offs;
                ki->code_offs = kernelInfo[i].code_offs;
                ki->sRegCount = kernelInfo[i].sRegCount;
                ki->dRegCount = kernelInfo[i].dRegCount;
                ki->cRegCount = kernelInfo[i].cRegCount;
                ki->static_lds_size = kernelInfo[i].static_lds_size;
                ki->private_mem_size = kernelInfo[i].private_mem_size;
                ki->spill_mem_size = kernelInfo[i].spill_mem_size;
            }

            kinfo.copyOut(tc->getMemProxy());
        }
        break;

      case HSA_GET_STRINGS:
        {
            int string_table_size = 0;
            for (int i = 0; i < kernels.size(); ++i) {
                HsaCode *k = kernels[i];
                string_table_size += k->name().size() + 1;
            }

            BufferArg buf(buf_addr, string_table_size);
            char *bufp = (char*)buf.bufferPtr();

            for (int i = 0; i < kernels.size(); ++i) {
                HsaCode *k = kernels[i];
                const char *n = k->name().c_str();

                // idiomatic string copy
                while ((*bufp++ = *n++));
            }

            assert(bufp - (char *)buf.bufferPtr() == string_table_size);

            buf.copyOut(tc->getMemProxy());
        }
        break;

      case HSA_GET_READONLY_DATA:
        {
            // we can pick any kernel --- they share the same
            // readonly segment (this assumption is checked in GET_SIZES)
            uint64_t size =
                kernels.back()->getSize(HsaCode::MemorySegment::READONLY);
            BufferArg data(buf_addr, size);
            char *datap = (char *)data.bufferPtr();
            memcpy(datap,
                   kernels.back()->readonly_data,
                   size);
            data.copyOut(tc->getMemProxy());
        }
        break;

      case HSA_GET_CODE:
        {
            // set hsaCode pointer
            hsaCode = buf_addr;
            int code_size = 0;

            for (int i = 0; i < kernels.size(); ++i) {
                HsaCode *k = kernels[i];
                code_size += k->numInsts() * sizeof(TheGpuISA::RawMachInst);
            }

            TypedBufferArg<TheGpuISA::RawMachInst> buf(buf_addr, code_size);
            TheGpuISA::RawMachInst *bufp = buf;

            int buf_idx = 0;

            for (int i = 0; i < kernels.size(); ++i) {
                HsaCode *k = kernels[i];

                for (int j = 0; j < k->numInsts(); ++j) {
                    bufp[buf_idx] = k->insts()->at(j);
                    ++buf_idx;
                }
            }

            buf.copyOut(tc->getMemProxy());
        }
        break;

      case HSA_GET_CU_CNT:
        {
            BufferArg buf(buf_addr, sizeof(uint32_t));
            *((uint32_t*)buf.bufferPtr()) = dispatcher->getNumCUs();
            buf.copyOut(tc->getMemProxy());
        }
        break;

      case HSA_GET_VSZ:
        {
            BufferArg buf(buf_addr, sizeof(uint32_t));
            *((uint32_t*)buf.bufferPtr()) = VSZ;
            buf.copyOut(tc->getMemProxy());
        }
        break;

      default:
        fatal("ClDriver: bad ioctl %d\n", req);
    }

    return 0;
}

const char*
ClDriver::codeOffToKernelName(uint64_t code_ptr)
{
    assert(hsaCode);
    uint32_t code_offs = code_ptr - hsaCode;

    for (int i = 0; i < kernels.size(); ++i) {
        if (code_offs == kernelInfo[i].code_offs) {
            return kernels[i]->name().c_str();
        }
    }

    return nullptr;
}

ClDriver*
ClDriverParams::create()
{
    return new ClDriver(this);
}
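The ioctl cases above define the contract the emulated runtime programs against. A sketch of the user-space side, assuming the request codes and HsaDriverSizes layout come from the shared driver header this patch uses (names illustrative, not verbatim runtime code):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>

#include <vector>

void
queryDriver(const char *driverPath)
{
    int fd = open(driverPath, O_RDWR);      // lands in ClDriver::open()

    HsaDriverSizes sizes;
    ioctl(fd, HSA_GET_SIZES, &sizes);       // driver fills in the four sizes

    std::vector<char> strings(sizes.string_table_size);
    ioctl(fd, HSA_GET_STRINGS, strings.data());  // '\0'-separated kernel names

    close(fd);
}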
77
src/gpu-compute/cl_driver.hh
Normal file
@@ -0,0 +1,77 @@
/*
 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * (BSD-style license text identical to LdsState.py above)
 *
 * Author: Anthony Gutierrez
 */

#ifndef __CL_DRIVER_HH__
#define __CL_DRIVER_HH__

#include <vector>

#include "gpu-compute/hsa_kernel_info.hh"
#include "sim/emul_driver.hh"

class GpuDispatcher;
class HsaCode;
class LiveProcess;
class ThreadContext;

struct ClDriverParams;

class ClDriver final : public EmulatedDriver
{
  public:
    ClDriver(ClDriverParams *p);
    void handshake(GpuDispatcher *_dispatcher);
    int open(LiveProcess *p, ThreadContext *tc, int mode, int flags);
    int ioctl(LiveProcess *p, ThreadContext *tc, unsigned req);
    const char* codeOffToKernelName(uint64_t code_ptr);

  private:
    GpuDispatcher *dispatcher;

    std::vector<const std::string*> codeFiles;

    // All the kernels we know about
    std::vector<HsaCode*> kernels;
    std::vector<HsaCode*> functions;

    std::vector<HsaKernelInfo> kernelInfo;

    // maximum size necessary for function arguments
    int maxFuncArgsSize;
    // The host virtual address for the kernel code
    uint64_t hsaCode;
};

#endif // __CL_DRIVER_HH__
51
src/gpu-compute/cl_event.hh
Normal file
@@ -0,0 +1,51 @@
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * (BSD-style license text identical to LdsState.py above)
 *
 * Authors: Marc Orr
 */

#ifndef __GPU_CL_EVENT_HH__
#define __GPU_CL_EVENT_HH__

struct HsaQueueEntry;

class _cl_event {
  public:
    _cl_event() : done(false), hsaTaskPtr(nullptr), start(0), end(0) { }

    volatile bool done;
    HsaQueueEntry *hsaTaskPtr;
    uint64_t start;
    uint64_t end;
};

#endif // __GPU_CL_EVENT_HH__
116
src/gpu-compute/code_enums.hh
Normal file
@@ -0,0 +1,116 @@
/*
 * Copyright (c) 2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * (BSD-style license text identical to LdsState.py above)
 *
 * Author: Anthony Gutierrez
 */

#ifndef __CODE_ENUMS_HH__
#define __CODE_ENUMS_HH__

#define IS_OT_GLOBAL(a) ((a)>=Enums::OT_GLOBAL_READ \
    && (a)<=Enums::OT_GLOBAL_LDAS)
#define IS_OT_SHARED(a) ((a)>=Enums::OT_SHARED_READ \
    && (a)<=Enums::OT_SHARED_LDAS)
#define IS_OT_PRIVATE(a) ((a)>=Enums::OT_PRIVATE_READ \
    && (a)<=Enums::OT_PRIVATE_LDAS)
#define IS_OT_SPILL(a) ((a)>=Enums::OT_SPILL_READ \
    && (a)<=Enums::OT_SPILL_LDAS)
#define IS_OT_READONLY(a) ((a)>=Enums::OT_READONLY_READ \
    && (a)<=Enums::OT_READONLY_LDAS)
#define IS_OT_FLAT(a) ((a)>=Enums::OT_FLAT_READ && (a)<=Enums::OT_FLAT_LDAS)

#define IS_OT_LDAS(a) ((a)==Enums::OT_GLOBAL_LDAS||(a)==Enums::OT_SHARED_LDAS \
    ||(a)==Enums::OT_PRIVATE_LDAS||(a)==Enums::OT_SPILL_LDAS \
    ||(a)==Enums::OT_READONLY_LDAS||(a)==Enums::OT_FLAT_LDAS)

#define IS_OT_READ(a) ((a)==Enums::OT_GLOBAL_READ||(a)==Enums::OT_SHARED_READ \
    ||(a)==Enums::OT_PRIVATE_READ||(a)==Enums::OT_SPILL_READ \
    ||(a)==Enums::OT_READONLY_READ||(a)==Enums::OT_FLAT_READ)

#define IS_OT_READ_GM(a) \
    ((a)==Enums::OT_GLOBAL_READ||(a)==Enums::OT_SPILL_READ \
    ||(a)==Enums::OT_READONLY_READ)

#define IS_OT_READ_LM(a) ((a)==Enums::OT_SHARED_READ)

#define IS_OT_READ_RM(a) ((a)==Enums::OT_READONLY_READ)

#define IS_OT_READ_PM(a) ((a)==Enums::OT_PRIVATE_READ)

#define IS_OT_WRITE(a) \
    ((a)==Enums::OT_GLOBAL_WRITE||(a)==Enums::OT_SHARED_WRITE \
    ||(a)==Enums::OT_PRIVATE_WRITE||(a)==Enums::OT_SPILL_WRITE \
    ||(a)==Enums::OT_READONLY_WRITE||(a)==Enums::OT_FLAT_WRITE)

#define IS_OT_WRITE_GM(a) \
    ((a)==Enums::OT_GLOBAL_WRITE||(a)==Enums::OT_SPILL_WRITE \
    ||(a)==Enums::OT_READONLY_WRITE)

#define IS_OT_WRITE_LM(a) ((a)==Enums::OT_SHARED_WRITE)

#define IS_OT_WRITE_PM(a) ((a)==Enums::OT_PRIVATE_WRITE)

#define IS_OT_ATOMIC(a) ((a)==Enums::OT_GLOBAL_ATOMIC \
    ||(a)==Enums::OT_SHARED_ATOMIC \
    ||(a)==Enums::OT_PRIVATE_ATOMIC \
    ||(a)==Enums::OT_SPILL_ATOMIC \
    ||(a)==Enums::OT_READONLY_ATOMIC \
    ||(a)==Enums::OT_FLAT_ATOMIC)

#define IS_OT_ATOMIC_GM(a) ((a)==Enums::OT_GLOBAL_ATOMIC \
    ||(a)==Enums::OT_SPILL_ATOMIC \
    ||(a)==Enums::OT_READONLY_ATOMIC \
    ||(a)==Enums::OT_GLOBAL_MEMFENCE \
    ||(a)==Enums::OT_BOTH_MEMFENCE)

#define IS_OT_ATOMIC_LM(a) ((a)==Enums::OT_SHARED_ATOMIC \
    ||(a)==Enums::OT_SHARED_MEMFENCE \
    ||(a)==Enums::OT_BOTH_MEMFENCE)

#define IS_OT_ATOMIC_PM(a) ((a)==Enums::OT_PRIVATE_ATOMIC)

#define IS_OT_HIST(a) ((a)==Enums::OT_GLOBAL_HIST \
    ||(a)==Enums::OT_SHARED_HIST \
    ||(a)==Enums::OT_PRIVATE_HIST \
    ||(a)==Enums::OT_SPILL_HIST \
    ||(a)==Enums::OT_READONLY_HIST \
    ||(a)==Enums::OT_FLAT_HIST)

#define IS_OT_HIST_GM(a) ((a)==Enums::OT_GLOBAL_HIST \
    ||(a)==Enums::OT_SPILL_HIST \
    ||(a)==Enums::OT_READONLY_HIST)

#define IS_OT_HIST_LM(a) ((a)==Enums::OT_SHARED_HIST)

#define IS_OT_HIST_PM(a) ((a)==Enums::OT_PRIVATE_HIST)

#endif // __CODE_ENUMS_HH__
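A minimal sketch, not part of this patch, of how these predicates classify an op type into the memory pipeline that services it (the Enums::OpType header name is assumed; the real routing lives in the memory pipeline stages):

#include "enums/OpType.hh"   // generated from GPU.py; name assumed
#include "gpu-compute/code_enums.hh"

enum class MemPipe { Global, Local, Private, None };

MemPipe
pipeFor(Enums::OpType ot)
{
    if (IS_OT_READ_GM(ot) || IS_OT_WRITE_GM(ot) || IS_OT_ATOMIC_GM(ot))
        return MemPipe::Global;    // global memory pipeline
    if (IS_OT_READ_LM(ot) || IS_OT_WRITE_LM(ot) || IS_OT_ATOMIC_LM(ot))
        return MemPipe::Local;     // LDS pipeline
    if (IS_OT_READ_PM(ot) || IS_OT_WRITE_PM(ot) || IS_OT_ATOMIC_PM(ot))
        return MemPipe::Private;
    return MemPipe::None;
}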
1817
src/gpu-compute/compute_unit.cc
Normal file
File diff suppressed because it is too large
767
src/gpu-compute/compute_unit.hh
Normal file
767
src/gpu-compute/compute_unit.hh
Normal file
|
@ -0,0 +1,767 @@
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: John Kalamatianos, Anthony Gutierrez
 */

#ifndef __COMPUTE_UNIT_HH__
#define __COMPUTE_UNIT_HH__

// <list> and <queue> added here because waveQueue (std::list) and
// LDSPort::retries (std::queue) below need them
#include <deque>
#include <list>
#include <map>
#include <queue>
#include <unordered_map>
#include <vector>

#include "base/callback.hh"
#include "base/statistics.hh"
#include "base/types.hh"
#include "enums/PrefetchType.hh"
#include "gpu-compute/exec_stage.hh"
#include "gpu-compute/fetch_stage.hh"
#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/local_memory_pipeline.hh"
#include "gpu-compute/qstruct.hh"
#include "gpu-compute/schedule_stage.hh"
#include "gpu-compute/scoreboard_check_stage.hh"
#include "mem/mem_object.hh"
#include "mem/port.hh"

static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1;
static const int MAX_WIDTH_FOR_MEM_INST = 32;

class LdsChunk;
class LdsState;
class NDRange;
class Shader;
class VectorRegisterFile;

struct ComputeUnitParams;

enum EXEC_POLICY
{
    OLDEST = 0,
    RR
};

// List of execution units
enum EXEC_UNIT
{
    SIMD0 = 0,
    SIMD1,
    SIMD2,
    SIMD3,
    GLBMEM_PIPE,
    LDSMEM_PIPE,
    NUM_UNITS
};

enum TLB_CACHE
{
    TLB_MISS_CACHE_MISS = 0,
    TLB_MISS_CACHE_HIT,
    TLB_HIT_CACHE_MISS,
    TLB_HIT_CACHE_HIT
};

class ComputeUnit : public MemObject
{
  public:
    FetchStage fetchStage;
    ScoreboardCheckStage scoreboardCheckStage;
    ScheduleStage scheduleStage;
    ExecStage execStage;
    GlobalMemPipeline globalMemoryPipe;
    LocalMemPipeline localMemoryPipe;

    // Buffers used to communicate between the various pipeline stages.

    // List of waves that are ready to be scheduled. Each execution
    // resource has its own ready list; readyList is used to communicate
    // between the scoreboardCheck and schedule stages.
    // TODO: add an enum to index readyList
    std::vector<std::vector<Wavefront*>> readyList;

    // Stores the status of waves. READY implies that the wave is ready
    // to be scheduled this cycle and is already present in the readyList.
    // waveStatusList is used to communicate between the scoreboardCheck
    // and schedule stages.
    // TODO: convert std::pair to a class to increase readability
    std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList;

    // List of waves that will be dispatched to each execution resource.
    // FILLED implies that the dispatch list is non-empty and the
    // execution unit has something to execute this cycle. Currently, the
    // dispatch list of an execution resource can hold only one wave,
    // because an execution resource can execute only one wave per cycle.
    // dispatchList is used to communicate between the schedule and exec
    // stages.
    // TODO: convert std::pair to a class to increase readability
    std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList;
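    // Note (illustrative inference, not in the original comments): both
    // readyList and dispatchList have one entry per execution resource,
    // so they are presumably indexed by the EXEC_UNIT values above
    // (SIMD0 .. LDSMEM_PIPE); ExecStage::exec() later in this commit
    // walks dispatchList exactly that way.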

    int rrNextMemID; // used by the RR WF exec policy to cycle through WFs
    int rrNextALUWp;
    typedef ComputeUnitParams Params;
    std::vector<std::vector<Wavefront*>> wfList;
    int cu_id;

    // array of vector register files, one per SIMD
    std::vector<VectorRegisterFile*> vrf;
    // number of vector ALU units (SIMDs) in the CU
    int numSIMDs;
    // number of pipe stages for bypassing data to the next dependent
    // single-precision vector instruction inside the vector ALU pipeline
    int spBypassPipeLength;
    // number of pipe stages for bypassing data to the next dependent
    // double-precision vector instruction inside the vector ALU pipeline
    int dpBypassPipeLength;
    // number of cycles per issue period
    int issuePeriod;

    // number of global and local memory execution resources in the CU
    int numGlbMemUnits;
    int numLocMemUnits;
    // tracks the last cycle a vector instruction was executed on a SIMD
    std::vector<uint64_t> lastExecCycle;

    // true if we allow a separate TLB per lane
    bool perLaneTLB;
    // if 0, TLB prefetching is off
    int prefetchDepth;
    // if fixed-stride prefetching, this is the stride
    int prefetchStride;

    class LastVaddrWave
    {
      public:
        Addr vaddrs[VSZ];
        Addr& operator[](int idx) {
            return vaddrs[idx];
        }

        LastVaddrWave() {
            for (int i = 0; i < VSZ; ++i)
                vaddrs[i] = 0;
        }
    };

    LastVaddrWave lastVaddrCU;
    std::vector<LastVaddrWave> lastVaddrPhase;
    std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
    Enums::PrefetchType prefetchType;
    EXEC_POLICY exec_policy;

    bool xact_cas_mode;
    bool debugSegFault;
    bool functionalTLB;
    bool localMemBarrier;

    /*
     * for counting page accesses
     *
     * cuExitCallback inherits from Callback. When a callback function is
     * registered as an exit callback, it is added to an exit callback
     * queue, and on simulation exit every callback in that queue has its
     * process() function called.
     */
    bool countPages;

    Shader *shader;
    uint32_t barrier_id;
    // vector of vector ALU (MACC) pipelines
    std::vector<WaitClass> aluPipe;
    // minimum issue period per SIMD unit (in cycles)
    std::vector<WaitClass> wfWait;

    // resource control for Vector Register File->Global Memory pipe buses
    std::vector<WaitClass> vrfToGlobalMemPipeBus;
    // resource control for Vector Register File->Local Memory pipe buses
    std::vector<WaitClass> vrfToLocalMemPipeBus;
    int nextGlbMemBus;
    int nextLocMemBus;
    // resource control for the global memory to VRF data/address bus
    WaitClass glbMemToVrfBus;
    // resource control for the local memory to VRF data/address bus
    WaitClass locMemToVrfBus;

    uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
    uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
    uint32_t numCyclesPerStoreTransfer; // number of cycles per vector store
    uint32_t numCyclesPerLoadTransfer; // number of cycles per vector load

    Tick req_tick_latency;
    Tick resp_tick_latency;

    // number of vector registers reserved for each SIMD unit
    std::vector<int> vectorRegsReserved;
    // number of vector registers per SIMD unit
    uint32_t numVecRegsPerSimd;
    // support for scheduling VGPR status update events
    std::vector<std::pair<uint32_t, uint32_t>> regIdxVec;
    std::vector<uint64_t> timestampVec;
    std::vector<uint8_t> statusVec;

    void
    registerEvent(uint32_t simdId,
                  uint32_t regIdx,
                  uint32_t operandSize,
                  uint64_t when,
                  uint8_t newStatus) {
        regIdxVec.push_back(std::make_pair(simdId, regIdx));
        timestampVec.push_back(when);
        statusVec.push_back(newStatus);
        if (operandSize > 4) {
            regIdxVec.push_back(std::make_pair(simdId,
                                               ((regIdx + 1) %
                                                numVecRegsPerSimd)));
            timestampVec.push_back(when);
            statusVec.push_back(newStatus);
        }
    }
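    // For example, a double-precision result (operandSize == 8) for
    // register regIdx queues two entries with the same timestamp above:
    // one for regIdx and one for (regIdx + 1) % numVecRegsPerSimd,
    // presumably because a 64-bit value occupies a pair of 32-bit vector
    // registers.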

    void updateEvents();

    // This hash map keeps track of page divergence per memory instruction
    // per wavefront. It is cleared in GPUDynInst::updateStats() in
    // gpu_dyn_inst.cc.
    std::map<Addr, int> pagesTouched;

    ComputeUnit(const Params *p);
    ~ComputeUnit();
    int spBypassLength() { return spBypassPipeLength; }
    int dpBypassLength() { return dpBypassPipeLength; }
    int storeBusLength() { return numCyclesPerStoreTransfer; }
    int loadBusLength() { return numCyclesPerLoadTransfer; }
    int wfSize() const { return wavefrontSize; }

    void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
    void exec();
    void initiateFetch(Wavefront *wavefront);
    void fetch(PacketPtr pkt, Wavefront *wavefront);
    void FillKernelState(Wavefront *w, NDRange *ndr);

    void StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
                 int trueWgSizeTotal);

    void InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
                             int trueWgSize[], int trueWgSizeTotal,
                             LdsChunk *ldsChunk, uint64_t origSpillMemStart);

    void StartWorkgroup(NDRange *ndr);
    int ReadyWorkgroup(NDRange *ndr);

    bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; }
    bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; }
    bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; }
    int GlbMemUnitId() { return GLBMEM_PIPE; }
    int ShrMemUnitId() { return LDSMEM_PIPE; }
    int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; }
    int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; }
    /* This function cycles through all the wavefronts in all the phases
     * to see whether all of the wavefronts associated with one barrier
     * (denoted by _barrier_id) have reached the same barrier in the
     * program (denoted by bcnt). It returns true when the count at the
     * barrier matches bslots.
     */
    int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots);
    bool cedeSIMD(int simdId, int wfSlotId);

    template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr gpuDynInst);
    virtual void init();
    void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
    void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
    void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
                              bool kernelLaunch=true,
                              RequestPtr req=nullptr);
    void handleMemPacket(PacketPtr pkt, int memport_index);
    bool processTimingPacket(PacketPtr pkt);
    void processFetchReturn(PacketPtr pkt);
    void updatePageDivergenceDist(Addr addr);

    MasterID masterId() { return _masterId; }

    bool isDone() const;
    bool isSimdDone(uint32_t) const;

  protected:
    MasterID _masterId;

    LdsState &lds;

  public:
    // the following stats compute the average TLB access latency per
    // uncoalesced request (only for data)
    Stats::Scalar tlbRequests;
    Stats::Scalar tlbCycles;
    Stats::Formula tlbLatency;
    // hitsPerTLBLevel[x] are the hits in the level-x TLB;
    // x == 0 is the page table
    Stats::Vector hitsPerTLBLevel;

    Stats::Scalar ldsBankAccesses;
    Stats::Distribution ldsBankConflictDist;

    // over all memory instructions executed over all wavefronts:
    // how many touched 0-4 pages, 4-8 pages, ..., 60-64 pages
    Stats::Distribution pageDivergenceDist;
    Stats::Scalar dynamicGMemInstrCnt;
    Stats::Scalar dynamicLMemInstrCnt;

    Stats::Scalar wgBlockedDueLdsAllocation;
    // Number of instructions executed: whether 64 (or 32, or 7) lanes are
    // active when an instruction is committed, this number is incremented
    // by exactly 1.
    Stats::Scalar numInstrExecuted;
    // number of cycles between successive instruction executions across
    // all wavefronts of the same CU
    Stats::Distribution execRateDist;
    // number of individual vector operations executed
    Stats::Scalar numVecOpsExecuted;
    // total cycles during which something is running on the GPU
    Stats::Scalar totalCycles;
    Stats::Formula vpc; // vector ops per cycle
    Stats::Formula ipc; // vector instructions per cycle
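    // The compute_unit.cc diff is suppressed above, but these formulas
    // are presumably defined in regStats() as
    // numVecOpsExecuted / totalCycles (vpc) and
    // numInstrExecuted / totalCycles (ipc).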
    Stats::Distribution controlFlowDivergenceDist;
    Stats::Distribution activeLanesPerGMemInstrDist;
    Stats::Distribution activeLanesPerLMemInstrDist;
    // number of vector ALU instructions received
    Stats::Formula numALUInstsExecuted;
    // number of times a WG cannot start due to a lack of free VGPRs in
    // the SIMDs
    Stats::Scalar numTimesWgBlockedDueVgprAlloc;
    Stats::Scalar numCASOps;
    Stats::Scalar numFailedCASOps;
    Stats::Scalar completedWfs;
    // flag per vector SIMD unit that is set when there is at least one
    // WV that has a vector ALU instruction as the oldest in its
    // Instruction Buffer; defined in the scoreboardCheck stage, consumed
    // by the exec stage
    std::vector<bool> vectorAluInstAvail;
    // number of available (oldest) LDS instructions that could have
    // been issued to the LDS at a specific issue slot
    int shrMemInstAvail;
    // number of available global memory instructions that could have
    // been issued to the TCP at a specific issue slot
    int glbMemInstAvail;

    void
    regStats();

    LdsState &
    getLds() const
    {
        return lds;
    }

    int32_t
    getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;

    bool
    sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result));

    typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
    pageDataStruct pageAccesses;

    class CUExitCallback : public Callback
    {
      private:
        ComputeUnit *computeUnit;

      public:
        virtual ~CUExitCallback() { }

        CUExitCallback(ComputeUnit *_cu)
        {
            computeUnit = _cu;
        }

        virtual void
        process();
    };

    CUExitCallback *cuExitCallback;

    /** Data access port **/
    class DataPort : public MasterPort
    {
      public:
        DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
            : MasterPort(_name, _cu), computeUnit(_cu),
              index(_index) { }

        bool snoopRangeSent;

        struct SenderState : public Packet::SenderState
        {
            GPUDynInstPtr _gpuDynInst;
            int port_index;
            Packet::SenderState *saved;

            SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
                        Packet::SenderState *sender_state=nullptr)
                : _gpuDynInst(gpuDynInst),
                  port_index(_port_index),
                  saved(sender_state) { }
        };

        class MemReqEvent : public Event
        {
          private:
            DataPort *dataPort;
            PacketPtr pkt;

          public:
            MemReqEvent(DataPort *_data_port, PacketPtr _pkt)
                : Event(), dataPort(_data_port), pkt(_pkt)
            {
                setFlags(Event::AutoDelete);
            }

            void process();
            const char *description() const;
        };

        class MemRespEvent : public Event
        {
          private:
            DataPort *dataPort;
            PacketPtr pkt;

          public:
            MemRespEvent(DataPort *_data_port, PacketPtr _pkt)
                : Event(), dataPort(_data_port), pkt(_pkt)
            {
                setFlags(Event::AutoDelete);
            }

            void process();
            const char *description() const;
        };

        std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;

      protected:
        ComputeUnit *computeUnit;
        int index;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();

        virtual void
        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
        {
            resp.clear();
            snoop = true;
        }
    };

    // instruction cache access port
    class SQCPort : public MasterPort
    {
      public:
        SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
            : MasterPort(_name, _cu), computeUnit(_cu),
              index(_index) { }

        bool snoopRangeSent;

        struct SenderState : public Packet::SenderState
        {
            Wavefront *wavefront;
            Packet::SenderState *saved;

            SenderState(Wavefront *_wavefront, Packet::SenderState
                        *sender_state=nullptr)
                : wavefront(_wavefront), saved(sender_state) { }
        };

        std::deque<std::pair<PacketPtr, Wavefront*>> retries;

      protected:
        ComputeUnit *computeUnit;
        int index;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();

        virtual void
        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
        {
            resp.clear();
            snoop = true;
        }
    };

    /** Data TLB port **/
    class DTLBPort : public MasterPort
    {
      public:
        DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
            : MasterPort(_name, _cu), computeUnit(_cu),
              index(_index), stalled(false)
        { }

        bool isStalled() { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the translation requests that were
         * not successfully sent
         */
        std::deque<PacketPtr> retries;

        /** SenderState is information carried along with the packet
         * throughout the TLB hierarchy
         */
        struct SenderState: public Packet::SenderState
        {
            // the memInst this sender state is associated with
            GPUDynInstPtr _gpuDynInst;

            // the lane in the memInst this is associated with, so we
            // send the memory request down the right port
            int portIndex;

            // constructor used for packets involved in timing accesses
            SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
                : _gpuDynInst(gpuDynInst), portIndex(port_index) { }
        };

      protected:
        ComputeUnit *computeUnit;
        int index;
        bool stalled;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
    };

    class ITLBPort : public MasterPort
    {
      public:
        ITLBPort(const std::string &_name, ComputeUnit *_cu)
            : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { }

        bool isStalled() { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the translation requests that were
         * not successfully sent
         */
        std::deque<PacketPtr> retries;

        /** SenderState is information carried along with the packet
         * throughout the TLB hierarchy
         */
        struct SenderState: public Packet::SenderState
        {
            // the wavefront associated with this request
            Wavefront *wavefront;

            SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
        };

      protected:
        ComputeUnit *computeUnit;
        bool stalled;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
    };

    /**
     * the port intended to communicate between the CU and its LDS
     */
    class LDSPort : public MasterPort
    {
      public:
        LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id)
            : MasterPort(_name, _cu, _id), computeUnit(_cu)
        {
        }

        bool isStalled() const { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the requests that were
         * not successfully sent
         */
        std::queue<PacketPtr> retries;

        /**
         * SenderState is information carried along with the packet,
         * especially the GPUDynInstPtr
         */
        class SenderState: public Packet::SenderState
        {
          protected:
            // the actual read/write/atomic request that goes with this
            // command
            GPUDynInstPtr _gpuDynInst = nullptr;

          public:
            SenderState(GPUDynInstPtr gpuDynInst):
                _gpuDynInst(gpuDynInst)
            {
            }

            GPUDynInstPtr
            getMemInst() const
            {
                return _gpuDynInst;
            }
        };

        virtual bool
        sendTimingReq(PacketPtr pkt);

      protected:

        bool stalled = false; ///< whether or not it is stalled

        ComputeUnit *computeUnit;

        virtual bool
        recvTimingResp(PacketPtr pkt);

        virtual Tick
        recvAtomic(PacketPtr pkt) { return 0; }

        virtual void
        recvFunctional(PacketPtr pkt)
        {
        }

        virtual void
        recvRangeChange()
        {
        }

        virtual void
        recvReqRetry();
    };

    /** The port to access the Local Data Store.
     *  Can be connected to an LDS object.
     */
    LDSPort *ldsPort = nullptr;

    LDSPort *
    getLdsPort() const
    {
        return ldsPort;
    }

    /** The memory port for SIMD data accesses.
     *  Can be connected to PhysMem or Ruby for timing simulations.
     */
    std::vector<DataPort*> memPort;
    // port to the TLB hierarchy (i.e., the L1 TLB)
    std::vector<DTLBPort*> tlbPort;
    // port to the SQC (i.e., the I-cache)
    SQCPort *sqcPort;
    // port to the SQC TLB (there is a separate TLB for each I-cache)
    ITLBPort *sqcTLBPort;

    virtual BaseMasterPort&
    getMasterPort(const std::string &if_name, PortID idx)
    {
        if (if_name == "memory_port") {
            memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx),
                                        this, idx);
            return *memPort[idx];
        } else if (if_name == "translation_port") {
            tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx),
                                        this, idx);
            return *tlbPort[idx];
        } else if (if_name == "sqc_port") {
            sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx),
                                  this, idx);
            return *sqcPort;
        } else if (if_name == "sqc_tlb_port") {
            sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this);
            return *sqcTLBPort;
        } else if (if_name == "ldsPort") {
            if (ldsPort) {
                fatal("an LDS port was already allocated");
            }
            ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx);
            return *ldsPort;
        } else {
            panic("incorrect port name");
        }
    }
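    // Note that getMasterPort() assigns into memPort[idx] and tlbPort[idx]
    // rather than growing the vectors, so both presumably must already be
    // sized to their final port counts (in the constructor) by the time
    // ports are requested.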

    // xact_cas_load()
    class waveIdentifier
    {
      public:
        waveIdentifier() { }
        waveIdentifier(int _simdId, int _wfSlotId)
            : simdId(_simdId), wfSlotId(_wfSlotId) { }

        int simdId;
        int wfSlotId;
    };

    class waveQueue
    {
      public:
        std::list<waveIdentifier> waveIDQueue;
    };
    std::map<unsigned, waveQueue> xactCasLoadMap;

    uint64_t getAndIncSeqNum() { return globalSeqNum++; }

  private:
    uint64_t globalSeqNum;
    int wavefrontSize;
};

#endif // __COMPUTE_UNIT_HH__
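A minimal usage sketch of the workgroup interface declared above. The helper
below is hypothetical (it is not part of this commit, and it assumes
compute_unit.hh is included); it only strings together the public
ReadyWorkgroup()/StartWorkgroup() calls in the order a dispatcher would:

// Hypothetical illustration: start work on `cu` only if the CU can
// currently accommodate the workgroup described by `ndr`.
bool
tryStartWorkgroup(ComputeUnit *cu, NDRange *ndr)
{
    if (!cu->ReadyWorkgroup(ndr)) {
        return false; // not enough WF slots, VGPRs, or LDS space for now
    }

    cu->StartWorkgroup(ndr); // reserves resources and starts the wavefronts
    return true;
}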
83 src/gpu-compute/condition_register_state.cc Normal file
@@ -0,0 +1,83 @@
/*
 * Copyright (c) 2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 * [remaining BSD-style license terms identical to compute_unit.hh above]
 *
 * Author: John Kalamatianos
 */

#include "gpu-compute/condition_register_state.hh"

#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"

ConditionRegisterState::ConditionRegisterState()
{
    computeUnit = nullptr;
    c_reg.clear();
    busy.clear();
}

void
ConditionRegisterState::setParent(ComputeUnit *_computeUnit)
{
    computeUnit = _computeUnit;
    _name = computeUnit->name() + ".CondRegState";
}

void
ConditionRegisterState::init(uint32_t _size)
{
    c_reg.resize(_size);
    busy.resize(_size, 0);
}

void
ConditionRegisterState::exec(GPUStaticInst *ii, Wavefront *w)
{
    // iterate over all operands
    for (auto i = 0; i < ii->getNumOperands(); ++i) {
        // is this a condition register destination operand?
        if (ii->isCondRegister(i) && ii->isDstOperand(i)) {
            // mark the register as busy
            markReg(ii->getRegisterIndex(i), 1);
            uint32_t pipeLen = w->computeUnit->spBypassLength();

            // schedule an event for marking the register as ready
            w->computeUnit->
                registerEvent(w->simdId, ii->getRegisterIndex(i),
                              ii->getOperandSize(i),
                              w->computeUnit->shader->tick_cnt +
                              w->computeUnit->shader->ticks(pipeLen), 0);
        }
    }
}
101 src/gpu-compute/condition_register_state.hh Normal file
@@ -0,0 +1,101 @@
/*
 * Copyright (c) 2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 * [remaining BSD-style license terms identical to compute_unit.hh above]
 *
 * Author: John Kalamatianos
 */

#ifndef __CONDITION_REGISTER_STATE_HH__
#define __CONDITION_REGISTER_STATE_HH__

#include <string>
#include <vector>

#include "gpu-compute/misc.hh"

class ComputeUnit;
class GPUStaticInst;
class Shader;
class Wavefront;

// Condition Register State (used only when executing HSAIL)
class ConditionRegisterState
{
  public:
    ConditionRegisterState();
    void init(uint32_t _size);
    const std::string name() const { return _name; }
    void setParent(ComputeUnit *_computeUnit);
    void regStats() { }

    template<typename T>
    T
    read(int regIdx, int threadId)
    {
        bool tmp = c_reg[regIdx][threadId];
        T *p0 = (T*)(&tmp);

        return *p0;
    }

    template<typename T>
    void
    write(int regIdx, int threadId, T value)
    {
        c_reg[regIdx][threadId] = (bool)(value & 0x01);
    }

    void
    markReg(int regIdx, uint8_t value)
    {
        busy.at(regIdx) = value;
    }

    uint8_t
    regBusy(int idx)
    {
        uint8_t status = busy.at(idx);
        return status;
    }

    int numRegs() { return c_reg.size(); }
    void exec(GPUStaticInst *ii, Wavefront *w);

  private:
    ComputeUnit* computeUnit;
    std::string _name;
    // Condition Register state
    std::vector<VectorMask> c_reg;
    // flag indicating whether a register is busy
    std::vector<uint8_t> busy;
};

#endif // __CONDITION_REGISTER_STATE_HH__
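A short usage sketch of the accessors above (the function name and indices
are hypothetical). Note that write() keeps only bit 0 of the value and
read() returns that bit reinterpreted as T:

#include <cassert>
#include "gpu-compute/condition_register_state.hh"

// Hypothetical smoke test, for illustration only.
void
conditionRegisterExample()
{
    ConditionRegisterState crs;
    crs.init(8);                        // eight condition registers

    const int lane = 0;
    crs.write<uint32_t>(0, lane, 0x3);  // only bit 0 is kept, so $c0[0] = 1
    assert(crs.read<bool>(0, lane));

    crs.markReg(0, 1);                  // busy until a scheduled event clears it
    assert(crs.regBusy(0) == 1);
}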
394 src/gpu-compute/dispatcher.cc Normal file
@@ -0,0 +1,394 @@
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 * [remaining BSD-style license terms identical to compute_unit.hh above]
 *
 * Author: Brad Beckmann, Marc Orr
 */

#include "gpu-compute/dispatcher.hh"

#include "cpu/base.hh"
#include "debug/GPUDisp.hh"
#include "gpu-compute/cl_driver.hh"
#include "gpu-compute/cl_event.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/packet_access.hh"

GpuDispatcher *GpuDispatcher::instance = nullptr;

GpuDispatcher::GpuDispatcher(const Params *p)
    : DmaDevice(p), _masterId(p->system->getMasterId(name() + ".disp")),
      pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency),
      dispatchCount(0), dispatchActive(false), cpu(p->cpu),
      shader(p->shader_pointer), driver(p->cl_driver), tickEvent(this)
{
    shader->handshake(this);
    driver->handshake(this);

    ndRange.wg_disp_rem = false;
    ndRange.globalWgId = 0;

    schedule(&tickEvent, 0);

    // translation port for the dispatcher
    // (format string fixed: the original passed "%s-port%d" with no
    // argument for the %d)
    tlbPort = new TLBPort(csprintf("%s-port", name()), this);

    num_kernelLaunched
        .name(name() + ".num_kernel_launched")
        .desc("number of kernels launched")
        ;
}

GpuDispatcher *GpuDispatcherParams::create()
{
    GpuDispatcher *dispatcher = new GpuDispatcher(this);
    GpuDispatcher::setInstance(dispatcher);

    return GpuDispatcher::getInstance();
}

void
GpuDispatcher::serialize(CheckpointOut &cp) const
{
    Tick event_tick = 0;

    if (ndRange.wg_disp_rem)
        fatal("Checkpointing not supported during active workgroup execution");

    if (tickEvent.scheduled())
        event_tick = tickEvent.when();

    SERIALIZE_SCALAR(event_tick);
}

void
GpuDispatcher::unserialize(CheckpointIn &cp)
{
    Tick event_tick;

    if (tickEvent.scheduled())
        deschedule(&tickEvent);

    UNSERIALIZE_SCALAR(event_tick);

    if (event_tick)
        schedule(&tickEvent, event_tick);
}

AddrRangeList
GpuDispatcher::getAddrRanges() const
{
    AddrRangeList ranges;

    DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n",
            pioAddr, pioSize);

    ranges.push_back(RangeSize(pioAddr, pioSize));

    return ranges;
}

Tick
GpuDispatcher::read(PacketPtr pkt)
{
    assert(pkt->getAddr() >= pioAddr);
    assert(pkt->getAddr() < pioAddr + pioSize);

    int offset = pkt->getAddr() - pioAddr;
    pkt->allocate();

    DPRINTF(GPUDisp, " read register %#x size=%d\n", offset, pkt->getSize());

    if (offset < 8) {
        assert(!offset);
        assert(pkt->getSize() == 8);

        uint64_t retval = dispatchActive;
        pkt->set(retval);
    } else {
        offset -= 8;
        assert(offset + pkt->getSize() < sizeof(HsaQueueEntry));
        char *curTaskPtr = (char*)&curTask;

        memcpy(pkt->getPtr<const void*>(), curTaskPtr + offset, pkt->getSize());
    }

    pkt->makeAtomicResponse();

    return pioDelay;
}

Tick
GpuDispatcher::write(PacketPtr pkt)
{
    assert(pkt->getAddr() >= pioAddr);
    assert(pkt->getAddr() < pioAddr + pioSize);

    int offset = pkt->getAddr() - pioAddr;

#if TRACING_ON
    uint64_t data_val = 0;

    switch (pkt->getSize()) {
      case 1:
        data_val = pkt->get<uint8_t>();
        break;
      case 2:
        data_val = pkt->get<uint16_t>();
        break;
      case 4:
        data_val = pkt->get<uint32_t>();
        break;
      case 8:
        data_val = pkt->get<uint64_t>();
        break;
      default:
        DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize());
    }

    DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset,
            data_val, pkt->getSize());
#endif

    if (!offset) {
        static int nextId = 0;

        // The depends field of the qstruct, which was previously unused,
        // is used to communicate with the simulated application.
        if (curTask.depends) {
            HostState hs;
            shader->ReadMem((uint64_t)(curTask.depends), &hs,
                            sizeof(HostState), 0);

            // update the event start time (in nanoseconds)
            uint64_t start = curTick() / 1000;

            shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start),
                             &start, sizeof(uint64_t), 0);
        }

        // launch the kernel
        ++num_kernelLaunched;

        NDRange *ndr = &(ndRangeMap[nextId]);
        // copy the dispatch info
        ndr->q = curTask;

        // update the numDispLeft counter polled by the runtime
        accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1);

        ndr->numWgTotal = 1;

        for (int i = 0; i < 3; ++i) {
            ndr->wgId[i] = 0;
            ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]);
            ndr->numWgTotal *= ndr->numWg[i];
        }

        ndr->numWgCompleted = 0;
        ndr->globalWgId = 0;
        ndr->wg_disp_rem = true;
        ndr->execDone = false;
        ndr->addrToNotify = (volatile bool*)curTask.addrToNotify;
        ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft;
        ndr->dispatchId = nextId;
        ndr->curTid = pkt->req->threadId();
        DPRINTF(GPUDisp, "launching kernel %d\n", nextId);
        execIds.push(nextId);
        ++nextId;

        dispatchActive = true;

        if (!tickEvent.scheduled()) {
            schedule(&tickEvent, curTick() + shader->ticks(1));
        }
    } else {
        // populate the current task struct; the first 64 bits are the
        // launch register
        offset -= 8;
        assert(offset < sizeof(HsaQueueEntry));
        char *curTaskPtr = (char*)&curTask;
        memcpy(curTaskPtr + offset, pkt->getPtr<const void*>(), pkt->getSize());
    }

    pkt->makeAtomicResponse();

    return pioDelay;
}
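// The register map implied by read() and write() above (offsets relative to
// pioAddr): bytes 0-7 act as a launch/status doorbell, where any write
// latches curTask and enqueues a new kernel, and an 8-byte read returns
// dispatchActive; offsets 8 and up map byte-for-byte onto the HsaQueueEntry
// task descriptor held in curTask.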
|
||||||
|
|
||||||
|
|
||||||
|
BaseMasterPort&
|
||||||
|
GpuDispatcher::getMasterPort(const std::string &if_name, PortID idx)
|
||||||
|
{
|
||||||
|
if (if_name == "translation_port") {
|
||||||
|
return *tlbPort;
|
||||||
|
}
|
||||||
|
|
||||||
|
return DmaDevice::getMasterPort(if_name, idx);
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
GpuDispatcher::exec()
|
||||||
|
{
|
||||||
|
int fail_count = 0;
|
||||||
|
|
||||||
|
// There are potentially multiple outstanding kernel launches.
|
||||||
|
// It is possible that the workgroups in a different kernel
|
||||||
|
// can fit on the GPU even if another kernel's workgroups cannot
|
||||||
|
DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size());
|
||||||
|
|
||||||
|
while (execIds.size() > fail_count) {
|
||||||
|
int execId = execIds.front();
|
||||||
|
|
||||||
|
while (ndRangeMap[execId].wg_disp_rem) {
|
||||||
|
//update the thread context
|
||||||
|
shader->updateThreadContext(ndRangeMap[execId].curTid);
|
||||||
|
|
||||||
|
// attempt to dispatch_workgroup
|
||||||
|
if (!shader->dispatch_workgroups(&ndRangeMap[execId])) {
|
||||||
|
// if we failed try the next kernel,
|
||||||
|
// it may have smaller workgroups.
|
||||||
|
// put it on the queue to rety latter
|
||||||
|
DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId);
|
||||||
|
execIds.push(execId);
|
||||||
|
++fail_count;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// let's try the next kernel_id
|
||||||
|
execIds.pop();
|
||||||
|
}
|
||||||
|
|
||||||
|
DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size());
|
||||||
|
|
||||||
|
if (doneIds.size() && cpu) {
|
||||||
|
shader->hostWakeUp(cpu);
|
||||||
|
}
|
||||||
|
|
||||||
|
while (doneIds.size()) {
|
||||||
|
// wakeup the CPU if any Kernels completed this cycle
|
||||||
|
DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front());
|
||||||
|
doneIds.pop();
|
||||||
|
}
|
||||||
|
}
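// Note on the loop above: a kernel that fails to dispatch is re-queued at
// the back of execIds and counted in fail_count, so the outer loop
// terminates once every remaining kernel has failed once this tick
// (execIds.size() == fail_count); the re-queued kernels are retried on a
// later tick.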
|
||||||
|
|
||||||
|
void
|
||||||
|
GpuDispatcher::notifyWgCompl(Wavefront *w)
|
||||||
|
{
|
||||||
|
int kern_id = w->kern_id;
|
||||||
|
DPRINTF(GPUDisp, "notify WgCompl %d\n",kern_id);
|
||||||
|
assert(ndRangeMap[kern_id].dispatchId == kern_id);
|
||||||
|
ndRangeMap[kern_id].numWgCompleted++;
|
||||||
|
|
||||||
|
if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) {
|
||||||
|
ndRangeMap[kern_id].execDone = true;
|
||||||
|
doneIds.push(kern_id);
|
||||||
|
|
||||||
|
if (ndRangeMap[kern_id].addrToNotify) {
|
||||||
|
accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify), 1,
|
||||||
|
0);
|
||||||
|
}
|
||||||
|
|
||||||
|
accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1);
|
||||||
|
|
||||||
|
// update event end time (in nano-seconds)
|
||||||
|
if (ndRangeMap[kern_id].q.depends) {
|
||||||
|
HostState *host_state = (HostState*)ndRangeMap[kern_id].q.depends;
|
||||||
|
uint64_t event;
|
||||||
|
shader->ReadMem((uint64_t)(&host_state->event), &event,
|
||||||
|
sizeof(uint64_t), 0);
|
||||||
|
|
||||||
|
uint64_t end = curTick() / 1000;
|
||||||
|
|
||||||
|
shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end,
|
||||||
|
sizeof(uint64_t), 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!tickEvent.scheduled()) {
|
||||||
|
schedule(&tickEvent, curTick() + shader->ticks(1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
GpuDispatcher::scheduleDispatch()
|
||||||
|
{
|
||||||
|
if (!tickEvent.scheduled())
|
||||||
|
schedule(&tickEvent, curTick() + shader->ticks(1));
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off)
|
||||||
|
{
|
||||||
|
if (cpu) {
|
||||||
|
if (off) {
|
||||||
|
shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq,
|
||||||
|
true);
|
||||||
|
val += off;
|
||||||
|
}
|
||||||
|
|
||||||
|
shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true);
|
||||||
|
} else {
|
||||||
|
panic("Cannot find host");
|
||||||
|
}
|
||||||
|
}
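// From the call sites in this file: accessUserVar(cpu, addr, 1, 0) stores
// the literal value 1 at addr, while accessUserVar(cpu, addr, 0, -1)
// performs a read-modify-write that decrements the integer at addr by one
// (and accessUserVar(cpu, addr, 0, 1) likewise increments it).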
|
||||||
|
|
||||||
|
GpuDispatcher::TickEvent::TickEvent(GpuDispatcher *_dispatcher)
|
||||||
|
: Event(CPU_Tick_Pri), dispatcher(_dispatcher)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
GpuDispatcher::TickEvent::process()
|
||||||
|
{
|
||||||
|
dispatcher->exec();
|
||||||
|
}
|
||||||
|
|
||||||
|
const char*
|
||||||
|
GpuDispatcher::TickEvent::description() const
|
||||||
|
{
|
||||||
|
return "GPU Dispatcher tick";
|
||||||
|
}
|
||||||
|
|
||||||
|
// helper functions for driver to retrieve GPU attributes
|
||||||
|
int
|
||||||
|
GpuDispatcher::getNumCUs()
|
||||||
|
{
|
||||||
|
return shader->cuList.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
GpuDispatcher::setFuncargsSize(int funcargs_size)
|
||||||
|
{
|
||||||
|
shader->funcargs_size = funcargs_size;
|
||||||
|
}
|
163 src/gpu-compute/dispatcher.hh Normal file
@@ -0,0 +1,163 @@
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 * [remaining BSD-style license terms identical to compute_unit.hh above]
 *
 * Author: Brad Beckmann, Marc Orr
 */

#ifndef __GPU_DISPATCHER_HH__
#define __GPU_DISPATCHER_HH__

// <unordered_map> added here because ndRangeMap and TranslationBuffer
// below need it
#include <queue>
#include <unordered_map>
#include <vector>

#include "base/statistics.hh"
#include "dev/dma_device.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/ndrange.hh"
#include "gpu-compute/qstruct.hh"
#include "mem/port.hh"
#include "params/GpuDispatcher.hh"

class BaseCPU;
class ClDriver;
class Shader;

class GpuDispatcher : public DmaDevice
{
  public:
    typedef GpuDispatcherParams Params;

    class TickEvent : public Event
    {
      private:
        GpuDispatcher *dispatcher;

      public:
        TickEvent(GpuDispatcher *);
        void process();
        const char *description() const;
    };

    MasterID masterId() { return _masterId; }

  protected:
    MasterID _masterId;

    // base and length of the PIO register space
    Addr pioAddr;
    Addr pioSize;
    Tick pioDelay;

    HsaQueueEntry curTask;

    std::unordered_map<int, NDRange> ndRangeMap;
    NDRange ndRange;

    // list of kernel_ids to launch
    std::queue<int> execIds;
    // list of kernel_ids that have finished
    std::queue<int> doneIds;

    uint64_t dispatchCount;
    // is there a kernel in execution?
    bool dispatchActive;

    BaseCPU *cpu;
    Shader *shader;
    ClDriver *driver;
    TickEvent tickEvent;

    static GpuDispatcher *instance;

    // Syscall emulation mode can have only one application running(?);
    // otherwise we would have to do some pid-based tagging.
    // Currently unused.
    typedef std::unordered_map<uint64_t, uint64_t> TranslationBuffer;
    TranslationBuffer tlb;

  public:
    /* statistics */
    Stats::Scalar num_kernelLaunched;
    GpuDispatcher(const Params *p);

    ~GpuDispatcher() { }

    void exec();
    virtual void serialize(CheckpointOut &cp) const;
    virtual void unserialize(CheckpointIn &cp);
    void notifyWgCompl(Wavefront *w);
    void scheduleDispatch();
    void accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off);

    // We use a singleton so that glue code can pass pointer locations to
    // the dispatcher. When there are multiple dispatchers, we can call
    // something like getInstance(index).
    static void
    setInstance(GpuDispatcher *_instance)
    {
        instance = _instance;
    }

    static GpuDispatcher* getInstance() { return instance; }

    class TLBPort : public MasterPort
    {
      public:

        TLBPort(const std::string &_name, GpuDispatcher *_dispatcher)
            : MasterPort(_name, _dispatcher), dispatcher(_dispatcher) { }

      protected:
        GpuDispatcher *dispatcher;

        virtual bool recvTimingResp(PacketPtr pkt) { return true; }
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry() { }
    };

    TLBPort *tlbPort;

    virtual BaseMasterPort& getMasterPort(const std::string &if_name,
                                          PortID idx);

    AddrRangeList getAddrRanges() const;
    Tick read(PacketPtr pkt);
    Tick write(PacketPtr pkt);

    // helper functions to retrieve/set GPU attributes
    int getNumCUs();
    void setFuncargsSize(int funcargs_size);
};

#endif // __GPU_DISPATCHER_HH__
203 src/gpu-compute/exec_stage.cc Normal file
@@ -0,0 +1,203 @@
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 * [remaining BSD-style license terms identical to compute_unit.hh above]
 *
 * Author: John Kalamatianos, Sooraj Puthoor
 */

#include "gpu-compute/exec_stage.hh"

#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/wavefront.hh"

ExecStage::ExecStage(const ComputeUnitParams *p) : numSIMDs(p->num_SIMDs),
    numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes),
    vectorAluInstAvail(nullptr), glbMemInstAvail(nullptr),
    shrMemInstAvail(nullptr), lastTimeInstExecuted(false),
    thisTimeInstExecuted(false), instrExecuted(false),
    executionResourcesUsed(0)
{
    numTransActiveIdle = 0;
    idle_dur = 0;
}

void
ExecStage::init(ComputeUnit *cu)
{
    computeUnit = cu;
    _name = computeUnit->name() + ".ExecStage";
    dispatchList = &computeUnit->dispatchList;
    vectorAluInstAvail = &(computeUnit->vectorAluInstAvail);
    glbMemInstAvail = &(computeUnit->glbMemInstAvail);
    shrMemInstAvail = &(computeUnit->shrMemInstAvail);
    idle_dur = 0;
}

void
ExecStage::collectStatistics(enum STAT_STATUS stage, int unitId)
{
    if (stage == IdleExec) {
        // count cycles in which no vector ALU instruction was executed
        // even though one was the oldest in a WV of that vector SIMD unit
        if (computeUnit->isVecAlu(unitId) && vectorAluInstAvail->at(unitId)) {
            numCyclesWithNoInstrTypeIssued[unitId]++;
        }

        // count cycles in which no global memory (vector) instruction
        // was executed even though one was the oldest in a WV of that unit
        if (computeUnit->isGlbMem(unitId) && *glbMemInstAvail > 0) {
            numCyclesWithNoInstrTypeIssued[unitId]++;
            (*glbMemInstAvail)--;
        }

        // count cycles in which no shared memory (vector) instruction
        // was executed even though one was the oldest in a WV of that unit
        if (computeUnit->isShrMem(unitId) && *shrMemInstAvail > 0) {
            numCyclesWithNoInstrTypeIssued[unitId]++;
            (*shrMemInstAvail)--;
        }
    } else if (stage == BusyExec) {
        // count the number of cycles in which an instruction was issued
        // to this specific unit
        numCyclesWithInstrTypeIssued[unitId]++;
        thisTimeInstExecuted = true;
        instrExecuted = true;
        ++executionResourcesUsed;
    } else if (stage == PostExec) {
        // count the number of transitions from active to idle
        if (lastTimeInstExecuted && !thisTimeInstExecuted) {
            ++numTransActiveIdle;
        }

        if (!lastTimeInstExecuted && thisTimeInstExecuted) {
            idleDur.sample(idle_dur);
            idle_dur = 0;
        } else if (!thisTimeInstExecuted) {
            idle_dur++;
        }

        lastTimeInstExecuted = thisTimeInstExecuted;
        // track the number of cycles in which we either issued at least
        // one vector instruction or issued no instructions at all
        if (instrExecuted) {
            numCyclesWithInstrIssued++;
        } else {
            numCyclesWithNoIssue++;
        }

        spc.sample(executionResourcesUsed);
    }
}
||||||
|
|
||||||
|
void
|
||||||
|
ExecStage::initStatistics()
|
||||||
|
{
|
||||||
|
instrExecuted = false;
|
||||||
|
executionResourcesUsed = 0;
|
||||||
|
thisTimeInstExecuted = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
ExecStage::exec()
|
||||||
|
{
|
||||||
|
initStatistics();
|
||||||
|
|
||||||
|
for (int unitId = 0; unitId < (numSIMDs + numMemUnits); ++unitId) {
|
||||||
|
// if dispatch list for this execution resource is empty,
|
||||||
|
// skip this execution resource this cycle
|
||||||
|
if (dispatchList->at(unitId).second == EMPTY) {
|
||||||
|
collectStatistics(IdleExec, unitId);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
collectStatistics(BusyExec, unitId);
|
||||||
|
// execute an instruction for the WF
|
||||||
|
dispatchList->at(unitId).first->exec();
|
||||||
|
// clear the dispatch list entry
|
||||||
|
dispatchList->at(unitId).second = EMPTY;
|
||||||
|
dispatchList->at(unitId).first = (Wavefront*)nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
collectStatistics(PostExec, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
ExecStage::regStats()
|
||||||
|
{
|
||||||
|
numTransActiveIdle
|
||||||
|
.name(name() + ".num_transitions_active_to_idle")
|
||||||
|
.desc("number of CU transitions from active to idle")
|
||||||
|
;
|
||||||
|
|
||||||
|
numCyclesWithNoIssue
|
||||||
|
.name(name() + ".num_cycles_with_no_issue")
|
||||||
|
.desc("number of cycles the CU issues nothing")
|
||||||
|
;
|
||||||
|
|
||||||
|
numCyclesWithInstrIssued
|
||||||
|
.name(name() + ".num_cycles_with_instr_issued")
|
||||||
|
.desc("number of cycles the CU issued at least one instruction")
|
||||||
|
;
|
||||||
|
|
||||||
|
spc
|
||||||
|
.init(0, numSIMDs + numMemUnits, 1)
|
||||||
|
.name(name() + ".spc")
|
||||||
|
.desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)")
|
||||||
|
;
|
||||||
|
|
||||||
|
idleDur
|
||||||
|
.init(0,75,5)
|
||||||
|
.name(name() + ".idle_duration_in_cycles")
|
||||||
|
.desc("duration of idle periods in cycles")
|
||||||
|
;
|
||||||
|
|
||||||
|
numCyclesWithInstrTypeIssued
|
||||||
|
.init(numSIMDs + numMemUnits)
|
||||||
|
.name(name() + ".num_cycles_with_instrtype_issue")
|
||||||
|
.desc("Number of cycles at least one instruction of specific type "
|
||||||
|
"issued")
|
||||||
|
;
|
||||||
|
|
||||||
|
numCyclesWithNoInstrTypeIssued
|
||||||
|
.init(numSIMDs + numMemUnits)
|
||||||
|
.name(name() + ".num_cycles_with_instr_type_no_issue")
|
||||||
|
.desc("Number of cycles no instruction of specific type issued")
|
||||||
|
;
|
||||||
|
|
||||||
|
for (int i = 0; i < numSIMDs; ++i) {
|
||||||
|
numCyclesWithInstrTypeIssued.subname(i, csprintf("ALU%d",i));
|
||||||
|
numCyclesWithNoInstrTypeIssued.subname(i, csprintf("ALU%d",i));
|
||||||
|
}
|
||||||
|
|
||||||
|
numCyclesWithInstrTypeIssued.subname(numSIMDs, csprintf("GM"));
|
||||||
|
numCyclesWithNoInstrTypeIssued.subname(numSIMDs, csprintf("GM"));
|
||||||
|
numCyclesWithInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM"));
|
||||||
|
numCyclesWithNoInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM"));
|
||||||
|
}
|
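The exec() loop above treats dispatchList as a one-slot mailbox per execution resource: the schedule stage fills a (Wavefront*, DISPATCH_STATUS) pair, and the exec stage drains it and hands the slot back as EMPTY. A minimal standalone sketch of that handshake, with hypothetical names (not part of the commit):

// ---- illustrative sketch, not part of the commit ----
#include <cstdio>
#include <utility>
#include <vector>

enum SlotStatus { SLOT_EMPTY = 0, SLOT_FILLED };

struct Wave { int id; void exec() { std::printf("wave %d executes\n", id); } };

int main()
{
    // one slot per execution resource (e.g., 2 SIMDs + 2 memory pipes)
    std::vector<std::pair<Wave*, SlotStatus>> dispatchList(4, {nullptr, SLOT_EMPTY});
    Wave w0{0}, w2{2};
    dispatchList[0] = {&w0, SLOT_FILLED};   // schedule stage fills slots
    dispatchList[2] = {&w2, SLOT_FILLED};

    for (auto &slot : dispatchList) {       // exec stage drains them
        if (slot.second == SLOT_EMPTY)
            continue;                       // idle resource this cycle
        slot.first->exec();
        slot = {nullptr, SLOT_EMPTY};       // hand the slot back to schedule
    }
    return 0;
}
// ---- end sketch ----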
129
src/gpu-compute/exec_stage.hh
Normal file
@@ -0,0 +1,129 @@
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 * (BSD-style conditions and disclaimer as in exec_stage.cc above)
 *
 * Author: John Kalamatianos, Sooraj Puthoor
 */

#ifndef __EXEC_STAGE_HH__
#define __EXEC_STAGE_HH__

#include <string>
#include <utility>
#include <vector>

#include "sim/stats.hh"

class ComputeUnit;
class Wavefront;
struct ComputeUnitParams;

enum STAT_STATUS
{
    IdleExec,
    BusyExec,
    PostExec
};

enum DISPATCH_STATUS
{
    EMPTY = 0,
    FILLED
};

// Execution stage.
// Each execution resource executes the wave that is in its dispatch
// list. The schedule stage is responsible for adding a wave into each
// execution resource's dispatch list.

class ExecStage
{
  public:
    ExecStage(const ComputeUnitParams* params);
    ~ExecStage() { }
    void init(ComputeUnit *cu);
    void exec();

    std::string name() { return _name; }
    void regStats();
    // number of idle cycles
    Stats::Scalar numCyclesWithNoIssue;
    // number of busy cycles
    Stats::Scalar numCyclesWithInstrIssued;
    // number of cycles (per execution unit) during which at least one
    // instruction was issued to that unit
    Stats::Vector numCyclesWithInstrTypeIssued;
    // number of idle cycles (per execution unit) during which the unit
    // issued no instruction targeting that unit, even though there is at
    // least one Wavefront with such an instruction as the oldest
    Stats::Vector numCyclesWithNoInstrTypeIssued;
    // SIMDs active per cycle
    Stats::Distribution spc;

  private:
    void collectStatistics(enum STAT_STATUS stage, int unitId);
    void initStatistics();
    ComputeUnit *computeUnit;
    uint32_t numSIMDs;

    // Number of memory execution resources;
    // both global and local memory execution resources in CU
    uint32_t numMemUnits;

    // List of waves which will be dispatched to each execution resource.
    // FILLED implies the dispatch list is non-empty and the execution
    // unit has something to execute this cycle. Currently, the dispatch
    // list of an execution resource can hold only one wave, because an
    // execution resource can execute only one wave per cycle.
    // dispatchList is used to communicate between the schedule and exec
    // stages.
    std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList;
    // flag per vector SIMD unit that is set when there is at least one
    // WV that has a vector ALU instruction as the oldest in its
    // Instruction Buffer
    std::vector<bool> *vectorAluInstAvail;
    int *glbMemInstAvail;
    int *shrMemInstAvail;
    bool lastTimeInstExecuted;
    bool thisTimeInstExecuted;
    bool instrExecuted;
    Stats::Scalar numTransActiveIdle;
    Stats::Distribution idleDur;
    uint32_t executionResourcesUsed;
    uint64_t idle_dur;
    std::string _name;
};

#endif // __EXEC_STAGE_HH__
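The lastTimeInstExecuted/thisTimeInstExecuted pair declared above implements edge detection in collectStatistics()'s PostExec phase: an idle-to-active edge samples the accumulated idle duration into the idleDur histogram, and each further idle cycle extends the running count. A toy trace of that logic, as a standalone sketch with hypothetical driver values:

// ---- illustrative sketch, not part of the commit ----
#include <cstdio>

int main()
{
    bool last = false;                      // lastTimeInstExecuted
    unsigned idleDur = 0;                   // running idle-period length
    const bool issuedThisCycle[8] = {false, false, true, true,
                                     false, false, false, true};

    for (bool now : issuedThisCycle) {
        if (!last && now) {                 // idle -> active edge: sample
            std::printf("idle period of %u cycles ended\n", idleDur);
            idleDur = 0;
        } else if (!now) {                  // still idle: extend the period
            ++idleDur;
        }
        last = now;                         // PostExec bookkeeping
    }
    return 0;                               // prints periods of 2 and 3 cycles
}
// ---- end sketch ----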
106
src/gpu-compute/fetch_stage.cc
Normal file
@@ -0,0 +1,106 @@
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 * (BSD-style conditions and disclaimer as in exec_stage.cc above)
 *
 * Author: Anthony Gutierrez, Sooraj Puthoor
 */

#include "gpu-compute/fetch_stage.hh"

#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/wavefront.hh"

FetchStage::FetchStage(const ComputeUnitParams* p) : numSIMDs(p->num_SIMDs),
    computeUnit(nullptr)
{
    for (int j = 0; j < numSIMDs; ++j) {
        FetchUnit newFetchUnit(p);
        fetchUnit.push_back(newFetchUnit);
    }
}

FetchStage::~FetchStage()
{
    fetchUnit.clear();
}

void
FetchStage::init(ComputeUnit *cu)
{
    computeUnit = cu;
    _name = computeUnit->name() + ".FetchStage";

    for (int j = 0; j < numSIMDs; ++j) {
        fetchUnit[j].bindWaveList(&computeUnit->wfList[j]);
        fetchUnit[j].init(computeUnit);
    }
}

void
FetchStage::exec()
{
    for (int j = 0; j < numSIMDs; ++j) {
        fetchUnit[j].exec();
    }
}

void
FetchStage::processFetchReturn(PacketPtr pkt)
{
    ComputeUnit::SQCPort::SenderState *sender_state =
        safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->senderState);

    Wavefront *wavefront = sender_state->wavefront;

    const unsigned num_instructions = pkt->req->getSize() /
        sizeof(TheGpuISA::RawMachInst);

    instFetchInstReturned.sample(num_instructions);
    uint32_t simdId = wavefront->simdId;
    fetchUnit[simdId].processFetchReturn(pkt);
}

void
FetchStage::fetch(PacketPtr pkt, Wavefront *wavefront)
{
    fetchUnit[wavefront->simdId].fetch(pkt, wavefront);
}

void
FetchStage::regStats()
{
    instFetchInstReturned
        .init(1, 32, 1)
        .name(name() + ".inst_fetch_instr_returned")
        .desc("For each instruction fetch request received, record how many "
              "instructions were returned by it")
        ;
}
78
src/gpu-compute/fetch_stage.hh
Normal file
@@ -0,0 +1,78 @@
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 * (BSD-style conditions and disclaimer as in exec_stage.cc above)
 *
 * Author: Anthony Gutierrez, Sooraj Puthoor
 */

#ifndef __FETCH_STAGE_HH__
#define __FETCH_STAGE_HH__

#include <string>
#include <vector>

#include "gpu-compute/fetch_unit.hh"

// Instruction fetch stage.
// All dispatched wavefronts of all SIMDs are analyzed for the need to
// fetch instructions. From the fetch-eligible waves, one wave is selected
// from each SIMD and fetch is initiated for the selected waves.

class ComputeUnit;
class Wavefront;

class FetchStage
{
  public:
    FetchStage(const ComputeUnitParams* params);
    ~FetchStage();
    void init(ComputeUnit *cu);
    void exec();
    void processFetchReturn(PacketPtr pkt);
    void fetch(PacketPtr pkt, Wavefront *wave);

    // Stats related variables and methods
    std::string name() { return _name; }
    void regStats();
    Stats::Distribution instFetchInstReturned;

  private:
    uint32_t numSIMDs;
    ComputeUnit *computeUnit;

    // List of fetch units. A fetch unit is instantiated per SIMD
    std::vector<FetchUnit> fetchUnit;
    std::string _name;
};

#endif // __FETCH_STAGE_HH__
293
src/gpu-compute/fetch_unit.cc
Normal file
@@ -0,0 +1,293 @@
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 * (BSD-style conditions and disclaimer as in exec_stage.cc above)
 *
 * Author: Brad Beckmann, Sooraj Puthoor
 */

#include "gpu-compute/fetch_unit.hh"

#include "debug/GPUFetch.hh"
#include "debug/GPUPort.hh"
#include "debug/GPUTLB.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/ruby/system/RubySystem.hh"

uint32_t FetchUnit::globalFetchUnitID;

FetchUnit::FetchUnit(const ComputeUnitParams* params) :
    timingSim(true),
    computeUnit(nullptr),
    fetchScheduler(params),
    waveList(nullptr)
{
}

FetchUnit::~FetchUnit()
{
    fetchQueue.clear();
    fetchStatusQueue.clear();
}

void
FetchUnit::init(ComputeUnit *cu)
{
    computeUnit = cu;
    timingSim = computeUnit->shader->timingSim;
    fetchQueue.clear();
    fetchStatusQueue.resize(computeUnit->shader->n_wf);

    for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
        fetchStatusQueue[j] = std::make_pair(waveList->at(j), false);
    }

    fetchScheduler.bindList(&fetchQueue);
}

void
FetchUnit::exec()
{
    // re-evaluate waves which are marked as not ready for fetch
    for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
        // The following code assumes 64-bit operation and that all insts
        // are represented by 64-bit pointers to inst objects.
        Wavefront *curWave = fetchStatusQueue[j].first;
        assert(curWave);

        // The wavefront has to be active, the IB occupancy has to be
        // 4 or fewer instructions, and it cannot have any branches, to
        // prevent speculative instruction fetches
        if (!fetchStatusQueue[j].second) {
            if (curWave->status == Wavefront::S_RUNNING &&
                curWave->instructionBuffer.size() <= 4 &&
                !curWave->instructionBufferHasBranch() &&
                !curWave->pendingFetch) {
                fetchQueue.push_back(curWave);
                fetchStatusQueue[j].second = true;
            }
        }
    }

    // Fetch only if there is some wave ready to be fetched.
    // An empty fetchQueue will cause the scheduler to panic.
    if (fetchQueue.size()) {
        Wavefront *waveToBeFetched = fetchScheduler.chooseWave();
        waveToBeFetched->pendingFetch = true;
        fetchStatusQueue[waveToBeFetched->wfSlotId].second = false;
        initiateFetch(waveToBeFetched);
    }
}

void
FetchUnit::initiateFetch(Wavefront *wavefront)
{
    // calculate the virtual address to fetch from the SQC
    Addr vaddr = wavefront->pc() + wavefront->instructionBuffer.size();
    vaddr = wavefront->base_ptr + vaddr * sizeof(GPUStaticInst*);

    DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n",
            computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr);

    // Since this is an instruction prefetch, if you're split then just finish
    // out the current line.
    unsigned block_size = RubySystem::getBlockSizeBytes();
    // check for split accesses
    Addr split_addr = roundDown(vaddr + block_size - 1, block_size);
    unsigned size = block_size;

    if (split_addr > vaddr) {
        // misaligned access, just grab the rest of the line
        size = split_addr - vaddr;
    }

    // set up virtual request
    Request *req = new Request(0, vaddr, size, Request::INST_FETCH,
                               computeUnit->masterId(), 0, 0, 0);

    PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
    // This fetchBlock is kind of faux right now - because the translations so
    // far don't actually return Data
    uint64_t fetchBlock;
    pkt->dataStatic(&fetchBlock);

    if (timingSim) {
        // SenderState needed on Return
        pkt->senderState = new ComputeUnit::ITLBPort::SenderState(wavefront);

        // Sender State needed by TLB hierarchy
        pkt->senderState =
            new TheISA::GpuTLB::TranslationState(BaseTLB::Execute,
                                                 computeUnit->shader->gpuTc,
                                                 false, pkt->senderState);

        if (computeUnit->sqcTLBPort->isStalled()) {
            assert(computeUnit->sqcTLBPort->retries.size() > 0);

            DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
                    vaddr);

            computeUnit->sqcTLBPort->retries.push_back(pkt);
        } else if (!computeUnit->sqcTLBPort->sendTimingReq(pkt)) {
            // Stall the data port;
            // no more packets are issued until ruby indicates that
            // resources are freed by a recvReqRetry() callback on
            // this port.
            computeUnit->sqcTLBPort->stallPort();

            DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
                    vaddr);

            computeUnit->sqcTLBPort->retries.push_back(pkt);
        } else {
            DPRINTF(GPUTLB, "sent FETCH translation request for %#x\n", vaddr);
        }
    } else {
        pkt->senderState =
            new TheISA::GpuTLB::TranslationState(BaseTLB::Execute,
                                                 computeUnit->shader->gpuTc);

        computeUnit->sqcTLBPort->sendFunctional(pkt);

        TheISA::GpuTLB::TranslationState *sender_state =
            safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

        delete sender_state->tlbEntry;
        delete sender_state;
        // fetch the instructions from the SQC when we operate in
        // functional mode only
        fetch(pkt, wavefront);
    }
}

void
FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront)
{
    assert(pkt->req->hasPaddr());
    assert(pkt->req->hasSize());

    DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch Access: %#x\n",
            computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
            pkt->req->getPaddr());

    // this is necessary because the GPU TLB receives packets instead of
    // requests. when the translation is complete, all relevant fields in the
    // request will be populated, but not in the packet. here we create the
    // new packet so we can set the size, addr, and proper flags.
    PacketPtr oldPkt = pkt;
    pkt = new Packet(oldPkt->req, oldPkt->cmd);
    delete oldPkt;

    TheGpuISA::RawMachInst *data =
        new TheGpuISA::RawMachInst[pkt->req->getSize() /
        sizeof(TheGpuISA::RawMachInst)];

    pkt->dataDynamic<TheGpuISA::RawMachInst>(data);

    // New SenderState for the memory access
    pkt->senderState = new ComputeUnit::SQCPort::SenderState(wavefront);

    if (timingSim) {
        // translation is done. Send the appropriate timing memory request.

        if (!computeUnit->sqcPort->sendTimingReq(pkt)) {
            computeUnit->sqcPort->retries.push_back(std::make_pair(pkt,
                                                                   wavefront));

            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x failed!\n",
                    computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
                    pkt->req->getPaddr());
        } else {
            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x sent!\n",
                    computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
                    pkt->req->getPaddr());
        }
    } else {
        computeUnit->sqcPort->sendFunctional(pkt);
        processFetchReturn(pkt);
    }
}

void
FetchUnit::processFetchReturn(PacketPtr pkt)
{
    ComputeUnit::SQCPort::SenderState *sender_state =
        safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->senderState);

    Wavefront *wavefront = sender_state->wavefront;

    DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned "
            "%d bytes, %d instructions!\n", computeUnit->cu_id,
            wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr(),
            pkt->req->getSize(), pkt->req->getSize() /
            sizeof(TheGpuISA::RawMachInst));

    if (wavefront->dropFetch) {
        assert(wavefront->instructionBuffer.empty());
        wavefront->dropFetch = false;
    } else {
        TheGpuISA::RawMachInst *inst_index_ptr =
            (TheGpuISA::RawMachInst*)pkt->getPtr<uint8_t>();

        assert(wavefront->instructionBuffer.size() <= 4);

        for (int i = 0; i < pkt->req->getSize() /
             sizeof(TheGpuISA::RawMachInst); ++i) {
            GPUStaticInst *inst_ptr = decoder.decode(inst_index_ptr[i]);

            assert(inst_ptr);
            DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: added %s\n",
                    computeUnit->cu_id, wavefront->simdId,
                    wavefront->wfSlotId, inst_ptr->disassemble());

            GPUDynInstPtr gpuDynInst =
                std::make_shared<GPUDynInst>(computeUnit, wavefront, inst_ptr,
                                             computeUnit->getAndIncSeqNum());

            wavefront->instructionBuffer.push_back(gpuDynInst);
        }
    }

    wavefront->pendingFetch = false;

    delete pkt->senderState;
    delete pkt->req;
    delete pkt;
}

void
FetchUnit::bindWaveList(std::vector<Wavefront*> *wave_list)
{
    waveList = wave_list;
}
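initiateFetch() above clips a fetch that would straddle a cache line: the start of the next line is roundDown(vaddr + block_size - 1, block_size), and the request shrinks to the distance up to that boundary. A worked example with a 64-byte line, using a standalone stand-in for gem5's roundDown (a sketch, not part of the commit):

// ---- illustrative sketch, not part of the commit ----
#include <cassert>
#include <cstdint>

static uint64_t roundDown(uint64_t val, uint64_t align) { return val - (val % align); }

int main()
{
    const unsigned block_size = 64;
    uint64_t vaddr = 0x1038;                    // 56 bytes into the 0x1000 line
    uint64_t split_addr = roundDown(vaddr + block_size - 1, block_size); // 0x1040
    unsigned size = block_size;
    if (split_addr > vaddr)
        size = split_addr - vaddr;              // 0x1040 - 0x1038 = 8 bytes
    assert(size == 8);                          // finish out the current line only
    return 0;
}
// ---- end sketch ----

An aligned vaddr (say 0x1040) gives split_addr == vaddr, so the guard fails and the fetch keeps the full block_size.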
89
src/gpu-compute/fetch_unit.hh
Normal file
@@ -0,0 +1,89 @@
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 * (BSD-style conditions and disclaimer as in exec_stage.cc above)
 *
 * Author: Brad Beckmann, Sooraj Puthoor
 */

#ifndef __FETCH_UNIT_HH__
#define __FETCH_UNIT_HH__

#include <string>
#include <utility>
#include <vector>

#include "arch/gpu_decoder.hh"
#include "base/statistics.hh"
#include "config/the_gpu_isa.hh"
#include "gpu-compute/scheduler.hh"
#include "mem/packet.hh"

class ComputeUnit;
class Wavefront;

class FetchUnit
{
  public:
    FetchUnit(const ComputeUnitParams* params);
    ~FetchUnit();
    void init(ComputeUnit *cu);
    void exec();
    void bindWaveList(std::vector<Wavefront*> *list);
    void initiateFetch(Wavefront *wavefront);
    void fetch(PacketPtr pkt, Wavefront *wavefront);
    void processFetchReturn(PacketPtr pkt);
    static uint32_t globalFetchUnitID;

  private:
    bool timingSim;
    ComputeUnit *computeUnit;
    TheGpuISA::Decoder decoder;

    // Fetch scheduler; selects one wave from the fetch queue for
    // instruction fetching. The selection is made according to a
    // scheduling policy.
    Scheduler fetchScheduler;

    // Stores the list of waves that are ready to be fetched this cycle
    std::vector<Wavefront*> fetchQueue;

    // Stores the fetch status of all waves dispatched to this SIMD.
    // TRUE implies the wave is ready to fetch and has already been
    // moved to fetchQueue
    std::vector<std::pair<Wavefront*, bool>> fetchStatusQueue;

    // Pointer to the list of waves dispatched on to this SIMD unit
    std::vector<Wavefront*> *waveList;
};

#endif // __FETCH_UNIT_HH__
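fetchStatusQueue above feeds the eligibility test in FetchUnit::exec(): a wave enters the fetchQueue only while it is running, its instruction buffer holds four or fewer entries, the buffer contains no branch, and no fetch is already in flight. Isolated as a predicate over a hypothetical WaveState struct (a sketch only, not the commit's types):

// ---- illustrative sketch, not part of the commit ----
#include <cassert>

struct WaveState {
    bool running;
    unsigned ibOccupancy;   // instruction buffer entries
    bool ibHasBranch;
    bool pendingFetch;
};

static bool fetchEligible(const WaveState &w)
{
    // active wave, IB occupancy of 4 or fewer, no branch in the IB (avoids
    // speculative fetch past a branch), and no fetch already outstanding
    return w.running && w.ibOccupancy <= 4 && !w.ibHasBranch && !w.pendingFetch;
}

int main()
{
    assert(fetchEligible({true, 3, false, false}));
    assert(!fetchEligible({true, 5, false, false}));   // IB too full
    assert(!fetchEligible({true, 2, true, false}));    // branch pending
    return 0;
}
// ---- end sketch ----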
242
src/gpu-compute/global_memory_pipeline.cc
Normal file
@@ -0,0 +1,242 @@
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 * (BSD-style conditions and disclaimer as in exec_stage.cc above)
 *
 * Author: John Kalamatianos, Sooraj Puthoor
 */

#include "gpu-compute/global_memory_pipeline.hh"

#include "debug/GPUMem.hh"
#include "debug/GPUReg.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"

GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) :
    computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size),
    inflightStores(0), inflightLoads(0)
{
}

void
GlobalMemPipeline::init(ComputeUnit *cu)
{
    computeUnit = cu;
    globalMemSize = computeUnit->shader->globalMemSize;
    _name = computeUnit->name() + ".GlobalMemPipeline";
}

void
GlobalMemPipeline::exec()
{
    // apply any returned global memory operations
    GPUDynInstPtr m = !gmReturnedLoads.empty() ? gmReturnedLoads.front() :
        !gmReturnedStores.empty() ? gmReturnedStores.front() : nullptr;

    bool accessVrf = true;
    // check the VRF to see if the operands of a load (or load component
    // of an atomic) are accessible
    if ((m) && (m->m_op == Enums::MO_LD || MO_A(m->m_op))) {
        Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];

        accessVrf =
            w->computeUnit->vrf[m->simdId]->
            vrfOperandAccessReady(m->seqNum(), w, m,
                                  VrfAccessType::WRITE);
    }

    if ((!gmReturnedStores.empty() || !gmReturnedLoads.empty()) &&
        m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
        accessVrf && m->statusBitVector == VectorMask(0) &&
        (computeUnit->shader->coissue_return ||
         computeUnit->wfWait.at(m->pipeId).rdy())) {

        if (m->v_type == VT_32 && m->m_type == Enums::M_U8)
            doGmReturn<uint32_t, uint8_t>(m);
        else if (m->v_type == VT_32 && m->m_type == Enums::M_U16)
            doGmReturn<uint32_t, uint16_t>(m);
        else if (m->v_type == VT_32 && m->m_type == Enums::M_U32)
            doGmReturn<uint32_t, uint32_t>(m);
        else if (m->v_type == VT_32 && m->m_type == Enums::M_S8)
            doGmReturn<int32_t, int8_t>(m);
        else if (m->v_type == VT_32 && m->m_type == Enums::M_S16)
            doGmReturn<int32_t, int16_t>(m);
        else if (m->v_type == VT_32 && m->m_type == Enums::M_S32)
            doGmReturn<int32_t, int32_t>(m);
        else if (m->v_type == VT_32 && m->m_type == Enums::M_F16)
            doGmReturn<float, Float16>(m);
        else if (m->v_type == VT_32 && m->m_type == Enums::M_F32)
            doGmReturn<float, float>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_U8)
            doGmReturn<uint64_t, uint8_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_U16)
            doGmReturn<uint64_t, uint16_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_U32)
            doGmReturn<uint64_t, uint32_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_U64)
            doGmReturn<uint64_t, uint64_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_S8)
            doGmReturn<int64_t, int8_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_S16)
            doGmReturn<int64_t, int16_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_S32)
            doGmReturn<int64_t, int32_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_S64)
            doGmReturn<int64_t, int64_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_F16)
            doGmReturn<double, Float16>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_F32)
            doGmReturn<double, float>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_F64)
            doGmReturn<double, double>(m);
    }

    // If the pipeline has a global memory instruction to execute,
    // execute global memory packets and issue global memory packets
    // to the DTLB
    if (!gmIssuedRequests.empty()) {
        GPUDynInstPtr mp = gmIssuedRequests.front();
        if (mp->m_op == Enums::MO_LD ||
            (mp->m_op >= Enums::MO_AAND && mp->m_op <= Enums::MO_AMIN) ||
            (mp->m_op >= Enums::MO_ANRAND && mp->m_op <= Enums::MO_ANRMIN)) {

            if (inflightLoads >= gmQueueSize) {
                return;
            } else {
                ++inflightLoads;
            }
        } else {
            if (inflightStores >= gmQueueSize) {
                return;
            } else {
                ++inflightStores;
            }
        }

        mp->initiateAcc(mp);
        gmIssuedRequests.pop();

        DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = %s\n",
                computeUnit->cu_id, mp->simdId, mp->wfSlotId,
                Enums::MemOpTypeStrings[mp->m_op]);
    }
}

template<typename c0, typename c1>
void
GlobalMemPipeline::doGmReturn(GPUDynInstPtr m)
{
    Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];

    // Return data to registers
    if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) {
        gmReturnedLoads.pop();
        assert(inflightLoads > 0);
        --inflightLoads;

        if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) {
            std::vector<uint32_t> regVec;
            // iterate over number of destination register operands since
            // this is a load or atomic operation
            for (int k = 0; k < m->n_reg; ++k) {
                assert((sizeof(c1) * m->n_reg) <= MAX_WIDTH_FOR_MEM_INST);
                int dst = m->dst_reg + k;

                if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
                    dst = m->dst_reg_vec[k];
                // virtual->physical VGPR mapping
                int physVgpr = w->remap(dst, sizeof(c0), 1);
                // save the physical VGPR index
                regVec.push_back(physVgpr);
                c1 *p1 = &((c1*)m->d_data)[k * VSZ];

                for (int i = 0; i < VSZ; ++i) {
                    if (m->exec_mask[i]) {
                        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
                                "$%s%d <- %d global ld done (src = wavefront "
                                "ld inst)\n", w->computeUnit->cu_id, w->simdId,
                                w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d",
                                dst, *p1);
                        // write the value into the physical VGPR. This is a
                        // purely functional operation. No timing is modeled.
                        w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
                                                                  *p1, i);
                    }
                    ++p1;
                }
            }

            // Schedule the write operation of the load data on the VRF.
            // This simply models the timing aspect of the VRF write operation.
            // It does not modify the physical VGPR.
            loadVrfBankConflictCycles +=
                w->computeUnit->vrf[w->simdId]->exec(m->seqNum(),
                                                     w, regVec, sizeof(c0),
                                                     m->time);
        }
    } else {
        gmReturnedStores.pop();
        assert(inflightStores > 0);
        --inflightStores;
    }

    // Decrement outstanding register count
    computeUnit->shader->ScheduleAdd(&w->outstanding_reqs, m->time, -1);

    if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op) ||
        MO_H(m->m_op)) {
        computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_wr_gm, m->time,
                                         -1);
    }

    if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) {
        computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_rd_gm, m->time,
                                         -1);
    }

    // Mark the write bus busy for the appropriate amount of time
    computeUnit->glbMemToVrfBus.set(m->time);
    if (!computeUnit->shader->coissue_return)
        w->computeUnit->wfWait.at(m->pipeId).set(m->time);
}

void
GlobalMemPipeline::regStats()
{
    loadVrfBankConflictCycles
        .name(name() + ".load_vrf_bank_conflict_cycles")
        .desc("total number of cycles GM data are delayed before updating "
              "the VRF")
        ;
}
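exec() above applies backpressure with plain counters: a load only leaves gmIssuedRequests while inflightLoads < gmQueueSize, and doGmReturn() decrements the counter when the matching response retires. A minimal standalone model of that flow control (hypothetical names, a sketch only):

// ---- illustrative sketch, not part of the commit ----
#include <cassert>
#include <queue>

int main()
{
    const int gmQueueSize = 2;
    int inflightLoads = 0;
    std::queue<int> issued;                 // stand-in for gmIssuedRequests
    for (int i = 0; i < 4; ++i)
        issued.push(i);

    // issue phase: stalls once the in-flight limit is hit
    while (!issued.empty() && inflightLoads < gmQueueSize) {
        ++inflightLoads;
        issued.pop();
    }
    assert(inflightLoads == 2 && issued.size() == 2);

    // a response returns: one slot frees up, issue can resume next cycle
    --inflightLoads;
    assert(inflightLoads < gmQueueSize);
    return 0;
}
// ---- end sketch ----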
123
src/gpu-compute/global_memory_pipeline.hh
Normal file
@@ -0,0 +1,123 @@
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 * (BSD-style conditions and disclaimer as in exec_stage.cc above)
 *
 * Author: John Kalamatianos, Sooraj Puthoor
 */

#ifndef __GLOBAL_MEMORY_PIPELINE_HH__
#define __GLOBAL_MEMORY_PIPELINE_HH__

#include <queue>
#include <string>

#include "gpu-compute/misc.hh"
#include "params/ComputeUnit.hh"
#include "sim/stats.hh"

/*
 * @file global_memory_pipeline.hh
 *
 * The global memory pipeline issues newly created global memory packets
 * from the pipeline to the DTLB. The exec() method of the memory packet
 * issues the packet to the DTLB if there is space available in the return
 * fifo. This stage also retires previously issued loads and stores that
 * have returned from the memory sub-system.
 */

class ComputeUnit;

class GlobalMemPipeline
{
  public:
    GlobalMemPipeline(const ComputeUnitParams *params);
    void init(ComputeUnit *cu);
    void exec();

    template<typename c0, typename c1> void doGmReturn(GPUDynInstPtr m);

    std::queue<GPUDynInstPtr> &getGMReqFIFO() { return gmIssuedRequests; }
    std::queue<GPUDynInstPtr> &getGMStRespFIFO() { return gmReturnedStores; }
    std::queue<GPUDynInstPtr> &getGMLdRespFIFO() { return gmReturnedLoads; }

    bool
    isGMLdRespFIFOWrRdy() const
    {
        return gmReturnedLoads.size() < gmQueueSize;
    }

    bool
    isGMStRespFIFOWrRdy() const
    {
        return gmReturnedStores.size() < gmQueueSize;
    }

    bool
    isGMReqFIFOWrRdy(uint32_t pendReqs=0) const
    {
        return (gmIssuedRequests.size() + pendReqs) < gmQueueSize;
    }

    const std::string &name() const { return _name; }
    void regStats();

  private:
    ComputeUnit *computeUnit;
    std::string _name;
    int gmQueueSize;

    // number of cycles of delaying the update of a VGPR that is the
    // target of a load instruction (or the load component of an atomic).
    // The delay is due to VRF bank conflicts.
    Stats::Scalar loadVrfBankConflictCycles;
    // Counters to track the inflight loads and stores
    // so that we can provide the proper backpressure
    // on the number of inflight memory operations.
    int inflightStores;
    int inflightLoads;

    // The size of global memory.
    int globalMemSize;

    // Global Memory Request FIFO: all global memory requests
    // are issued to this FIFO from the memory pipelines
    std::queue<GPUDynInstPtr> gmIssuedRequests;

    // Global Store Response FIFO: all responses of global memory
    // stores are sent to this FIFO from the TCP
    std::queue<GPUDynInstPtr> gmReturnedStores;

    // Global Load Response FIFO: all responses of global memory
    // loads are sent to this FIFO from the TCP
    std::queue<GPUDynInstPtr> gmReturnedLoads;
};

#endif // __GLOBAL_MEMORY_PIPELINE_HH__
198
src/gpu-compute/gpu_dyn_inst.cc
Normal file
|
@ -0,0 +1,198 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2015 Advanced Micro Devices, Inc.
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* For use for simulation and test purposes only
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Anthony Gutierrez
 */

#include "gpu-compute/gpu_dyn_inst.hh"

#include "debug/GPUMem.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"

GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf,
                       GPUStaticInst *_staticInst, uint64_t instSeqNum)
    : GPUExecContext(_cu, _wf), m_op(Enums::MO_UNDEF),
      memoryOrder(Enums::MEMORY_ORDER_NONE), useContinuation(false),
      statusBitVector(0), staticInst(_staticInst), _seqNum(instSeqNum)
{
    tlbHitLevel.assign(VSZ, -1);
}

void
GPUDynInst::execute()
{
    GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(cu, wf, staticInst,
                                                            _seqNum);
    staticInst->execute(gpuDynInst);
}

int
GPUDynInst::numSrcRegOperands()
{
    return staticInst->numSrcRegOperands();
}

int
GPUDynInst::numDstRegOperands()
{
    return staticInst->numDstRegOperands();
}

int
GPUDynInst::getNumOperands()
{
    return staticInst->getNumOperands();
}

bool
GPUDynInst::isVectorRegister(int operandIdx)
{
    return staticInst->isVectorRegister(operandIdx);
}

bool
GPUDynInst::isScalarRegister(int operandIdx)
{
    return staticInst->isScalarRegister(operandIdx);
}

int
GPUDynInst::getRegisterIndex(int operandIdx)
{
    return staticInst->getRegisterIndex(operandIdx);
}

int
GPUDynInst::getOperandSize(int operandIdx)
{
    return staticInst->getOperandSize(operandIdx);
}

bool
GPUDynInst::isDstOperand(int operandIdx)
{
    return staticInst->isDstOperand(operandIdx);
}

bool
GPUDynInst::isSrcOperand(int operandIdx)
{
    return staticInst->isSrcOperand(operandIdx);
}

bool
GPUDynInst::isArgLoad()
{
    return staticInst->isArgLoad();
}

const std::string&
GPUDynInst::disassemble() const
{
    return staticInst->disassemble();
}

uint64_t
GPUDynInst::seqNum() const
{
    return _seqNum;
}

Enums::OpType
GPUDynInst::opType()
{
    return staticInst->o_type;
}

Enums::StorageClassType
GPUDynInst::executedAs()
{
    return staticInst->executed_as;
}

// Process a memory instruction and (if necessary) submit timing request
void
GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst)
{
    DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector=%#x\n",
            cu->cu_id, simdId, wfSlotId, exec_mask);

    staticInst->initiateAcc(gpuDynInst);
    time = 0;
}

bool
GPUDynInst::scalarOp() const
{
    return staticInst->scalarOp();
}

void
GPUDynInst::updateStats()
{
    if (staticInst->isLocalMem()) {
        // access to LDS (shared) memory
        cu->dynamicLMemInstrCnt++;
    } else {
        // access to global memory

        // update PageDivergence histogram
        int number_pages_touched = cu->pagesTouched.size();
        assert(number_pages_touched);
        cu->pageDivergenceDist.sample(number_pages_touched);

        std::pair<ComputeUnit::pageDataStruct::iterator, bool> ret;

        for (auto it : cu->pagesTouched) {
            // see if this page has been touched before. if not, this also
            // inserts the page into the table.
            ret = cu->pageAccesses
                .insert(ComputeUnit::pageDataStruct::value_type(it.first,
                        std::make_pair(1, it.second)));

            // if it was already there, update the stats
            if (!ret.second) {
                ret.first->second.first++;
                ret.first->second.second += it.second;
            }
        }

        cu->pagesTouched.clear();

        // total number of memory instructions (dynamic)
        // Atomics are counted as a single memory instruction.
        // this is # memory instructions per wavefront, not per work-item
        cu->dynamicGMemInstrCnt++;
    }
}
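
The insert-or-update idiom in updateStats() above is worth seeing in isolation: map::insert returns an (iterator, bool) pair, and the bool says whether the page was new. A minimal standalone sketch of the same pattern, with simplified key/value types standing in for ComputeUnit::pageDataStruct (the names below are illustrative, not the gem5 API):

#include <cassert>
#include <map>
#include <utility>

int main()
{
    // page -> (access count, bytes touched), like pageAccesses above
    std::map<long, std::pair<int, int>> pageAccesses;

    auto ret = pageAccesses.insert({0x1000, {1, 64}});
    assert(ret.second);                   // first touch: inserted fresh

    ret = pageAccesses.insert({0x1000, {1, 32}});
    if (!ret.second) {                    // already present: update in place
        ret.first->second.first++;
        ret.first->second.second += 32;
    }
    assert(pageAccesses[0x1000] == std::make_pair(2, 96));
    return 0;
}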

464
src/gpu-compute/gpu_dyn_inst.hh
Normal file

@@ -0,0 +1,464 @@
/*
 * Copyright (c) 2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Anthony Gutierrez
 */

#ifndef __GPU_DYN_INST_HH__
#define __GPU_DYN_INST_HH__

#include <cstdint>
#include <string>

#include "enums/GenericMemoryOrder.hh"
#include "enums/GenericMemoryScope.hh"
#include "enums/MemOpType.hh"
#include "enums/MemType.hh"
#include "enums/OpType.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"

class GPUStaticInst;

template<typename T>
class AtomicOpAnd : public TypedAtomicOpFunctor<T>
{
  public:
    T a;

    AtomicOpAnd(T _a) : a(_a) { }
    void execute(T *b) { *b &= a; }
};

template<typename T>
class AtomicOpOr : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpOr(T _a) : a(_a) { }
    void execute(T *b) { *b |= a; }
};

template<typename T>
class AtomicOpXor : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpXor(T _a) : a(_a) { }
    void execute(T *b) { *b ^= a; }
};

template<typename T>
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
{
  public:
    T c;
    T s;

    ComputeUnit *computeUnit;

    AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
      : c(_c), s(_s), computeUnit(compute_unit) { }

    void
    execute(T *b)
    {
        computeUnit->numCASOps++;

        if (*b == c) {
            *b = s;
        } else {
            computeUnit->numFailedCASOps++;
        }

        if (computeUnit->xact_cas_mode) {
            computeUnit->xactCasLoadMap.clear();
        }
    }
};

template<typename T>
class AtomicOpExch : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpExch(T _a) : a(_a) { }
    void execute(T *b) { *b = a; }
};

template<typename T>
class AtomicOpAdd : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpAdd(T _a) : a(_a) { }
    void execute(T *b) { *b += a; }
};

template<typename T>
class AtomicOpSub : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpSub(T _a) : a(_a) { }
    void execute(T *b) { *b -= a; }
};

template<typename T>
class AtomicOpInc : public TypedAtomicOpFunctor<T>
{
  public:
    AtomicOpInc() { }
    void execute(T *b) { *b += 1; }
};

template<typename T>
class AtomicOpDec : public TypedAtomicOpFunctor<T>
{
  public:
    AtomicOpDec() { }
    void execute(T *b) { *b -= 1; }
};

template<typename T>
class AtomicOpMax : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpMax(T _a) : a(_a) { }

    void
    execute(T *b)
    {
        if (a > *b)
            *b = a;
    }
};

template<typename T>
class AtomicOpMin : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpMin(T _a) : a(_a) { }

    void
    execute(T *b)
    {
        if (a < *b)
            *b = a;
    }
};

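Each functor above captures its operand(s) at construction and applies the read-modify-write when the memory system calls execute() on the target location. A standalone sketch of the same pattern, using simplified stand-ins for gem5's AtomicOpFunctor hierarchy (all names here are illustrative, not the real base classes):

#include <cassert>
#include <cstdint>

struct AtomicOpFunctorSketch
{
    virtual ~AtomicOpFunctorSketch() { }
};

template<typename T>
struct TypedAtomicOpFunctorSketch : AtomicOpFunctorSketch
{
    virtual void execute(T *b) = 0;
};

// mirrors AtomicOpAdd: capture the operand, apply it in place on execute()
template<typename T>
struct AtomicOpAddSketch : TypedAtomicOpFunctorSketch<T>
{
    T a;
    AtomicOpAddSketch(T _a) : a(_a) { }
    void execute(T *b) override { *b += a; }
};

int main()
{
    uint32_t mem_word = 40;               // pretend memory location
    AtomicOpAddSketch<uint32_t> op(2);    // operand a lane would supply
    op.execute(&mem_word);                // memory system applies the functor
    assert(mem_word == 42);
    return 0;
}
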
#define MO_A(a) ((a)>=Enums::MO_AAND && (a)<=Enums::MO_AMIN)
#define MO_ANR(a) ((a)>=Enums::MO_ANRAND && (a)<=Enums::MO_ANRMIN)
#define MO_H(a) ((a)>=Enums::MO_HAND && (a)<=Enums::MO_HMIN)

typedef enum
{
    VT_32,
    VT_64,
} vgpr_type;

typedef enum
{
    SEG_PRIVATE,
    SEG_SPILL,
    SEG_GLOBAL,
    SEG_SHARED,
    SEG_READONLY,
    SEG_FLAT
} seg_type;

class GPUDynInst : public GPUExecContext
{
  public:
    GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst,
               uint64_t instSeqNum);

    void execute();
    int numSrcRegOperands();
    int numDstRegOperands();
    int getNumOperands();
    bool isVectorRegister(int operandIdx);
    bool isScalarRegister(int operandIdx);
    int getRegisterIndex(int operandIdx);
    int getOperandSize(int operandIdx);
    bool isDstOperand(int operandIdx);
    bool isSrcOperand(int operandIdx);
    bool isArgLoad();

    const std::string &disassemble() const;

    uint64_t seqNum() const;

    Enums::OpType opType();
    Enums::StorageClassType executedAs();

    // The address of the memory operation
    Addr addr[VSZ];
    Addr pAddr;

    // The data to get written
    uint8_t d_data[VSZ * 16];
    // Additional data (for atomics)
    uint8_t a_data[VSZ * 8];
    // Additional data (for atomics)
    uint8_t x_data[VSZ * 8];
    // The execution mask
    VectorMask exec_mask;

    // The memory type (M_U32, M_S32, ...)
    Enums::MemType m_type;
    // The memory operation (MO_LD, MO_ST, ...)
    Enums::MemOpType m_op;
    Enums::GenericMemoryOrder memoryOrder;

    // Scope of the request
    Enums::GenericMemoryScope scope;
    // The memory segment (SEG_SHARED, SEG_GLOBAL, ...)
    seg_type s_type;
    // The equivalency class
    int equiv;
    // The return VGPR type (VT_32 or VT_64)
    vgpr_type v_type;
    // Number of VGPRs accessed (1, 2, or 4)
    int n_reg;
    // The return VGPR index
    int dst_reg;
    // There can be at most 4 dest regs
    int dst_reg_vec[4];
    // SIMD unit the WF of the memory instruction has been mapped to
    int simdId;
    // unique id of the WF the memory instruction belongs to
    int wfDynId;
    // The kernel id of the requesting wf
    int kern_id;
    // The CU id of the requesting wf
    int cu_id;
    // HW slot id where the WF is mapped to inside a SIMD unit
    int wfSlotId;
    // execution pipeline id where the memory instruction has been scheduled
    int pipeId;
    // The execution time of this operation
    Tick time;
    // The latency of this operation
    WaitClass latency;
    // A list of bank conflicts for the 4 cycles.
    uint32_t bc[4];

    // A pointer to ROM
    uint8_t *rom;
    // The size of the READONLY segment
    int sz_rom;

    // Initiate the specified memory operation, by creating a
    // memory request and sending it off to the memory system.
    void initiateAcc(GPUDynInstPtr gpuDynInst);

    void updateStats();

    GPUStaticInst* staticInstruction() { return staticInst; }

    // Is the instruction a scalar or vector op?
    bool scalarOp() const;

    /*
     * Loads/stores/atomics may have acquire/release semantics associated
     * with them. Some protocols want to see the acquire/release as separate
     * requests from the load/store/atomic. We implement that separation
     * using continuations (i.e., a function pointer with an object
     * associated with it). When, for example, the front-end generates a
     * store with release semantics, we will first issue a normal store and
     * set the continuation in the GPUDynInst to a function that generates a
     * release request. That continuation will be called when the normal
     * store completes (in ComputeUnit::DataPort::recvTimingResponse). The
     * continuation will be called in the context of the same GPUDynInst
     * that generated the initial store.
     */
    std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation;

    // when true, call execContinuation when response arrives
    bool useContinuation;

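The store-release split described in the comment above fits in a few lines. A standalone sketch of the control flow, with invented stand-ins (FakeDynInst, issueStoreRelease) rather than the real ComputeUnit types:

#include <functional>
#include <iostream>

struct FakeDynInst
{
    std::function<void()> execContinuation;  // set by the front-end
    bool useContinuation = false;
};

void issueStoreRelease(FakeDynInst &di)
{
    std::cout << "issue plain store\n";
    di.useContinuation = true;
    di.execContinuation = [] { std::cout << "issue release request\n"; };
}

void recvTimingResponse(FakeDynInst &di)
{
    // mirrors the DataPort response path: fire the continuation for the
    // same dynamic instruction, if one was set
    if (di.useContinuation)
        di.execContinuation();
}

int main()
{
    FakeDynInst di;
    issueStoreRelease(di);
    recvTimingResponse(di);  // prints: issue release request
    return 0;
}
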
    template<typename c0> AtomicOpFunctor*
    makeAtomicOpFunctor(c0 *reg0, c0 *reg1, Enums::MemOpType op)
    {
        using namespace Enums;

        switch(op) {
          case MO_AAND:
          case MO_ANRAND:
            return new AtomicOpAnd<c0>(*reg0);
          case MO_AOR:
          case MO_ANROR:
            return new AtomicOpOr<c0>(*reg0);
          case MO_AXOR:
          case MO_ANRXOR:
            return new AtomicOpXor<c0>(*reg0);
          case MO_ACAS:
          case MO_ANRCAS:
            return new AtomicOpCAS<c0>(*reg0, *reg1, cu);
          case MO_AEXCH:
          case MO_ANREXCH:
            return new AtomicOpExch<c0>(*reg0);
          case MO_AADD:
          case MO_ANRADD:
            return new AtomicOpAdd<c0>(*reg0);
          case MO_ASUB:
          case MO_ANRSUB:
            return new AtomicOpSub<c0>(*reg0);
          case MO_AINC:
          case MO_ANRINC:
            return new AtomicOpInc<c0>();
          case MO_ADEC:
          case MO_ANRDEC:
            return new AtomicOpDec<c0>();
          case MO_AMAX:
          case MO_ANRMAX:
            return new AtomicOpMax<c0>(*reg0);
          case MO_AMIN:
          case MO_ANRMIN:
            return new AtomicOpMin<c0>(*reg0);
          default:
            panic("Unrecognized atomic operation");
        }
    }

    void
    setRequestFlags(Request *req, bool setMemOrder=true)
    {
        // currently these are the easy scopes to deduce
        switch (s_type) {
          case SEG_PRIVATE:
            req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT);
            break;
          case SEG_SPILL:
            req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT);
            break;
          case SEG_GLOBAL:
            req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
            break;
          case SEG_READONLY:
            req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT);
            break;
          case SEG_SHARED:
            req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT);
            break;
          case SEG_FLAT:
            // TODO: translate to correct scope
            assert(false);
          default:
            panic("Bad segment type");
            break;
        }

        switch (scope) {
          case Enums::MEMORY_SCOPE_NONE:
          case Enums::MEMORY_SCOPE_WORKITEM:
            break;
          case Enums::MEMORY_SCOPE_WAVEFRONT:
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WAVEFRONT_SCOPE);
            break;
          case Enums::MEMORY_SCOPE_WORKGROUP:
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WORKGROUP_SCOPE);
            break;
          case Enums::MEMORY_SCOPE_DEVICE:
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::DEVICE_SCOPE);
            break;
          case Enums::MEMORY_SCOPE_SYSTEM:
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::SYSTEM_SCOPE);
            break;
          default:
            panic("Bad scope type");
            break;
        }

        if (setMemOrder) {
            // set acquire and release flags
            switch (memoryOrder) {
              case Enums::MEMORY_ORDER_SC_ACQUIRE:
                req->setFlags(Request::ACQUIRE);
                break;
              case Enums::MEMORY_ORDER_SC_RELEASE:
                req->setFlags(Request::RELEASE);
                break;
              case Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE:
                req->setFlags(Request::ACQUIRE | Request::RELEASE);
                break;
              default:
                break;
            }
        }

        // set atomic type
        // currently, the instruction generator only produces atomic return
        // but a magic instruction can produce atomic no return
        if (m_op == Enums::MO_AADD || m_op == Enums::MO_ASUB ||
            m_op == Enums::MO_AAND || m_op == Enums::MO_AOR ||
            m_op == Enums::MO_AXOR || m_op == Enums::MO_AMAX ||
            m_op == Enums::MO_AMIN || m_op == Enums::MO_AINC ||
            m_op == Enums::MO_ADEC || m_op == Enums::MO_AEXCH ||
            m_op == Enums::MO_ACAS) {
            req->setFlags(Request::ATOMIC_RETURN_OP);
        } else if (m_op == Enums::MO_ANRADD || m_op == Enums::MO_ANRSUB ||
                   m_op == Enums::MO_ANRAND || m_op == Enums::MO_ANROR ||
                   m_op == Enums::MO_ANRXOR || m_op == Enums::MO_ANRMAX ||
                   m_op == Enums::MO_ANRMIN || m_op == Enums::MO_ANRINC ||
                   m_op == Enums::MO_ANRDEC || m_op == Enums::MO_ANREXCH ||
                   m_op == Enums::MO_ANRCAS) {
            req->setFlags(Request::ATOMIC_NO_RETURN_OP);
        }
    }

    // Map returned packets and the addresses they satisfy with which lane
    // they were requested from
    typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
    StatusVector memStatusVector;

    // Track the status of memory requests per lane, one bit per lane
    VectorMask statusBitVector;
    // for ld_v# or st_v#
    std::vector<int> statusVector;
    std::vector<int> tlbHitLevel;

  private:
    GPUStaticInst *staticInst;
    uint64_t _seqNum;
};

#endif // __GPU_DYN_INST_HH__

53
src/gpu-compute/gpu_exec_context.cc
Normal file

@@ -0,0 +1,53 @@
/*
 * Copyright (c) 2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * [BSD-3-Clause terms identical to the license header in gpu_dyn_inst.hh above]
 *
 * Author: Anthony Gutierrez
 */

#include "gpu-compute/gpu_exec_context.hh"

GPUExecContext::GPUExecContext(ComputeUnit *_cu, Wavefront *_wf)
    : cu(_cu), wf(_wf)
{
}

ComputeUnit*
GPUExecContext::computeUnit()
{
    return cu;
}

Wavefront*
GPUExecContext::wavefront()
{
    return wf;
}

54
src/gpu-compute/gpu_exec_context.hh
Normal file

@@ -0,0 +1,54 @@
/*
 * Copyright (c) 2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * [BSD-3-Clause terms identical to the license header in gpu_dyn_inst.hh above]
 *
 * Author: Anthony Gutierrez
 */

#ifndef __GPU_EXEC_CONTEXT_HH__
#define __GPU_EXEC_CONTEXT_HH__

class ComputeUnit;
class Wavefront;

class GPUExecContext
{
  public:
    GPUExecContext(ComputeUnit *_cu, Wavefront *_wf);
    Wavefront* wavefront();
    ComputeUnit* computeUnit();

  protected:
    ComputeUnit *cu;
    Wavefront *wf;
};

#endif // __GPU_EXEC_CONTEXT_HH__

42
src/gpu-compute/gpu_static_inst.cc
Normal file

@@ -0,0 +1,42 @@
/*
 * Copyright (c) 2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * [BSD-3-Clause terms identical to the license header in gpu_dyn_inst.hh above]
 *
 * Author: Anthony Gutierrez
 */

#include "gpu-compute/gpu_static_inst.hh"

GPUStaticInst::GPUStaticInst(const std::string &opcode)
    : o_type(Enums::OT_ALU), executed_as(Enums::SC_NONE), opcode(opcode),
      _instNum(0), _scalarOp(false)
{
}

166
src/gpu-compute/gpu_static_inst.hh
Normal file

@@ -0,0 +1,166 @@
/*
 * Copyright (c) 2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * [BSD-3-Clause terms identical to the license header in gpu_dyn_inst.hh above]
 *
 * Author: Anthony Gutierrez
 */

#ifndef __GPU_STATIC_INST_HH__
#define __GPU_STATIC_INST_HH__

/*
 * @file gpu_static_inst.hh
 *
 * Defines the base class representing static instructions for the GPU. The
 * instructions are "static" because they contain no dynamic instruction
 * information. GPUStaticInst corresponds to the StaticInst class for the CPU
 * models.
 */

#include <cstdint>
#include <string>

#include "enums/OpType.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/misc.hh"

class BaseOperand;
class BaseRegOperand;
class Wavefront;

class GPUStaticInst
{
  public:
    GPUStaticInst(const std::string &opcode);

    void instNum(int num) { _instNum = num; }

    int instNum() { return _instNum; }

    void ipdInstNum(int num) { _ipdInstNum = num; }

    int ipdInstNum() const { return _ipdInstNum; }

    virtual void execute(GPUDynInstPtr gpuDynInst) = 0;
    virtual void generateDisassembly() = 0;
    virtual const std::string &disassemble() = 0;
    virtual int getNumOperands() = 0;
    virtual bool isCondRegister(int operandIndex) = 0;
    virtual bool isScalarRegister(int operandIndex) = 0;
    virtual bool isVectorRegister(int operandIndex) = 0;
    virtual bool isSrcOperand(int operandIndex) = 0;
    virtual bool isDstOperand(int operandIndex) = 0;
    virtual int getOperandSize(int operandIndex) = 0;
    virtual int getRegisterIndex(int operandIndex) = 0;
    virtual int numDstRegOperands() = 0;
    virtual int numSrcRegOperands() = 0;

    /*
     * Most instructions (including all HSAIL instructions)
     * are vector ops, so _scalarOp will be false by default.
     * Derived instruction objects that are scalar ops must
     * set _scalarOp to true in their constructors.
     */
    bool scalarOp() const { return _scalarOp; }

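The _scalarOp convention above (vector by default, scalar ops opt in from their constructors) looks like this in practice. A standalone sketch with a pared-down base class; the real GPUStaticInst has many more pure-virtual operand queries, and ScalarMovSketch is an invented example, not an instruction from this commit:

#include <cassert>
#include <string>

class StaticInstSketch
{
  public:
    StaticInstSketch(const std::string &opcode)
        : opcode(opcode), _scalarOp(false) { }   // vector op by default
    bool scalarOp() const { return _scalarOp; }

  protected:
    const std::string opcode;
    bool _scalarOp;
};

class ScalarMovSketch : public StaticInstSketch
{
  public:
    ScalarMovSketch() : StaticInstSketch("s_mov")
    {
        _scalarOp = true;  // scalar ops must opt in, per the comment above
    }
};

int main()
{
    assert(ScalarMovSketch().scalarOp());
    return 0;
}
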
    virtual bool isLocalMem() const
    {
        fatal("calling isLocalMem() on a non-memory instruction.\n");

        return false;
    }

    bool isArgLoad() { return false; }
    virtual uint32_t instSize() = 0;

    // only used for memory instructions
    virtual void
    initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        fatal("calling initiateAcc() on a non-memory instruction.\n");
    }

    virtual uint32_t getTargetPc() { return 0; }

    /**
     * Query whether the instruction is an unconditional jump, i.e., the
     * jump is always executed because there is no condition to be
     * evaluated.
     *
     * If the instruction is not of branch type, the result is always false.
     *
     * @return True if the instruction is an unconditional jump.
     */
    virtual bool unconditionalJumpInstruction() { return false; }

    static uint64_t dynamic_id_count;

    Enums::OpType o_type;
    // For flat memory accesses
    Enums::StorageClassType executed_as;

  protected:
    virtual void
    execLdAcq(GPUDynInstPtr gpuDynInst)
    {
        fatal("calling execLdAcq() on a non-load instruction.\n");
    }

    virtual void
    execSt(GPUDynInstPtr gpuDynInst)
    {
        fatal("calling execSt() on a non-store instruction.\n");
    }

    virtual void
    execAtomic(GPUDynInstPtr gpuDynInst)
    {
        fatal("calling execAtomic() on a non-atomic instruction.\n");
    }

    virtual void
    execAtomicAcq(GPUDynInstPtr gpuDynInst)
    {
        fatal("calling execAtomicAcq() on a non-atomic instruction.\n");
    }

    const std::string opcode;
    std::string disassembly;
    int _instNum;
    /**
     * Identifier of the immediate post-dominator instruction.
     */
    int _ipdInstNum;

    bool _scalarOp;
};

#endif // __GPU_STATIC_INST_HH__

1801
src/gpu-compute/gpu_tlb.cc
Normal file
File diff suppressed because it is too large

465
src/gpu-compute/gpu_tlb.hh
Normal file

@@ -0,0 +1,465 @@
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * [BSD-3-Clause terms identical to the license header in gpu_dyn_inst.hh above]
 *
 * Author: Lisa Hsu
 */

#ifndef __GPU_TLB_HH__
#define __GPU_TLB_HH__

#include <fstream>
#include <list>
#include <queue>
#include <string>
#include <vector>

#include "arch/generic/tlb.hh"
#include "arch/x86/pagetable.hh"
#include "arch/x86/pagetable_walker.hh"
#include "arch/x86/regs/segment.hh"
#include "base/callback.hh"
#include "base/misc.hh"
#include "base/statistics.hh"
#include "gpu-compute/compute_unit.hh"
#include "mem/mem_object.hh"
#include "mem/port.hh"
#include "mem/request.hh"
#include "params/X86GPUTLB.hh"
#include "sim/sim_object.hh"

class BaseTLB;
class Packet;
class ThreadContext;

namespace X86ISA
{
    class GpuTlbEntry : public TlbEntry
    {
      public:
        GpuTlbEntry(Addr asn, Addr _vaddr, Addr _paddr, bool _valid)
          : TlbEntry(asn, _vaddr, _paddr, false, false), valid(_valid) { }

        GpuTlbEntry() : TlbEntry() { }

        bool valid;
    };

    class GpuTLB : public MemObject
    {
      protected:
        friend class Walker;

        typedef std::list<GpuTlbEntry*> EntryList;

        uint32_t configAddress;

        // TLB clock: will inherit the clock from the shader's clock period,
        // in terms of number of ticks of curTime (aka the global simulation
        // clock). The assignment of the TLB clock from the shader clock is
        // done in the python config files.
        int clock;

      public:
        // clock related functions; map to-and-from simulation ticks and
        // object clocks.
        Tick frequency() const { return SimClock::Frequency / clock; }

        Tick
        ticks(int numCycles) const
        {
            return (Tick)clock * numCycles;
        }

        Tick curCycle() const { return curTick() / clock; }
        Tick tickToCycles(Tick val) const { return val / clock; }

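The conversions above are plain integer arithmetic on the clock period. A standalone sketch with an assumed period of 500 ticks (illustrative; the real value comes from the shader's clock in the python config):

#include <cassert>
#include <cstdint>

int main()
{
    const uint64_t clock = 500;  // period in ticks

    // mirrors ticks() and tickToCycles() above
    auto ticks = [&](uint64_t cycles) { return clock * cycles; };
    auto tickToCycles = [&](uint64_t t) { return t / clock; };

    assert(ticks(4) == 2000);
    assert(tickToCycles(2000) == 4);
    return 0;
}
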
        typedef X86GPUTLBParams Params;
        GpuTLB(const Params *p);
        ~GpuTLB();

        typedef enum BaseTLB::Mode Mode;

        class Translation
        {
          public:
            virtual ~Translation() { }

            /**
             * Signal that the translation has been delayed due to a hw page
             * table walk.
             */
            virtual void markDelayed() = 0;

            /**
             * The memory for this object may be dynamically allocated, and
             * it may be responsible for cleaning itself up, which will
             * happen in this function. Once it's called, the object is no
             * longer valid.
             */
            virtual void finish(Fault fault, RequestPtr req,
                                ThreadContext *tc, Mode mode) = 0;
        };

        void dumpAll();
        GpuTlbEntry *lookup(Addr va, bool update_lru=true);
        void setConfigAddress(uint32_t addr);

      protected:
        EntryList::iterator lookupIt(Addr va, bool update_lru=true);
        Walker *walker;

      public:
        Walker *getWalker();
        void invalidateAll();
        void invalidateNonGlobal();
        void demapPage(Addr va, uint64_t asn);

      protected:
        int size;
        int assoc;
        int numSets;

        /**
         * true if this is a fully-associative TLB
         */
        bool FA;
        Addr setMask;

        /**
         * Allocation Policy: true if we always allocate on a hit, false
         * otherwise. Default is true.
         */
        bool allocationPolicy;

        /**
         * if true, then this is not the last level TLB
         */
        bool hasMemSidePort;

        /**
         * Print out accessDistance stats. One stat file
         * per TLB.
         */
        bool accessDistance;

        GpuTlbEntry *tlb;

        /*
         * It's a per-set list. As long as we have not reached
         * the full capacity of the given set, grab an entry from
         * the freeList.
         */
        std::vector<EntryList> freeList;

        /**
         * An entryList per set is the equivalent of an LRU stack;
         * it's used to guide replacement decisions. The head of the list
         * contains the MRU TLB entry of the given set. If the freeList
         * for this set is empty, the last element of the list
         * is evicted (i.e., dropped on the floor).
         */
        std::vector<EntryList> entryList;

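The freeList/entryList pair above implements a per-set LRU stack. A standalone sketch of one set's policy, with a simplified Entry in place of GpuTlbEntry (names are illustrative):

#include <cassert>
#include <iterator>
#include <list>
#include <vector>

struct Entry { long vpn = -1; };
using EntryList = std::list<Entry*>;

struct SetLru
{
    EntryList freeList;   // unused entries of this set
    EntryList entryList;  // MRU at front, LRU at back

    Entry *insert(long vpn)
    {
        Entry *e;
        if (!freeList.empty()) {   // set not yet full: grab a free entry
            e = freeList.front();
            freeList.pop_front();
        } else {                   // set full: evict the LRU (list tail)
            e = entryList.back();
            entryList.pop_back();
        }
        e->vpn = vpn;
        entryList.push_front(e);   // new entry becomes MRU
        return e;
    }

    void touch(EntryList::iterator it)
    {
        // on a hit, move the entry to the MRU position
        entryList.splice(entryList.begin(), entryList, it);
    }
};

int main()
{
    std::vector<Entry> storage(2);     // a 2-way set
    SetLru set;
    for (auto &e : storage)
        set.freeList.push_back(&e);

    set.insert(0x1);
    set.insert(0x2);                               // MRU order: 0x2, 0x1
    set.touch(std::next(set.entryList.begin()));   // hit on 0x1 -> MRU
    set.insert(0x3);                               // full: evicts LRU (0x2)
    assert(set.entryList.front()->vpn == 0x3);
    assert(set.entryList.back()->vpn == 0x1);
    return 0;
}
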
        Fault translateInt(RequestPtr req, ThreadContext *tc);

        Fault translate(RequestPtr req, ThreadContext *tc,
                        Translation *translation, Mode mode,
                        bool &delayedResponse, bool timing, int &latency);

      public:
        // latencies for a TLB hit, miss and page fault
        int hitLatency;
        int missLatency1;
        int missLatency2;

        // local_stats are as seen from the TLB
        // without taking into account coalescing
        Stats::Scalar localNumTLBAccesses;
        Stats::Scalar localNumTLBHits;
        Stats::Scalar localNumTLBMisses;
        Stats::Formula localTLBMissRate;

        // global_stats are as seen from the
        // CU's perspective taking into account
        // all coalesced requests.
        Stats::Scalar globalNumTLBAccesses;
        Stats::Scalar globalNumTLBHits;
        Stats::Scalar globalNumTLBMisses;
        Stats::Formula globalTLBMissRate;

        // from the CU perspective (global)
        Stats::Scalar accessCycles;
        // from the CU perspective (global)
        Stats::Scalar pageTableCycles;
        Stats::Scalar numUniquePages;
        // from the perspective of this TLB
        Stats::Scalar localCycles;
        // from the perspective of this TLB
        Stats::Formula localLatency;
        // take the avg. per page and then
        // the avg. over all pages.
        Stats::Scalar avgReuseDistance;

        void regStats();
        void updatePageFootprint(Addr virt_page_addr);
        void printAccessPattern();

        Fault translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode,
                              int &latency);

        void translateTiming(RequestPtr req, ThreadContext *tc,
                             Translation *translation, Mode mode,
                             int &latency);

        Tick doMmuRegRead(ThreadContext *tc, Packet *pkt);
        Tick doMmuRegWrite(ThreadContext *tc, Packet *pkt);

        GpuTlbEntry *insert(Addr vpn, GpuTlbEntry &entry);

        // Checkpointing
        virtual void serialize(CheckpointOut &cp) const;
        virtual void unserialize(CheckpointIn &cp);
        void issueTranslation();
        enum tlbOutcome {TLB_HIT, TLB_MISS, PAGE_WALK, MISS_RETURN};
        bool tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats);

        void handleTranslationReturn(Addr addr, tlbOutcome outcome,
                                     PacketPtr pkt);

        void handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome outcome);

        void pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
                                    GpuTlbEntry *tlb_entry, Mode mode);

        void updatePhysAddresses(Addr virt_page_addr, GpuTlbEntry *tlb_entry,
                                 Addr phys_page_addr);

        void issueTLBLookup(PacketPtr pkt);

        // CpuSidePort is the TLB Port closer to the CPU/CU side
        class CpuSidePort : public SlavePort
        {
          public:
            CpuSidePort(const std::string &_name, GpuTLB *gpu_TLB,
                        PortID _index)
                : SlavePort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }

          protected:
            GpuTLB *tlb;
            int index;

            virtual bool recvTimingReq(PacketPtr pkt);
            virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
            virtual void recvFunctional(PacketPtr pkt);
            virtual void recvRangeChange() { }
            virtual void recvReqRetry();
            virtual void recvRespRetry() { assert(false); }
            virtual AddrRangeList getAddrRanges() const;
        };

        /**
         * MemSidePort is the TLB Port closer to the memory side.
         * If this is a last level TLB, then this port will not be connected.
         *
         * Future action item: if we ever do real page walks, then this port
         * should be connected to a RubyPort.
         */
        class MemSidePort : public MasterPort
        {
          public:
            MemSidePort(const std::string &_name, GpuTLB *gpu_TLB,
                        PortID _index)
                : MasterPort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }

            std::deque<PacketPtr> retries;

          protected:
            GpuTLB *tlb;
            int index;

            virtual bool recvTimingResp(PacketPtr pkt);
            virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
            virtual void recvFunctional(PacketPtr pkt) { }
            virtual void recvRangeChange() { }
            virtual void recvReqRetry();
        };

        // TLB ports on the cpu side
        std::vector<CpuSidePort*> cpuSidePort;
        // TLB ports on the memory side
        std::vector<MemSidePort*> memSidePort;

        BaseMasterPort &getMasterPort(const std::string &if_name,
                                      PortID idx=InvalidPortID);

        BaseSlavePort &getSlavePort(const std::string &if_name,
                                    PortID idx=InvalidPortID);

        /**
         * TLB TranslationState: this is currently somewhat of a
         * bastardization of the usage of SenderState, whereby the receiver
         * of a packet is not usually supposed to look at the contents of
         * the senderState; you're really only supposed to look at what you
         * pushed on, pop it off, and send it back.
         *
         * However, since there is state that we want to pass to the TLBs
         * using the send/recv Timing/Functional/etc. APIs, which don't
         * allow for new arguments, we need a common TLB senderState to pass
         * between TLBs, both "forwards" and "backwards."
         *
         * So, basically, the rule is that any packet received by a TLB port
         * (cpuside OR memside) must be safely castable to a
         * TranslationState.
         */

        struct TranslationState : public Packet::SenderState
        {
            // TLB mode, read or write
            Mode tlbMode;
            // Thread context associated with this req
            ThreadContext *tc;

            /*
             * TLB entry to be populated and passed back, and filled in
             * previous TLBs. Equivalent to the data cache concept of
             * "data return."
             */
            GpuTlbEntry *tlbEntry;
            // Is this a TLB prefetch request?
            bool prefetch;
            // When was the req for this translation issued
            uint64_t issueTime;
            // Remember where this came from
            std::vector<SlavePort*> ports;

            // keep track of #uncoalesced reqs per packet per TLB level;
            // reqCnt per level >= reqCnt of the higher level
            std::vector<int> reqCnt;
            // TLB level this packet hit in; 0 if it hit in the page table
            int hitLevel;
            Packet::SenderState *saved;

            TranslationState(Mode tlb_mode, ThreadContext *_tc,
                             bool _prefetch=false,
                             Packet::SenderState *_saved=nullptr)
                : tlbMode(tlb_mode), tc(_tc), tlbEntry(nullptr),
                  prefetch(_prefetch), issueTime(0),
                  hitLevel(0), saved(_saved) { }
        };

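The push/pop discipline for senderState described above, in standalone form. PacketSketch and TranslationStateSketch are pared-down stand-ins for gem5's Packet and the TranslationState struct; the real types carry much more state:

#include <cassert>

struct SenderState { SenderState *saved = nullptr; };

struct TranslationStateSketch : SenderState
{
    bool prefetch;
    int hitLevel;
    TranslationStateSketch(bool p) : prefetch(p), hitLevel(0) { }
};

struct PacketSketch { SenderState *senderState = nullptr; };

int main()
{
    PacketSketch pkt;

    // sender pushes its state, preserving whatever was there before
    auto *ts = new TranslationStateSketch(false);
    ts->saved = pkt.senderState;
    pkt.senderState = ts;

    // ...packet traverses the TLB hierarchy...

    // receiver pops its own state back off and restores the saved one
    auto *popped = static_cast<TranslationStateSketch*>(pkt.senderState);
    pkt.senderState = popped->saved;
    assert(!popped->prefetch);
    delete popped;
    return 0;
}
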
        // maximum number of permitted coalesced requests per cycle
        int maxCoalescedReqs;

        // Current number of outstanding coalesced requests.
        // Should be <= maxCoalescedReqs
        int outstandingReqs;

        /**
         * A TLBEvent is scheduled after the TLB lookup and helps us take
         * the appropriate actions:
         * (e.g., update TLB on a hit,
         * send request to lower level TLB on a miss,
         * or start a page walk if this was the last-level TLB).
         */
        void translationReturn(Addr virtPageAddr, tlbOutcome outcome,
                               PacketPtr pkt);

        class TLBEvent : public Event
        {
          private:
            GpuTLB *tlb;
            Addr virtPageAddr;
            /**
             * outcome can be TLB_HIT, TLB_MISS, or PAGE_WALK
             */
            tlbOutcome outcome;
            PacketPtr pkt;

          public:
            TLBEvent(GpuTLB *_tlb, Addr _addr, tlbOutcome outcome,
                     PacketPtr _pkt);

            void process();
            const char *description() const;

            // updateOutcome updates the tlbOutcome of a TLBEvent
            void updateOutcome(tlbOutcome _outcome);
            Addr getTLBEventVaddr();
        };

        std::unordered_map<Addr, TLBEvent*> translationReturnEvent;

        // this FIFO queue keeps track of the virt. page addresses
        // that are pending cleanup
        std::queue<Addr> cleanupQueue;

        // the cleanupEvent is scheduled after a TLBEvent triggers in order
        // to free memory and do the required clean-up
        void cleanup();

        EventWrapper<GpuTLB, &GpuTLB::cleanup> cleanupEvent;

        /**
         * This hash map will use the virtual page address as a key
         * and will keep track of total number of accesses per page
         */

        struct AccessInfo
        {
            unsigned int lastTimeAccessed; // last access to this page
            unsigned int accessesPerPage;
            // need to divide it by accessesPerPage at the end
            unsigned int totalReuseDistance;

            /**
             * The field below will help us compute the access distance,
             * that is the number of (coalesced) TLB accesses that
             * happened in between each access to this page
             *
             * localTLBAccesses[x] is the value of localTLBNumAccesses
             * when the page <Addr> was accessed for the <x>th time
             */
            std::vector<unsigned int> localTLBAccesses;
            unsigned int sumDistance;
            unsigned int meanDistance;
        };

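The localTLBAccesses bookkeeping above makes the access distance a simple difference of successive counter samples. A standalone sketch of the computation (the sample values are invented):

#include <cassert>
#include <vector>

int main()
{
    // global (coalesced) TLB access counter values recorded each time one
    // particular page was accessed, i.e. AccessInfo::localTLBAccesses
    std::vector<unsigned> localTLBAccesses = {3, 7, 12};

    unsigned sumDistance = 0;
    for (size_t x = 1; x < localTLBAccesses.size(); ++x)
        sumDistance += localTLBAccesses[x] - localTLBAccesses[x - 1];

    // mean number of other TLB accesses between touches of this page
    unsigned meanDistance = sumDistance / (localTLBAccesses.size() - 1);
    assert(meanDistance == 4);  // (4 + 5) / 2, integer division
    return 0;
}
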
        typedef std::unordered_map<Addr, AccessInfo> AccessPatternTable;
        AccessPatternTable TLBFootprint;

        // Called at the end of simulation to dump page access stats.
        void exitCallback();

        EventWrapper<GpuTLB, &GpuTLB::exitCallback> exitEvent;
    };
}

#endif // __GPU_TLB_HH__

101
src/gpu-compute/hsa_code.hh
Normal file

@@ -0,0 +1,101 @@
/*
 * Copyright (c) 2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * [BSD-3-Clause terms identical to the license header in gpu_dyn_inst.hh above]
 *
 * Author: Anthony Gutierrez
 */

#ifndef __HSA_CODE_HH__
#define __HSA_CODE_HH__

#include <string>
#include <vector>

#include "arch/gpu_types.hh"
#include "config/the_gpu_isa.hh"

class HsaKernelInfo;

/* @class HsaCode
 * Base code object for the set of HSA kernels associated
 * with a single application. This class provides the common
 * methods for creating, accessing, and storing information
 * about kernel and variable symbols, symbol names, memory
 * segment sizes, instruction count, etc.
 */

class HsaCode
{
  public:
    HsaCode(const std::string &name) : readonly_data(nullptr),
        funcarg_size(0), _name(name)
    {
    }

    enum class MemorySegment {
        NONE,
        FLAT,
        GLOBAL,
        READONLY,
        KERNARG,
        GROUP,
        PRIVATE,
        SPILL,
        ARG,
        EXTSPACE0
    };

    const std::string& name() const { return _name; }
    int numInsts() const { return _insts.size(); }
    std::vector<TheGpuISA::RawMachInst>* insts() { return &_insts; }

    void
    setReadonlyData(uint8_t *_readonly_data)
    {
        readonly_data = _readonly_data;
    }

    virtual int getSize(MemorySegment segment) const = 0;
    virtual void generateHsaKernelInfo(HsaKernelInfo *hsaKernelInfo) const = 0;

    uint8_t *readonly_data;
    int funcarg_size;

  protected:
    // An array that stores instruction indices (0 through kernel size)
    // for a kernel passed to the code object constructor as an argument.
    std::vector<TheGpuISA::RawMachInst> _insts;

  private:
    const std::string _name;
};

#endif // __HSA_CODE_HH__

79
src/gpu-compute/hsa_kernel_info.hh
Normal file

@@ -0,0 +1,79 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* For use for simulation and test purposes only
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*
|
||||||
|
* Author: Steve Reinhardt
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __HSA_KERNEL_INFO_HH__
|
||||||
|
#define __HSA_KERNEL_INFO_HH__
|
||||||
|
|
||||||
|
// This file defines the public interface between the HSA emulated
|
||||||
|
// driver and application programs.
|
||||||
|
|
||||||
|
#include <cstdint>
|
||||||
|
|
||||||
|
static const int HSA_GET_SIZES = 0x4801;
|
||||||
|
static const int HSA_GET_KINFO = 0x4802;
|
||||||
|
static const int HSA_GET_STRINGS = 0x4803;
|
||||||
|
static const int HSA_GET_CODE = 0x4804;
|
||||||
|
static const int HSA_GET_READONLY_DATA = 0x4805;
|
||||||
|
static const int HSA_GET_CU_CNT = 0x4806;
|
||||||
|
static const int HSA_GET_VSZ = 0x4807;
|
||||||
|
|
||||||
|
// Return value (via buffer ptr) for HSA_GET_SIZES
|
||||||
|
struct HsaDriverSizes
|
||||||
|
{
|
||||||
|
uint32_t num_kernels;
|
||||||
|
uint32_t string_table_size;
|
||||||
|
uint32_t code_size;
|
||||||
|
uint32_t readonly_size;
|
||||||
|
};
|
||||||
|
|
||||||
|
// HSA_GET_KINFO returns an array of num_kernels of these structs
|
||||||
|
struct HsaKernelInfo
|
||||||
|
{
|
||||||
|
// byte offset into string table
|
||||||
|
uint32_t name_offs;
|
||||||
|
// byte offset into code array
|
||||||
|
uint32_t code_offs;
|
||||||
|
uint32_t static_lds_size;
|
||||||
|
uint32_t private_mem_size;
|
||||||
|
uint32_t spill_mem_size;
|
||||||
|
// Number of s registers
|
||||||
|
uint32_t sRegCount;
|
||||||
|
// Number of d registers
|
||||||
|
uint32_t dRegCount;
|
||||||
|
// Number of c registers
|
||||||
|
uint32_t cRegCount;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif // __HSA_KERNEL_INFO_HH__
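
Since this header is the driver/application contract, a hedged sketch of the application side may help; it assumes the emulated driver is reachable through an ioctl-style call on an already-open descriptor (the fd and helper name are hypothetical, only the request codes and structs come from the header above):

// Hypothetical application-side use of the request codes; not part of
// the commit.
#include <sys/ioctl.h>
#include <vector>

void
queryKernels(int hsa_fd) // hsa_fd: assumed descriptor for the emulated driver
{
    HsaDriverSizes sizes;
    ioctl(hsa_fd, HSA_GET_SIZES, &sizes);

    // HSA_GET_KINFO fills one HsaKernelInfo per kernel.
    std::vector<HsaKernelInfo> kinfo(sizes.num_kernels);
    ioctl(hsa_fd, HSA_GET_KINFO, kinfo.data());

    // name_offs then indexes the table returned by HSA_GET_STRINGS, and
    // code_offs indexes the code array returned by HSA_GET_CODE.
}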

76
src/gpu-compute/hsa_object.cc
Normal file
@@ -0,0 +1,76 @@
/*
 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Anthony Gutierrez
 */

#include "gpu-compute/hsa_object.hh"

#include <fstream>

#include "gpu-compute/brig_object.hh"

HsaObject::HsaObject(const std::string &fname)
    : readonlyData(nullptr), filename(fname)
{
}

HsaObject*
HsaObject::createHsaObject(const std::string &fname)
{
    HsaObject *hsaObj = nullptr;
    uint8_t *file_data = nullptr;
    int file_length = 0;

    std::ifstream code_file(fname, std::ifstream::ate | std::ifstream::in |
                            std::ifstream::binary);

    assert(code_file.is_open());
    assert(code_file.good());

    file_length = code_file.tellg();
    code_file.seekg(0, code_file.beg);
    file_data = new uint8_t[file_length];
    code_file.read((char*)file_data, file_length);
    code_file.close();

    for (const auto &tryFile : tryFileFuncs) {
        if ((hsaObj = tryFile(fname, file_length, file_data))) {
            return hsaObj;
        }
    }

    delete[] file_data;
    fatal("Unknown HSA object type for file: %s.\n", fname);

    return nullptr;
}
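
createHsaObject() probes the raw file bytes against each registered loader in tryFileFuncs until one accepts them. A hedged sketch of how an additional container format could hook in; the probe name and magic check are hypothetical, and only the tryFileFuncs signature comes from this commit:

// Hypothetical format probe, for illustration only. A real probe
// returns nullptr quickly when the magic bytes do not match, so other
// formats fall through to the next entry in tryFileFuncs.
#include <cstring>

static HsaObject*
tryXyzFile(const std::string &fname, int len, uint8_t *data)
{
    if (len < 4 || std::memcmp(data, "XYZ\0", 4) != 0)
        return nullptr;

    return nullptr; // a real loader would construct its HsaObject here
}

// Registration would then mirror how the BRIG loader populates the
// vector, e.g. (sketch):
// std::vector<std::function<HsaObject*(const std::string&, int, uint8_t*)>>
//     HsaObject::tryFileFuncs = { tryXyzFile, BrigObject::tryFile };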

74
src/gpu-compute/hsa_object.hh
Normal file
@@ -0,0 +1,74 @@
/*
 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Anthony Gutierrez
 */

#ifndef __HSA_OBJECT_HH__
#define __HSA_OBJECT_HH__

#include <cstdint>
#include <functional>
#include <string>
#include <vector>

class HsaCode;

/* @class HsaObject
 * base loader object for HSA kernels. this class provides
 * the base method definitions for loading HSA kernel objects
 * into the simulator and for storing and accessing them.
 */

class HsaObject
{
  public:
    HsaObject(const std::string &fileName);

    static HsaObject* createHsaObject(const std::string &fname);
    static std::vector<std::function<HsaObject*(const std::string&, int,
                                                uint8_t*)>> tryFileFuncs;

    virtual HsaCode* getKernel(const std::string &name) const = 0;
    virtual HsaCode* getKernel(int i) const = 0;
    virtual HsaCode* getFunction(const std::string &name) const = 0;
    virtual int numKernels() const = 0;

    const std::string& name() const { return filename; }

    uint8_t *readonlyData;

  protected:
    const std::string filename;
};

#endif // __HSA_OBJECT_HH__

453
src/gpu-compute/hsail_code.cc
Normal file
@@ -0,0 +1,453 @@
/*
 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */

#include "gpu-compute/hsail_code.hh"

#include "arch/gpu_types.hh"
#include "arch/hsail/Brig.h"
#include "arch/hsail/operand.hh"
#include "config/the_gpu_isa.hh"
#include "debug/BRIG.hh"
#include "debug/HSAILObject.hh"
#include "gpu-compute/brig_object.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/kernel_cfg.hh"

using namespace Brig;

int getBrigDataTypeBytes(BrigType16_t t);

HsailCode::HsailCode(const std::string &name_str)
    : HsaCode(name_str), private_size(-1), readonly_size(-1)
{
}

void
HsailCode::init(const BrigDirectiveExecutable *code_dir, const BrigObject *obj,
                StorageMap *objStorageMap)
{
    storageMap = objStorageMap;

    // set pointer so that the decoding process can find this kernel
    // context when needed
    obj->currentCode = this;

    if (code_dir->base.kind != BRIG_KIND_DIRECTIVE_FUNCTION &&
        code_dir->base.kind != BRIG_KIND_DIRECTIVE_KERNEL) {
        fatal("unexpected directive kind %d inside kernel/function init\n",
              code_dir->base.kind);
    }

    DPRINTF(HSAILObject, "Initializing code, first code block entry is: %d\n",
            code_dir->firstCodeBlockEntry);

    // clear these static vars so we can properly track the max index
    // for this kernel
    SRegOperand::maxRegIdx = 0;
    DRegOperand::maxRegIdx = 0;
    CRegOperand::maxRegIdx = 0;
    setPrivateSize(0);

    const BrigBase *entryPtr = brigNext((BrigBase*)code_dir);
    const BrigBase *endPtr =
        obj->getCodeSectionEntry(code_dir->nextModuleEntry);

    int inst_idx = 0;
    std::vector<GPUStaticInst*> instructions;
    int funcarg_size_scope = 0;

    // walk through instructions in the code section and directives in the
    // directive section in parallel, processing directives that apply
    // when we reach the relevant code point.
    while (entryPtr < endPtr) {
        switch (entryPtr->kind) {
          case BRIG_KIND_DIRECTIVE_VARIABLE:
            {
                const BrigDirectiveVariable *sym =
                    (const BrigDirectiveVariable*)entryPtr;

                DPRINTF(HSAILObject, "Initializing code, directive is "
                        "kind_variable, symbol is: %s\n",
                        obj->getString(sym->name));

                StorageElement *se = storageMap->addSymbol(sym, obj);

                if (sym->segment == BRIG_SEGMENT_PRIVATE) {
                    setPrivateSize(se->size);
                } else { // spill
                    funcarg_size_scope += se->size;
                }
            }
            break;

          case BRIG_KIND_DIRECTIVE_LABEL:
            {
                const BrigDirectiveLabel *lbl =
                    (const BrigDirectiveLabel*)entryPtr;

                DPRINTF(HSAILObject, "Initializing code, directive is "
                        "kind_label, label is: %s\n",
                        obj->getString(lbl->name));

                labelMap.addLabel(lbl, inst_idx, obj);
            }
            break;

          case BRIG_KIND_DIRECTIVE_PRAGMA:
            {
                DPRINTF(HSAILObject, "Initializing code, directive "
                        "is kind_pragma\n");
            }
            break;

          case BRIG_KIND_DIRECTIVE_COMMENT:
            {
                DPRINTF(HSAILObject, "Initializing code, directive is "
                        "kind_comment\n");
            }
            break;

          case BRIG_KIND_DIRECTIVE_ARG_BLOCK_START:
            {
                DPRINTF(HSAILObject, "Initializing code, directive is "
                        "kind_arg_block_start\n");

                storageMap->resetOffset(BRIG_SEGMENT_ARG);
                funcarg_size_scope = 0;
            }
            break;

          case BRIG_KIND_DIRECTIVE_ARG_BLOCK_END:
            {
                DPRINTF(HSAILObject, "Initializing code, directive is "
                        "kind_arg_block_end\n");

                funcarg_size = funcarg_size < funcarg_size_scope ?
                               funcarg_size_scope : funcarg_size;
            }
            break;

          case BRIG_KIND_DIRECTIVE_END:
            DPRINTF(HSAILObject, "Initializing code, directive is "
                    "kind_end\n");

            break;

          default:
            if (entryPtr->kind >= BRIG_KIND_INST_BEGIN &&
                entryPtr->kind <= BRIG_KIND_INST_END) {

                BrigInstBase *instPtr = (BrigInstBase*)entryPtr;
                TheGpuISA::MachInst machInst = { instPtr, obj };
                GPUStaticInst *iptr = decoder.decode(machInst);

                if (iptr) {
                    DPRINTF(HSAILObject, "Initializing code, processing inst "
                            "#%d idx %d: OPCODE=%d\n",
                            inst_idx, _insts.size(), instPtr->opcode);

                    TheGpuISA::RawMachInst inst_num = decoder.saveInst(iptr);
                    iptr->instNum(inst_idx);
                    _insts.push_back(inst_num);
                    instructions.push_back(iptr);
                }
                ++inst_idx;
            } else if (entryPtr->kind >= BRIG_KIND_OPERAND_BEGIN &&
                       entryPtr->kind < BRIG_KIND_OPERAND_END) {
                warn("unexpected operand entry in code segment\n");
            } else {
                // there are surely some more cases we will need to handle,
                // but we'll deal with them as we find them.
                fatal("unexpected directive kind %d inside kernel scope\n",
                      entryPtr->kind);
            }
        }

        entryPtr = brigNext(entryPtr);
    }

    // compute the control flow graph for the current kernel
    ControlFlowInfo::assignImmediatePostDominators(instructions);

    max_sreg = SRegOperand::maxRegIdx;
    max_dreg = DRegOperand::maxRegIdx;
    max_creg = CRegOperand::maxRegIdx;

    obj->currentCode = nullptr;
}

HsailCode::HsailCode(const std::string &name_str,
                     const BrigDirectiveExecutable *code_dir,
                     const BrigObject *obj, StorageMap *objStorageMap)
    : HsaCode(name_str), private_size(-1), readonly_size(-1)
{
    init(code_dir, obj, objStorageMap);
}

void
LabelMap::addLabel(const Brig::BrigDirectiveLabel *lblDir, int inst_index,
                   const BrigObject *obj)
{
    std::string lbl_name = obj->getString(lblDir->name);
    Label &lbl = map[lbl_name];

    if (lbl.defined()) {
        fatal("Attempt to redefine existing label %s\n", lbl_name);
    }

    lbl.define(lbl_name, inst_index);
    DPRINTF(HSAILObject, "label %s = %d\n", lbl_name, inst_index);
}

Label*
LabelMap::refLabel(const Brig::BrigDirectiveLabel *lblDir,
                   const BrigObject *obj)
{
    std::string name = obj->getString(lblDir->name);
    Label &lbl = map[name];
    lbl.checkName(name);

    return &lbl;
}

int
getBrigDataTypeBytes(BrigType16_t t)
{
    switch (t) {
      case BRIG_TYPE_S8:
      case BRIG_TYPE_U8:
      case BRIG_TYPE_B8:
        return 1;

      case BRIG_TYPE_S16:
      case BRIG_TYPE_U16:
      case BRIG_TYPE_B16:
      case BRIG_TYPE_F16:
        return 2;

      case BRIG_TYPE_S32:
      case BRIG_TYPE_U32:
      case BRIG_TYPE_B32:
      case BRIG_TYPE_F32:
        return 4;

      case BRIG_TYPE_S64:
      case BRIG_TYPE_U64:
      case BRIG_TYPE_B64:
      case BRIG_TYPE_F64:
        return 8;

      case BRIG_TYPE_B1:

      default:
        fatal("unhandled symbol data type %d", t);
        return 0;
    }
}

StorageElement*
StorageSpace::addSymbol(const BrigDirectiveVariable *sym,
                        const BrigObject *obj)
{
    const char *sym_name = obj->getString(sym->name);
    uint64_t size = 0;
    uint64_t offset = 0;

    if (sym->type & BRIG_TYPE_ARRAY) {
        size = getBrigDataTypeBytes(sym->type & ~BRIG_TYPE_ARRAY);
        size *= (((uint64_t)sym->dim.hi) << 32 | (uint64_t)sym->dim.lo);

        offset = roundUp(nextOffset, getBrigDataTypeBytes(sym->type &
                         ~BRIG_TYPE_ARRAY));
    } else {
        size = getBrigDataTypeBytes(sym->type);
        offset = roundUp(nextOffset, getBrigDataTypeBytes(sym->type));
    }

    nextOffset = offset + size;

    DPRINTF(HSAILObject, "Adding %s SYMBOL %s size %d offset 0x%x, init: %d\n",
            segmentNames[segment], sym_name, size, offset, sym->init);

    StorageElement* se = new StorageElement(sym_name, offset, size, sym);
    elements.push_back(se);
    elements_by_addr.insert(AddrRange(offset, offset + size - 1), se);
    elements_by_brigptr[sym] = se;

    return se;
}

StorageElement*
StorageSpace::findSymbol(std::string name)
{
    for (auto it : elements) {
        if (it->name == name) {
            return it;
        }
    }

    return nullptr;
}

StorageElement*
StorageSpace::findSymbol(uint64_t addr)
{
    assert(elements_by_addr.size() > 0);

    auto se = elements_by_addr.find(addr);

    if (se == elements_by_addr.end()) {
        return nullptr;
    } else {
        return se->second;
    }
}

StorageElement*
StorageSpace::findSymbol(const BrigDirectiveVariable *brigptr)
{
    assert(elements_by_brigptr.size() > 0);

    auto se = elements_by_brigptr.find(brigptr);

    if (se == elements_by_brigptr.end()) {
        return nullptr;
    } else {
        return se->second;
    }
}

StorageMap::StorageMap(StorageMap *outerScope)
    : outerScopeMap(outerScope)
{
    for (int i = 0; i < NumSegments; ++i)
        space[i] = new StorageSpace((BrigSegment)i);
}

StorageElement*
StorageMap::addSymbol(const BrigDirectiveVariable *sym, const BrigObject *obj)
{
    BrigSegment8_t segment = sym->segment;

    assert(segment >= Brig::BRIG_SEGMENT_FLAT);
    assert(segment < NumSegments);

    return space[segment]->addSymbol(sym, obj);
}

int
StorageMap::getSize(Brig::BrigSegment segment)
{
    assert(segment > Brig::BRIG_SEGMENT_GLOBAL);
    assert(segment < NumSegments);

    if (segment != Brig::BRIG_SEGMENT_GROUP &&
        segment != Brig::BRIG_SEGMENT_READONLY) {
        return space[segment]->getSize();
    } else {
        int ret = space[segment]->getSize();

        if (outerScopeMap) {
            ret += outerScopeMap->getSize(segment);
        }

        return ret;
    }
}

void
StorageMap::resetOffset(Brig::BrigSegment segment)
{
    space[segment]->resetOffset();
}

StorageElement*
StorageMap::findSymbol(BrigSegment segment, std::string name)
{
    StorageElement *se = space[segment]->findSymbol(name);

    if (se)
        return se;

    if (outerScopeMap)
        return outerScopeMap->findSymbol(segment, name);

    return nullptr;
}

StorageElement*
StorageMap::findSymbol(Brig::BrigSegment segment, uint64_t addr)
{
    StorageSpace *sp = space[segment];

    if (!sp) {
        // there is no memory in this segment
        return nullptr;
    }

    StorageElement *se = sp->findSymbol(addr);

    if (se)
        return se;

    if (outerScopeMap)
        return outerScopeMap->findSymbol(segment, addr);

    return nullptr;
}

StorageElement*
StorageMap::findSymbol(Brig::BrigSegment segment,
                       const BrigDirectiveVariable *brigptr)
{
    StorageSpace *sp = space[segment];

    if (!sp) {
        // there is no memory in this segment
        return nullptr;
    }

    StorageElement *se = sp->findSymbol(brigptr);

    if (se)
        return se;

    if (outerScopeMap)
        return outerScopeMap->findSymbol(segment, brigptr);

    return nullptr;
}
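
StorageSpace::addSymbol() above packs each symbol at the next naturally aligned offset in its segment via roundUp(). A quick worked example of that arithmetic as a standalone sketch; roundUpLocal is a stand-in for gem5's roundUp() from base/intmath.hh, and the symbol sizes are made up for illustration:

// Standalone illustration of the offset/alignment rule used above.
#include <cassert>
#include <cstdint>

static uint64_t
roundUpLocal(uint64_t v, uint64_t align)
{
    return ((v + align - 1) / align) * align;
}

int main()
{
    uint64_t nextOffset = 0;

    // an s32 scalar: 4 bytes, 4-byte aligned -> offset 0, nextOffset 4
    uint64_t off_a = roundUpLocal(nextOffset, 4);
    nextOffset = off_a + 4;
    assert(off_a == 0 && nextOffset == 4);

    // an f64[3] array: element size 8, so it is 8-byte aligned and
    // 24 bytes long -> offset 8 (4 bytes of padding), nextOffset 32
    uint64_t off_b = roundUpLocal(nextOffset, 8);
    nextOffset = off_b + 8 * 3;
    assert(off_b == 8 && nextOffset == 32);

    return 0;
}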

447
src/gpu-compute/hsail_code.hh
Normal file
@@ -0,0 +1,447 @@
/*
 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */

#ifndef __HSAIL_CODE_HH__
#define __HSAIL_CODE_HH__

#include <cassert>
#include <list>
#include <map>
#include <string>
#include <vector>

#include "arch/gpu_decoder.hh"
#include "arch/hsail/Brig.h"
#include "base/addr_range_map.hh"
#include "base/intmath.hh"
#include "config/the_gpu_isa.hh"
#include "gpu-compute/hsa_code.hh"
#include "gpu-compute/hsa_kernel_info.hh"
#include "gpu-compute/misc.hh"

class BrigObject;
class GPUStaticInst;

inline int
popcount(uint64_t src, int sz)
{
    int cnt = 0;

    for (int i = 0; i < sz; ++i) {
        if (src & 1)
            ++cnt;
        src >>= 1;
    }

    return cnt;
}

inline int
firstbit(uint64_t src, int sz)
{
    int i;

    for (i = 0; i < sz; ++i) {
        if (src & 1)
            break;
        src >>= 1;
    }

    return i;
}

inline int
lastbit(uint64_t src, int sz)
{
    int i0 = -1;

    for (int i = 0; i < sz; ++i) {
        if (src & 1)
            i0 = i;
        src >>= 1;
    }

    return i0;
}

inline int
signbit(uint64_t src, int sz)
{
    int i0 = -1;

    if (src & (1ULL << (sz - 1))) {
        for (int i = 0; i < sz - 1; ++i) {
            if (!(src & 1))
                i0 = i;
            src >>= 1;
        }
    } else {
        for (int i = 0; i < sz - 1; ++i) {
            if (src & 1)
                i0 = i;
            src >>= 1;
        }
    }

    return i0;
}

inline uint64_t
bitrev(uint64_t src, int sz)
{
    uint64_t r = 0;

    for (int i = 0; i < sz; ++i) {
        r <<= 1;
        if (src & 1)
            r |= 1;
        src >>= 1;
    }

    return r;
}

inline uint64_t
mul_hi(uint32_t a, uint32_t b)
{
    return ((uint64_t)a * (uint64_t)b) >> 32;
}

inline uint64_t
mul_hi(int32_t a, int32_t b)
{
    return ((int64_t)a * (int64_t)b) >> 32;
}

inline uint64_t
mul_hi(uint64_t a, uint64_t b)
{
    return ((uint64_t)a * (uint64_t)b) >> 32;
}

inline uint64_t
mul_hi(int64_t a, int64_t b)
{
    return ((int64_t)a * (int64_t)b) >> 32;
}

inline uint64_t
mul_hi(double a, double b)
{
    return 0;
}

class Label
{
  public:
    std::string name;
    int value;

    Label() : value(-1)
    {
    }

    bool defined() { return value != -1; }

    void
    checkName(std::string &_name)
    {
        if (name.empty()) {
            name = _name;
        } else {
            assert(name == _name);
        }
    }

    void
    define(std::string &_name, int _value)
    {
        assert(!defined());
        assert(_value != -1);
        value = _value;
        checkName(_name);
    }

    int
    get()
    {
        assert(defined());
        return value;
    }
};

class LabelMap
{
    std::map<std::string, Label> map;

  public:
    LabelMap() { }

    void addLabel(const Brig::BrigDirectiveLabel *lbl, int inst_index,
                  const BrigObject *obj);

    Label *refLabel(const Brig::BrigDirectiveLabel *lbl,
                    const BrigObject *obj);
};

const int NumSegments = Brig::BRIG_SEGMENT_AMD_GCN;

extern const char *segmentNames[];

class StorageElement
{
  public:
    std::string name;
    uint64_t offset;

    uint64_t size;
    const Brig::BrigDirectiveVariable *brigSymbol;

    StorageElement(const char *_name, uint64_t _offset, int _size,
                   const Brig::BrigDirectiveVariable *sym)
        : name(_name), offset(_offset), size(_size), brigSymbol(sym)
    {
    }
};

class StorageSpace
{
    typedef std::map<const Brig::BrigDirectiveVariable*, StorageElement*>
        DirVarToSE_map;

    std::list<StorageElement*> elements;
    AddrRangeMap<StorageElement*> elements_by_addr;
    DirVarToSE_map elements_by_brigptr;

    uint64_t nextOffset;
    Brig::BrigSegment segment;

  public:
    StorageSpace(Brig::BrigSegment _class)
        : nextOffset(0), segment(_class)
    {
    }

    StorageElement *addSymbol(const Brig::BrigDirectiveVariable *sym,
                              const BrigObject *obj);

    StorageElement* findSymbol(std::string name);
    StorageElement* findSymbol(uint64_t addr);
    StorageElement* findSymbol(const Brig::BrigDirectiveVariable *brigptr);

    int getSize() { return nextOffset; }
    void resetOffset() { nextOffset = 0; }
};

class StorageMap
{
    StorageMap *outerScopeMap;
    StorageSpace *space[NumSegments];

  public:
    StorageMap(StorageMap *outerScope = nullptr);

    StorageElement *addSymbol(const Brig::BrigDirectiveVariable *sym,
                              const BrigObject *obj);

    StorageElement* findSymbol(Brig::BrigSegment segment, std::string name);
    StorageElement* findSymbol(Brig::BrigSegment segment, uint64_t addr);

    StorageElement* findSymbol(Brig::BrigSegment segment,
                               const Brig::BrigDirectiveVariable *brigptr);

    // overloaded version to avoid casting
    StorageElement*
    findSymbol(Brig::BrigSegment8_t segment, std::string name)
    {
        return findSymbol((Brig::BrigSegment)segment, name);
    }

    int getSize(Brig::BrigSegment segment);
    void resetOffset(Brig::BrigSegment segment);
};

typedef enum
{
    BT_DEFAULT,
    BT_B8,
    BT_U8,
    BT_U16,
    BT_U32,
    BT_U64,
    BT_S8,
    BT_S16,
    BT_S32,
    BT_S64,
    BT_F16,
    BT_F32,
    BT_F64,
    BT_NULL
} base_type_e;

/* @class HsailCode
 * the HsailCode class is used to store information
 * about HSA kernels stored in the BRIG format. it holds
 * all information about a kernel, function, or variable
 * symbol and provides methods for accessing that
 * information.
 */

class HsailCode final : public HsaCode
{
  public:
    TheGpuISA::Decoder decoder;

    StorageMap *storageMap;
    LabelMap labelMap;
    uint32_t kernarg_start;
    uint32_t kernarg_end;
    int32_t private_size;

    int32_t readonly_size;

    // We track the maximum register index used for each register
    // class when we load the code so we can size the register files
    // appropriately (i.e., one more than the max index).
    uint32_t max_creg; // maximum c-register index
    uint32_t max_sreg; // maximum s-register index
    uint32_t max_dreg; // maximum d-register index

    HsailCode(const std::string &name_str,
              const Brig::BrigDirectiveExecutable *code_dir,
              const BrigObject *obj,
              StorageMap *objStorageMap);

    // this version is used to create a placeholder when
    // we encounter a kernel-related directive before the
    // kernel itself
    HsailCode(const std::string &name_str);

    void init(const Brig::BrigDirectiveExecutable *code_dir,
              const BrigObject *obj, StorageMap *objStorageMap);

    void
    generateHsaKernelInfo(HsaKernelInfo *hsaKernelInfo) const
    {
        hsaKernelInfo->sRegCount = max_sreg + 1;
        hsaKernelInfo->dRegCount = max_dreg + 1;
        hsaKernelInfo->cRegCount = max_creg + 1;

        hsaKernelInfo->static_lds_size = getSize(Brig::BRIG_SEGMENT_GROUP);

        hsaKernelInfo->private_mem_size =
            roundUp(getSize(Brig::BRIG_SEGMENT_PRIVATE), 8);

        hsaKernelInfo->spill_mem_size =
            roundUp(getSize(Brig::BRIG_SEGMENT_SPILL), 8);
    }

    int
    getSize(MemorySegment segment) const
    {
        Brig::BrigSegment brigSeg;

        switch (segment) {
          case MemorySegment::NONE:
            brigSeg = Brig::BRIG_SEGMENT_NONE;
            break;
          case MemorySegment::FLAT:
            brigSeg = Brig::BRIG_SEGMENT_FLAT;
            break;
          case MemorySegment::GLOBAL:
            brigSeg = Brig::BRIG_SEGMENT_GLOBAL;
            break;
          case MemorySegment::READONLY:
            brigSeg = Brig::BRIG_SEGMENT_READONLY;
            break;
          case MemorySegment::KERNARG:
            brigSeg = Brig::BRIG_SEGMENT_KERNARG;
            break;
          case MemorySegment::GROUP:
            brigSeg = Brig::BRIG_SEGMENT_GROUP;
            break;
          case MemorySegment::PRIVATE:
            brigSeg = Brig::BRIG_SEGMENT_PRIVATE;
            break;
          case MemorySegment::SPILL:
            brigSeg = Brig::BRIG_SEGMENT_SPILL;
            break;
          case MemorySegment::ARG:
            brigSeg = Brig::BRIG_SEGMENT_ARG;
            break;
          case MemorySegment::EXTSPACE0:
            brigSeg = Brig::BRIG_SEGMENT_AMD_GCN;
            break;
          default:
            fatal("Unknown BrigSegment type.\n");
        }

        return getSize(brigSeg);
    }

  private:
    int
    getSize(Brig::BrigSegment segment) const
    {
        if (segment == Brig::BRIG_SEGMENT_PRIVATE) {
            // with the code generated by the new HSA compiler this
            // assertion no longer holds
            //assert(private_size != -1);
            return private_size;
        } else {
            return storageMap->getSize(segment);
        }
    }

  public:
    StorageElement*
    findSymbol(Brig::BrigSegment segment, uint64_t addr)
    {
        return storageMap->findSymbol(segment, addr);
    }

    void
    setPrivateSize(int32_t _private_size)
    {
        private_size = _private_size;
    }

    Label*
    refLabel(const Brig::BrigDirectiveLabel *lbl, const BrigObject *obj)
    {
        return labelMap.refLabel(lbl, obj);
    }
};

#endif // __HSAIL_CODE_HH__
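
The bit-twiddling helpers above back several HSAIL ALU operations; a few concrete values, as a quick sanity check (standalone snippet, not part of the commit, assuming hsail_code.hh is on the include path):

// Sanity checks for the inline helpers defined above.
#include <cassert>

int main()
{
    assert(popcount(0xb, 4) == 3);        // 0b1011 has three set bits
    assert(firstbit(0x8, 4) == 3);        // lowest set bit of 0b1000
    assert(firstbit(0x0, 4) == 4);        // no set bit: returns the width
    assert(lastbit(0x5, 4) == 2);         // highest set bit of 0b0101
    assert(bitrev(0x3, 4) == 0xc);        // 0b0011 reversed in a 4-bit field
    assert(mul_hi(0x80000000u, 2u) == 1); // upper 32 bits of the product
    return 0;
}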

296
src/gpu-compute/kernel_cfg.cc
Normal file
@@ -0,0 +1,296 @@
/*
 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */

#include "gpu-compute/kernel_cfg.hh"

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <iterator>
#include <map>
#include <string>

#include "gpu-compute/gpu_static_inst.hh"

void
ControlFlowInfo::assignImmediatePostDominators(
        const std::vector<GPUStaticInst*>& instructions)
{
    ControlFlowInfo cfg(instructions);
    cfg.findImmediatePostDominators();
}

ControlFlowInfo::ControlFlowInfo(const std::vector<GPUStaticInst*>& insts) :
    instructions(insts)
{
    createBasicBlocks();
    connectBasicBlocks();
}

BasicBlock*
ControlFlowInfo::basicBlock(int inst_num) const {
    for (auto& block: basicBlocks) {
        int first_block_id = block->firstInstruction->instNum();
        if (inst_num >= first_block_id &&
            inst_num < first_block_id + block->size) {
            return block.get();
        }
    }
    return nullptr;
}

GPUStaticInst*
ControlFlowInfo::lastInstruction(const BasicBlock* block) const
{
    if (block->isExit()) {
        return nullptr;
    }

    return instructions.at(block->firstInstruction->instNum() +
                           block->size - 1);
}

BasicBlock*
ControlFlowInfo::postDominator(const BasicBlock* block) const
{
    if (block->isExit()) {
        return nullptr;
    }
    return basicBlock(lastInstruction(block)->ipdInstNum());
}

void
ControlFlowInfo::createBasicBlocks()
{
    assert(!instructions.empty());
    std::set<int> leaders;
    // first instruction is a leader
    leaders.insert(0);
    for (int i = 1; i < instructions.size(); i++) {
        GPUStaticInst* instruction = instructions[i];
        if (instruction->o_type == Enums::OT_BRANCH) {
            const int target_pc = instruction->getTargetPc();
            leaders.insert(target_pc);
            leaders.insert(i + 1);
        }
    }

    size_t block_size = 0;
    for (int i = 0; i < instructions.size(); i++) {
        if (leaders.find(i) != leaders.end()) {
            uint32_t id = basicBlocks.size();
            if (id > 0) {
                basicBlocks.back()->size = block_size;
            }
            block_size = 0;
            basicBlocks.emplace_back(new BasicBlock(id, instructions[i]));
        }
        block_size++;
    }
    basicBlocks.back()->size = block_size;
    // exit basic block
    basicBlocks.emplace_back(new BasicBlock(basicBlocks.size(), nullptr));
}

void
ControlFlowInfo::connectBasicBlocks()
{
    BasicBlock* exit_bb = basicBlocks.back().get();
    for (auto& bb : basicBlocks) {
        if (bb->isExit()) {
            break;
        }
        GPUStaticInst* last = lastInstruction(bb.get());
        if (last->o_type == Enums::OT_RET) {
            bb->successorIds.insert(exit_bb->id);
            break;
        }
        if (last->o_type == Enums::OT_BRANCH) {
            const uint32_t target_pc = last->getTargetPc();
            BasicBlock* target_bb = basicBlock(target_pc);
            bb->successorIds.insert(target_bb->id);
        }

        // Unconditional jump instructions have a unique successor
        if (!last->unconditionalJumpInstruction()) {
            BasicBlock* next_bb = basicBlock(last->instNum() + 1);
            bb->successorIds.insert(next_bb->id);
        }
    }
}

// In-place set intersection
static void
intersect(std::set<uint32_t>& a, const std::set<uint32_t>& b)
{
    std::set<uint32_t>::iterator it = a.begin();
    while (it != a.end()) {
        it = b.find(*it) != b.end() ? ++it : a.erase(it);
    }
}

void
ControlFlowInfo::findPostDominators()
{
    // the only postdominator of the exit block is itself
    basicBlocks.back()->postDominatorIds.insert(basicBlocks.back()->id);
    // copy all basic blocks to all postdominator lists except for the
    // exit block
    for (auto& block : basicBlocks) {
        if (!block->isExit()) {
            for (uint32_t i = 0; i < basicBlocks.size(); i++) {
                block->postDominatorIds.insert(i);
            }
        }
    }

    bool change = true;
    while (change) {
        change = false;
        for (int h = basicBlocks.size() - 2; h >= 0; --h) {
            size_t num_postdominators =
                basicBlocks[h]->postDominatorIds.size();
            for (int s : basicBlocks[h]->successorIds) {
                intersect(basicBlocks[h]->postDominatorIds,
                          basicBlocks[s]->postDominatorIds);
            }
            basicBlocks[h]->postDominatorIds.insert(h);
            change |= (num_postdominators
                       != basicBlocks[h]->postDominatorIds.size());
        }
    }
}

// In-place set difference
static void
setDifference(std::set<uint32_t>&a,
              const std::set<uint32_t>& b, uint32_t exception)
{
    for (uint32_t b_elem : b) {
        if (b_elem != exception) {
            a.erase(b_elem);
        }
    }
}

void
ControlFlowInfo::findImmediatePostDominators()
{
    assert(basicBlocks.size() > 1); // Entry and exit blocks must be present

    findPostDominators();

    for (auto& basicBlock : basicBlocks) {
        if (basicBlock->isExit()) {
            continue;
        }
        std::set<uint32_t> candidates = basicBlock->postDominatorIds;
        candidates.erase(basicBlock->id);
        for (uint32_t postDominatorId : basicBlock->postDominatorIds) {
            if (postDominatorId != basicBlock->id) {
                setDifference(candidates,
                              basicBlocks[postDominatorId]->postDominatorIds,
                              postDominatorId);
            }
        }
        assert(candidates.size() == 1);
        GPUStaticInst* last_instruction = lastInstruction(basicBlock.get());
        BasicBlock* ipd_block = basicBlocks[*(candidates.begin())].get();
        if (!ipd_block->isExit()) {
            GPUStaticInst* ipd_first_inst = ipd_block->firstInstruction;
            last_instruction->ipdInstNum(ipd_first_inst->instNum());
        } else {
            last_instruction->ipdInstNum(last_instruction->instNum() + 1);
        }
    }
}

void
ControlFlowInfo::printPostDominators() const
{
    for (auto& block : basicBlocks) {
        std::cout << "PD(" << block->id << ") = {";
        std::copy(block->postDominatorIds.begin(),
                  block->postDominatorIds.end(),
                  std::ostream_iterator<uint32_t>(std::cout, ", "));
        std::cout << "}" << std::endl;
    }
}

void
ControlFlowInfo::printImmediatePostDominators() const
{
    for (const auto& block : basicBlocks) {
        if (block->isExit()) {
            continue;
        }
        std::cout << "IPD(" << block->id << ") = ";
        std::cout << postDominator(block.get())->id << ", ";
    }
    std::cout << std::endl;
}

void
ControlFlowInfo::printBasicBlocks() const
{
    for (GPUStaticInst* inst : instructions) {
        int inst_num = inst->instNum();
        std::cout << inst_num << " [" << basicBlock(inst_num)->id
                  << "]: " << inst->disassemble();
        if (inst->o_type == Enums::OT_BRANCH) {
            std::cout << ", PC = " << inst->getTargetPc();
        }
        std::cout << std::endl;
    }
}

void
ControlFlowInfo::printBasicBlockDot() const
{
    printf("digraph {\n");
    for (const auto& basic_block : basicBlocks) {
        printf("\t");
        for (uint32_t successorId : basic_block->successorIds) {
            printf("%d -> %d; ", basic_block->id, successorId);
        }
        printf("\n");
    }
    printf("}\n");
}
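
findPostDominators() is the classic iterative dataflow computation: every non-exit block starts with the full block set, and sets are repeatedly intersected over successors until a fixed point. A self-contained sketch of the same iteration on a diamond-shaped CFG; it mirrors the algorithm above with plain sets rather than the private ControlFlowInfo API, and the graph is invented for illustration:

// Diamond CFG: 0 -> {1,2}, 1 -> {3}, 2 -> {3}, 3 -> {4 (exit)}.
#include <cassert>
#include <cstdint>
#include <set>
#include <vector>

int main()
{
    std::vector<std::set<uint32_t>> succ = { {1, 2}, {3}, {3}, {4}, {} };
    const uint32_t n = succ.size();

    std::vector<std::set<uint32_t>> pd(n);
    pd[n - 1] = {n - 1}; // the exit block post-dominates only itself
    for (uint32_t b = 0; b + 1 < n; ++b)
        for (uint32_t i = 0; i < n; ++i)
            pd[b].insert(i);

    bool change = true;
    while (change) {
        change = false;
        for (int h = n - 2; h >= 0; --h) {
            size_t before = pd[h].size();
            for (uint32_t s : succ[h]) {
                // in-place intersection: pd[h] &= pd[s]
                for (auto it = pd[h].begin(); it != pd[h].end(); )
                    it = pd[s].count(*it) ? ++it : pd[h].erase(it);
            }
            pd[h].insert(h);
            change |= (before != pd[h].size());
        }
    }

    // The join block (3) and exit (4) post-dominate the entry, so the
    // entry's immediate post-dominator ends up being block 3.
    assert((pd[0] == std::set<uint32_t>{0, 3, 4}));
    return 0;
}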

133
src/gpu-compute/kernel_cfg.hh
Normal file
@@ -0,0 +1,133 @@
/*
 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */

#ifndef __KERNEL_CFG_HH__
#define __KERNEL_CFG_HH__

#include <cstddef>
#include <cstdint>
#include <memory>
#include <set>
#include <vector>


class GPUStaticInst;
class HsailCode;

struct BasicBlock
{
    BasicBlock(uint32_t num, GPUStaticInst* begin) :
        id(num), size(0), firstInstruction(begin)
    {
    }

    bool
    isEntry() const
    {
        return !id;
    }

    bool
    isExit() const
    {
        return !size;
    }

    /**
     * Unique identifier for the block within a given kernel.
     */
    const uint32_t id;

    /**
     * Number of instructions contained in the block.
     */
    size_t size;

    /**
     * Pointer to the first instruction of the block.
     */
    GPUStaticInst* firstInstruction;

    /**
     * Identifiers of the blocks that follow (are reachable from) this block.
     */
    std::set<uint32_t> successorIds;

    /**
     * Identifiers of the blocks that post-dominate this block, i.e.,
     * the blocks through which every path from here to the exit passes.
     */
    std::set<uint32_t> postDominatorIds;
};

class ControlFlowInfo
{
  public:

    /**
     * Compute the immediate post-dominator instruction for each kernel
     * instruction.
     */
    static void assignImmediatePostDominators(
            const std::vector<GPUStaticInst*>& instructions);

  private:
    ControlFlowInfo(const std::vector<GPUStaticInst*>& instructions);

    GPUStaticInst* lastInstruction(const BasicBlock* block) const;

    BasicBlock* basicBlock(int inst_num) const;

    BasicBlock* postDominator(const BasicBlock* block) const;

    void createBasicBlocks();

    void connectBasicBlocks();

    void findPostDominators();

    void findImmediatePostDominators();

    void printBasicBlocks() const;

    void printBasicBlockDot() const;

    void printPostDominators() const;

    void printImmediatePostDominators() const;

    std::vector<std::unique_ptr<BasicBlock>> basicBlocks;
    std::vector<GPUStaticInst*> instructions;
};

#endif // __KERNEL_CFG_HH__
|
341
src/gpu-compute/lds_state.cc
Normal file
341
src/gpu-compute/lds_state.cc
Normal file
|
@ -0,0 +1,341 @@
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: John Kalamatianos, Joe Gross
 */

#include "gpu-compute/lds_state.hh"

#include <array>
#include <cstdio>
#include <cstdlib>

#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"

/**
 * the default constructor that works with SWIG
 */
LdsState::LdsState(const Params *params) :
    MemObject(params),
    tickEvent(this),
    cuPort(name() + ".port", this),
    maximumSize(params->size),
    range(params->range),
    bankConflictPenalty(params->bankConflictPenalty),
    banks(params->banks)
{
    fatal_if(params->banks <= 0,
             "number of LDS banks should be a positive number");
    fatal_if((params->banks & (params->banks - 1)) != 0,
             "number of LDS banks should be a power of 2");
    fatal_if(params->size <= 0,
             "cannot allocate an LDS with a size less than 1");
    fatal_if(params->size % 2,
             "the LDS size should be an even number of bytes");
}

/**
 * Needed by the SWIG compiler
 */
LdsState *
LdsStateParams::create()
{
    return new LdsState(this);
}

/**
 * set the parent and name based on the parent
 */
void
LdsState::setParent(ComputeUnit *x_parent)
{
    // check that this gets assigned to the same thing each time
    fatal_if(!x_parent, "x_parent should not be nullptr");
    fatal_if(x_parent == parent,
             "should not be setting the parent twice");

    parent = x_parent;
    _name = x_parent->name() + ".LdsState";
}

/**
 * derive the gpu mem packet from the packet and then count the bank conflicts
 */
unsigned
LdsState::countBankConflicts(PacketPtr packet, unsigned *bankAccesses)
{
    Packet::SenderState *baseSenderState = packet->senderState;
    while (baseSenderState->predecessor) {
        baseSenderState = baseSenderState->predecessor;
    }
    const ComputeUnit::LDSPort::SenderState *senderState =
        dynamic_cast<ComputeUnit::LDSPort::SenderState *>(baseSenderState);

    fatal_if(!senderState,
             "did not get the right sort of sender state");

    GPUDynInstPtr gpuDynInst = senderState->getMemInst();

    return countBankConflicts(gpuDynInst, bankAccesses);
}

// Count the total number of bank conflicts for the local memory packet
unsigned
LdsState::countBankConflicts(GPUDynInstPtr gpuDynInst,
                             unsigned *numBankAccesses)
{
    int bank_conflicts = 0;
    std::vector<int> bank;
    // the number of LDS banks being touched by the memory instruction
    int numBanks = std::min(parent->wfSize(), banks);
    // if the wavefront size is larger than the number of LDS banks, we
    // need to iterate over all work items to calculate the total
    // number of bank conflicts
    int groups = (parent->wfSize() > numBanks) ?
        (parent->wfSize() / numBanks) : 1;
    for (int i = 0; i < groups; i++) {
        // address array holding all the work-item addresses of an instruction
        std::vector<Addr> addr_array;
        addr_array.resize(numBanks, 0);
        bank.clear();
        bank.resize(banks, 0);
        int max_bank = 0;

        // populate the address array for all active work items
        for (int j = 0; j < numBanks; j++) {
            if (gpuDynInst->exec_mask[(i*numBanks)+j]) {
                addr_array[j] = gpuDynInst->addr[(i*numBanks)+j];
            } else {
                addr_array[j] = std::numeric_limits<Addr>::max();
            }
        }

        if (gpuDynInst->m_op == Enums::MO_LD ||
            gpuDynInst->m_op == Enums::MO_ST) {
            // mask identical addresses
            for (int j = 0; j < numBanks; ++j) {
                for (int j0 = 0; j0 < j; j0++) {
                    if (addr_array[j] != std::numeric_limits<Addr>::max()
                        && addr_array[j] == addr_array[j0]) {
                        addr_array[j] = std::numeric_limits<Addr>::max();
                    }
                }
            }
        }
        // calculate bank conflicts
        for (int j = 0; j < numBanks; ++j) {
            if (addr_array[j] != std::numeric_limits<Addr>::max()) {
                int bankId = addr_array[j] % banks;
                bank[bankId]++;
                max_bank = std::max(max_bank, bank[bankId]);
                // Count the number of LDS banks accessed.
                // Since we have masked identical addresses, all remaining
                // accesses will need to be serialized if they access
                // the same bank (bank conflict).
                (*numBankAccesses)++;
            }
        }
        bank_conflicts += max_bank;
    }
    panic_if(bank_conflicts > parent->wfSize(),
             "max bank conflicts should not exceed the number of work items "
             "per instruction");
    return bank_conflicts;
}
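The conflict-counting rule above boils down to: per group of lanes, drop duplicate addresses, bucket the survivors by `addr % banks`, and charge the depth of the busiest bank. A self-contained sketch of just that rule, with a hypothetical bank count and hand-picked addresses (none of the names below come from the simulator):

    #include <algorithm>
    #include <cstdint>
    #include <set>
    #include <vector>

    // Hedged sketch of the serialization rule: after duplicate addresses
    // are masked, the remaining accesses to the same bank serialize, so
    // the cost of a lane group is the depth of its busiest bank.
    int toyBankConflicts(const std::vector<uint64_t> &addrs, int banks)
    {
        std::set<uint64_t> uniq(addrs.begin(), addrs.end()); // mask duplicates
        std::vector<int> perBank(banks, 0);
        int maxBank = 0;
        for (uint64_t a : uniq) {
            maxBank = std::max(maxBank, ++perBank[a % banks]);
        }
        return maxBank; // 1 means one access per bank, i.e. no conflicts
    }

    // e.g. toyBankConflicts({0, 4, 8, 12}, 4) == 4 (all hit bank 0),
    //      toyBankConflicts({0, 1, 2, 3},  4) == 1 (fully parallel).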
/**
 * receive the packet from the CU
 */
bool
LdsState::CuSidePort::recvTimingReq(PacketPtr packet)
{
    return ownerLds->processPacket(packet);
}

GPUDynInstPtr
LdsState::getDynInstr(PacketPtr packet)
{
    ComputeUnit::LDSPort::SenderState *ss =
        dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
            packet->senderState);
    return ss->getMemInst();
}

/**
 * process an incoming packet, add it to the return queue
 */
bool
LdsState::processPacket(PacketPtr packet)
{
    unsigned bankAccesses = 0;
    // the number of conflicts this packet will have when accessing the LDS
    unsigned bankConflicts = countBankConflicts(packet, &bankAccesses);
    // count the total number of physical LDS banks accessed
    parent->ldsBankAccesses += bankAccesses;
    // count the LDS bank conflicts; a conflict count of 1 means at most
    // one access per bank, i.e. no bank conflicts, hence the -1
    parent->ldsBankConflictDist.sample(bankConflicts-1);

    GPUDynInstPtr dynInst = getDynInstr(packet);
    // account for the LDS bank conflict overhead
    int busLength = (dynInst->m_op == Enums::MO_LD) ? parent->loadBusLength() :
        (dynInst->m_op == Enums::MO_ST) ? parent->storeBusLength() :
        parent->loadBusLength();
    // delay for accessing the LDS
    Tick processingTime =
        parent->shader->ticks(bankConflicts * bankConflictPenalty) +
        parent->shader->ticks(busLength);
    // choose (delay + last packet in queue) or (now + delay) as the time to
    // return this
    Tick doneAt = earliestReturnTime() + processingTime;
    // then store it for processing
    return returnQueuePush(std::make_pair(doneAt, packet));
}
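A worked instance of the delay model just computed, under assumed parameters (the numbers are illustrative, not defaults): every name below is a stand-in, not simulator state.

    // Hedged arithmetic sketch of processPacket()'s timing, assuming
    // 1000 ticks per cycle, 4 bank conflicts, a 1-cycle conflict
    // penalty, and a 1-cycle load bus.
    uint64_t period = 1000;
    uint64_t processingTime = (4 * 1 + 1) * period;    // 5 cycles -> 5000 ticks
    uint64_t doneAt = earliestReturn + processingTime; // tail of queue, or "now"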
/**
 * add this to the queue of packets to be returned
 */
bool
LdsState::returnQueuePush(std::pair<Tick, PacketPtr> thePair)
{
    // TODO add time limits (e.g. one packet per cycle) and queue size limits
    // and implement flow control
    returnQueue.push(thePair);

    // if there is no set wakeup time, look through the queue
    if (!tickEvent.scheduled()) {
        process();
    }

    return true;
}

/**
 * receive a packet in functional mode
 */
void
LdsState::CuSidePort::recvFunctional(PacketPtr pkt)
{
    fatal("not implemented");
}

/**
 * receive a retry for a response
 */
void
LdsState::CuSidePort::recvRespRetry()
{
    // TODO verify that this is the right way to do this
    assert(ownerLds->isRetryResp());
    ownerLds->setRetryResp(false);
    ownerLds->process();
}

/**
 * receive a retry
 */
void
LdsState::CuSidePort::recvRetry()
{
    fatal("not implemented");
}

/**
 * look for packets to return at this time
 */
bool
LdsState::process()
{
    Tick now = clockEdge();

    // send back completed packets
    while (!returnQueue.empty() && returnQueue.front().first <= now) {
        PacketPtr packet = returnQueue.front().second;

        ComputeUnit::LDSPort::SenderState *ss =
            dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
                packet->senderState);

        GPUDynInstPtr gpuDynInst = ss->getMemInst();

        gpuDynInst->initiateAcc(gpuDynInst);

        packet->makeTimingResponse();

        returnQueue.pop();

        bool success = cuPort.sendTimingResp(packet);

        if (!success) {
            retryResp = true;
            panic("have not handled timing responses being NACK'd when sent "
                  "back");
        }
    }

    // determine the next wakeup time
    if (!returnQueue.empty()) {
        Tick next = returnQueue.front().first;

        if (tickEvent.scheduled()) {
            if (next < tickEvent.when()) {
                tickEvent.deschedule();
                tickEvent.schedule(next);
            }
        } else {
            tickEvent.schedule(next);
        }
    }

    return true;
}

/**
 * wake up at this time and perform specified actions
 */
void
LdsState::TickEvent::process()
{
    ldsState->process();
}

/**
 * register statistics; the LDS currently registers none of its own
 */
void
LdsState::regStats()
{
}
512 src/gpu-compute/lds_state.hh (new file)
@@ -0,0 +1,512 @@
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: John Kalamatianos, Joe Gross
 */

#ifndef __LDS_STATE_HH__
#define __LDS_STATE_HH__

#include <array>
#include <queue>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "enums/MemOpType.hh"
#include "enums/MemType.hh"
#include "gpu-compute/misc.hh"
#include "mem/mem_object.hh"
#include "mem/port.hh"
#include "params/LdsState.hh"

class ComputeUnit;

/**
 * this represents a slice of the overall LDS, intended to be associated with
 * an individual workgroup
 */
class LdsChunk
{
  public:
    LdsChunk(const uint32_t x_size):
        chunk(x_size)
    {
    }

    LdsChunk() {}

    /**
     * a read operation
     */
    template<class T>
    T
    read(const uint32_t index)
    {
        fatal_if(!chunk.size(), "cannot read from an LDS chunk of size 0");
        fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk");
        T *p0 = (T *) (&(chunk.at(index)));
        return *p0;
    }

    /**
     * a write operation
     */
    template<class T>
    void
    write(const uint32_t index, const T value)
    {
        fatal_if(!chunk.size(), "cannot write to an LDS chunk of size 0");
        fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk");
        T *p0 = (T *) (&(chunk.at(index)));
        *p0 = value;
    }

    /**
     * get the size of this chunk
     */
    std::vector<uint8_t>::size_type
    size() const
    {
        return chunk.size();
    }

  protected:
    // the actual data store for this slice of the LDS
    std::vector<uint8_t> chunk;
};
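A hedged usage sketch of the typed accessors above. Because the chunk is a raw byte store that read/write reinterpret in place, the index is a byte offset, not an element index; the chunk size below is illustrative.

    // Hedged sketch: two uint32_t values live at byte offsets 0 and 4.
    LdsChunk scratch(64);                   // illustrative 64-byte slice
    scratch.write<uint32_t>(0, 0xdeadbeef);
    scratch.write<uint32_t>(4, 42);
    uint32_t a = scratch.read<uint32_t>(0); // 0xdeadbeef
    uint32_t b = scratch.read<uint32_t>(4); // 42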
// Local Data Share (LDS) State per Wavefront (contents of the LDS region
// allocated to the WorkGroup of this Wavefront)
class LdsState: public MemObject
{
  protected:

    /**
     * an event to allow event-driven execution
     */
    class TickEvent: public Event
    {
      protected:
        LdsState *ldsState = nullptr;

        Tick nextTick = 0;

      public:
        TickEvent(LdsState *_ldsState) :
            ldsState(_ldsState)
        {
        }

        virtual void
        process();

        void
        schedule(Tick when)
        {
            mainEventQueue[0]->schedule(this, when);
        }

        void
        deschedule()
        {
            mainEventQueue[0]->deschedule(this);
        }
    };

    /**
     * CuSidePort is the LDS port closer to the CU side
     */
    class CuSidePort: public SlavePort
    {
      public:
        CuSidePort(const std::string &_name, LdsState *_ownerLds) :
            SlavePort(_name, _ownerLds), ownerLds(_ownerLds)
        {
        }

      protected:
        LdsState *ownerLds;

        virtual bool
        recvTimingReq(PacketPtr pkt);

        virtual Tick
        recvAtomic(PacketPtr pkt)
        {
            return 0;
        }

        virtual void
        recvFunctional(PacketPtr pkt);

        virtual void
        recvRangeChange()
        {
        }

        virtual void
        recvRetry();

        virtual void
        recvRespRetry();

        virtual AddrRangeList
        getAddrRanges() const
        {
            AddrRangeList ranges;
            ranges.push_back(ownerLds->getAddrRange());
            return ranges;
        }

        template<typename T>
        void
        loadData(PacketPtr packet);

        template<typename T>
        void
        storeData(PacketPtr packet);

        template<typename T>
        void
        atomicOperation(PacketPtr packet);
    };

  protected:

    // The LDS reference counter. The key is the dispatch ID and workgroup ID.
    // The value is the number of wavefronts that reference this LDS chunk:
    // as wavefronts are launched, the counter for their workgroup goes up,
    // and as they return it goes down. Once it reaches 0 this chunk of the
    // LDS is returned to the available pool. Note that deallocation happens
    // on the 1->0 transition, not whenever the counter is 0, since the
    // counter always starts at 0 when the workgroup asks for space.
    std::unordered_map<uint32_t,
                       std::unordered_map<uint32_t, int32_t>> refCounter;

    // the map that allows workgroups to access their own chunk of the LDS
    std::unordered_map<uint32_t,
                       std::unordered_map<uint32_t, LdsChunk>> chunkMap;

    // an event to allow the LDS to wake up at a specified time
    TickEvent tickEvent;

    // the queue of packets that are going back to the CU after a
    // read/write/atomic op
    // TODO need to make this have a maximum size to create flow control
    std::queue<std::pair<Tick, PacketPtr>> returnQueue;

    // whether or not there are pending responses
    bool retryResp = false;

    bool
    process();

    GPUDynInstPtr
    getDynInstr(PacketPtr packet);

    bool
    processPacket(PacketPtr packet);

    unsigned
    countBankConflicts(PacketPtr packet, unsigned *bankAccesses);

    unsigned
    countBankConflicts(GPUDynInstPtr gpuDynInst,
                       unsigned *numBankAccesses);

  public:
    typedef LdsStateParams Params;

    LdsState(const Params *params);

    // prevent copy construction
    LdsState(const LdsState&) = delete;

    ~LdsState()
    {
        parent = nullptr;
    }

    const Params *
    params() const
    {
        return dynamic_cast<const Params *>(_params);
    }

    bool
    isRetryResp() const
    {
        return retryResp;
    }

    void
    setRetryResp(const bool value)
    {
        retryResp = value;
    }

    // prevent assignment
    LdsState &
    operator=(const LdsState &) = delete;

    /**
     * use the dynamic wave id to create or just increase the reference count
     */
    int
    increaseRefCounter(const uint32_t dispatchId, const uint32_t wgId)
    {
        int refCount = getRefCounter(dispatchId, wgId);
        fatal_if(refCount < 0,
                 "reference count should not be below zero");
        return ++refCounter[dispatchId][wgId];
    }

    /**
     * decrease the reference count after making sure it is in the list;
     * give back this chunk if the ref counter has reached 0
     */
    int
    decreaseRefCounter(const uint32_t dispatchId, const uint32_t wgId)
    {
        int refCount = getRefCounter(dispatchId, wgId);

        fatal_if(refCount <= 0,
                 "reference count should not be at or below zero when "
                 "decremented");

        refCounter[dispatchId][wgId]--;

        if (refCounter[dispatchId][wgId] == 0) {
            releaseSpace(dispatchId, wgId);
            return 0;
        } else {
            return refCounter[dispatchId][wgId];
        }
    }
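A hedged sketch of the allocation lifecycle these two methods implement, together with reserveSpace() (declared further down), for one hypothetical workgroup with two wavefronts; the IDs, sizes, and the `lds` instance are illustrative.

    // Hedged lifecycle sketch: space is reserved with a zero count, each
    // launched wavefront increments, each retired wavefront decrements,
    // and the 1->0 transition frees the chunk back to the pool.
    LdsChunk *chunk = lds.reserveSpace(0, 7, 256); // dispatchId 0, wgId 7
    lds.increaseRefCounter(0, 7);   // wavefront 0 launches -> count 1
    lds.increaseRefCounter(0, 7);   // wavefront 1 launches -> count 2
    lds.decreaseRefCounter(0, 7);   // wavefront 0 retires  -> count 1
    lds.decreaseRefCounter(0, 7);   // wavefront 1 retires  -> 1->0, releaseSpace()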
    /**
     * return the current reference count for this workgroup id
     */
    int
    getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
    {
        auto dispatchIter = chunkMap.find(dispatchId);
        fatal_if(dispatchIter == chunkMap.end(),
                 "could not locate this dispatch id [%d]", dispatchId);

        auto workgroup = dispatchIter->second.find(wgId);
        fatal_if(workgroup == dispatchIter->second.end(),
                 "could not find this workgroup id within this dispatch id"
                 " did[%d] wgid[%d]", dispatchId, wgId);

        auto refCountIter = refCounter.find(dispatchId);
        if (refCountIter == refCounter.end()) {
            fatal("could not locate this dispatch id [%d]", dispatchId);
        } else {
            auto workgroup = refCountIter->second.find(wgId);
            if (workgroup == refCountIter->second.end()) {
                fatal("could not find this workgroup id within this dispatch"
                      " id did[%d] wgid[%d]", dispatchId, wgId);
            } else {
                return refCounter.at(dispatchId).at(wgId);
            }
        }

        fatal("should not reach this point");
        return 0;
    }

    /**
     * assign a parent and request this amount of space be set aside
     * for this wgid
     */
    LdsChunk *
    reserveSpace(const uint32_t dispatchId, const uint32_t wgId,
                 const uint32_t size)
    {
        if (chunkMap.find(dispatchId) != chunkMap.end()) {
            fatal_if(
                chunkMap[dispatchId].find(wgId) != chunkMap[dispatchId].end(),
                "duplicate workgroup ID asking for space in the LDS "
                "did[%d] wgid[%d]", dispatchId, wgId);
        }

        fatal_if(bytesAllocated + size > maximumSize,
                 "request would ask for more space than is available");

        bytesAllocated += size;

        chunkMap[dispatchId].emplace(wgId, LdsChunk(size));
        // make an entry for this workgroup
        refCounter[dispatchId][wgId] = 0;

        return &chunkMap[dispatchId][wgId];
    }

    bool
    returnQueuePush(std::pair<Tick, PacketPtr> thePair);

    Tick
    earliestReturnTime() const
    {
        // TODO set to max(lastCommand+1, curTick())
        return returnQueue.empty() ? curTick() : returnQueue.back().first;
    }

    void
    setParent(ComputeUnit *x_parent);

    void
    regStats();

    // accessors
    ComputeUnit *
    getParent() const
    {
        return parent;
    }

    std::string
    getName()
    {
        return _name;
    }

    int
    getBanks() const
    {
        return banks;
    }

    ComputeUnit *
    getComputeUnit() const
    {
        return parent;
    }

    int
    getBankConflictPenalty() const
    {
        return bankConflictPenalty;
    }

    /**
     * get the allocated size for this workgroup
     */
    std::size_t
    ldsSize(const uint32_t x_wgId)
    {
        return chunkMap[x_wgId].size();
    }

    AddrRange
    getAddrRange() const
    {
        return range;
    }

    virtual BaseSlavePort &
    getSlavePort(const std::string& if_name, PortID idx)
    {
        if (if_name == "cuPort") {
            // TODO need to set name dynamically at this point?
            return cuPort;
        } else {
            fatal("cannot resolve the port name " + if_name);
        }
    }

    /**
     * can this much space be reserved for a workgroup?
     */
    bool
    canReserve(uint32_t x_size) const
    {
        return bytesAllocated + x_size <= maximumSize;
    }

  private:
    /**
     * give back the space
     */
    bool
    releaseSpace(const uint32_t x_dispatchId, const uint32_t x_wgId)
    {
        auto dispatchIter = chunkMap.find(x_dispatchId);

        if (dispatchIter == chunkMap.end()) {
            fatal("dispatch id not found [%d]", x_dispatchId);
        } else {
            auto workgroupIter = dispatchIter->second.find(x_wgId);
            if (workgroupIter == dispatchIter->second.end()) {
                fatal("workgroup id [%d] not found in dispatch id [%d]",
                      x_wgId, x_dispatchId);
            }
        }

        fatal_if(bytesAllocated < chunkMap[x_dispatchId][x_wgId].size(),
                 "releasing more space than was allocated");

        bytesAllocated -= chunkMap[x_dispatchId][x_wgId].size();
        chunkMap[x_dispatchId].erase(chunkMap[x_dispatchId].find(x_wgId));
        return true;
    }

    // the port that connects this LDS to its owner CU
    CuSidePort cuPort;

    ComputeUnit* parent = nullptr;

    std::string _name;

    // the number of bytes currently reserved by all workgroups
    int bytesAllocated = 0;

    // the size of the LDS; the maximum number of bytes available
    int maximumSize;

    // address range of this memory
    AddrRange range;

    // the penalty, in cycles, for each LDS bank conflict
    int bankConflictPenalty = 0;

    // the number of banks in the LDS underlying data store
    int banks = 0;
};

#endif // __LDS_STATE_HH__
200 src/gpu-compute/local_memory_pipeline.cc (new file)
@@ -0,0 +1,200 @@
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Sooraj Puthoor
 */

#include "gpu-compute/local_memory_pipeline.hh"

#include "debug/GPUPort.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"

LocalMemPipeline::LocalMemPipeline(const ComputeUnitParams* p) :
    computeUnit(nullptr), lmQueueSize(p->local_mem_queue_size)
{
}

void
LocalMemPipeline::init(ComputeUnit *cu)
{
    computeUnit = cu;
    _name = computeUnit->name() + ".LocalMemPipeline";
}

void
LocalMemPipeline::exec()
{
    // apply any returned shared (LDS) memory operations
    GPUDynInstPtr m = !lmReturnedRequests.empty() ?
        lmReturnedRequests.front() : nullptr;

    bool accessVrf = true;
    if ((m) && (m->m_op == Enums::MO_LD || MO_A(m->m_op))) {
        Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];

        accessVrf =
            w->computeUnit->vrf[m->simdId]->
            vrfOperandAccessReady(m->seqNum(), w, m,
                                  VrfAccessType::WRITE);
    }

    if (!lmReturnedRequests.empty() && m->latency.rdy() && accessVrf &&
        computeUnit->locMemToVrfBus.rdy() &&
        (computeUnit->shader->coissue_return
         || computeUnit->wfWait.at(m->pipeId).rdy())) {
        if (m->v_type == VT_32 && m->m_type == Enums::M_U8)
            doSmReturn<uint32_t, uint8_t>(m);
        else if (m->v_type == VT_32 && m->m_type == Enums::M_U16)
            doSmReturn<uint32_t, uint16_t>(m);
        else if (m->v_type == VT_32 && m->m_type == Enums::M_U32)
            doSmReturn<uint32_t, uint32_t>(m);
        else if (m->v_type == VT_32 && m->m_type == Enums::M_S8)
            doSmReturn<int32_t, int8_t>(m);
        else if (m->v_type == VT_32 && m->m_type == Enums::M_S16)
            doSmReturn<int32_t, int16_t>(m);
        else if (m->v_type == VT_32 && m->m_type == Enums::M_S32)
            doSmReturn<int32_t, int32_t>(m);
        else if (m->v_type == VT_32 && m->m_type == Enums::M_F16)
            doSmReturn<float, Float16>(m);
        else if (m->v_type == VT_32 && m->m_type == Enums::M_F32)
            doSmReturn<float, float>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_U8)
            doSmReturn<uint64_t, uint8_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_U16)
            doSmReturn<uint64_t, uint16_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_U32)
            doSmReturn<uint64_t, uint32_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_U64)
            doSmReturn<uint64_t, uint64_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_S8)
            doSmReturn<int64_t, int8_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_S16)
            doSmReturn<int64_t, int16_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_S32)
            doSmReturn<int64_t, int32_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_S64)
            doSmReturn<int64_t, int64_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_F16)
            doSmReturn<double, Float16>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_F32)
            doSmReturn<double, float>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_F64)
            doSmReturn<double, double>(m);
    }

    // If the pipeline has executed a local memory instruction, execute the
    // local memory packet and issue the packets to the LDS
    if (!lmIssuedRequests.empty() && lmReturnedRequests.size() < lmQueueSize) {
        GPUDynInstPtr m = lmIssuedRequests.front();

        bool returnVal = computeUnit->sendToLds(m);
        if (!returnVal) {
            DPRINTF(GPUPort, "packet was nack'd and put in retry queue\n");
        }
        lmIssuedRequests.pop();
    }
}
template<typename c0, typename c1>
void
LocalMemPipeline::doSmReturn(GPUDynInstPtr m)
{
    lmReturnedRequests.pop();
    Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];

    // Return data to registers
    if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) {
        std::vector<uint32_t> regVec;
        for (int k = 0; k < m->n_reg; ++k) {
            int dst = m->dst_reg + k;

            if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
                dst = m->dst_reg_vec[k];
            // virtual->physical VGPR mapping
            int physVgpr = w->remap(dst, sizeof(c0), 1);
            // save the physical VGPR index
            regVec.push_back(physVgpr);
            c1 *p1 = &((c1*)m->d_data)[k * VSZ];

            for (int i = 0; i < VSZ; ++i) {
                if (m->exec_mask[i]) {
                    // write the value into the physical VGPR. This is a
                    // purely functional operation. No timing is modeled.
                    w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
                                                              *p1, i);
                }
                ++p1;
            }
        }

        // Schedule the write operation of the load data on the VRF. This
        // simply models the timing aspect of the VRF write operation. It
        // does not modify the physical VGPR.
        loadVrfBankConflictCycles +=
            w->computeUnit->vrf[w->simdId]->exec(m->seqNum(), w,
                                                 regVec, sizeof(c0), m->time);
    }

    // Decrement outstanding request count
    computeUnit->shader->ScheduleAdd(&w->outstanding_reqs, m->time, -1);

    if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op)
        || MO_H(m->m_op)) {
        computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_wr_lm,
                                         m->time, -1);
    }

    if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) {
        computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_rd_lm,
                                         m->time, -1);
    }

    // Mark the write bus busy for the appropriate amount of time
    computeUnit->locMemToVrfBus.set(m->time);
    if (computeUnit->shader->coissue_return == 0)
        w->computeUnit->wfWait.at(m->pipeId).set(m->time);
}
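The <c0, c1> template pair above widens the raw LDS element type (c1) into the vector-register element type (c0) before the lane write; the long if/else chain in exec() just selects the right instantiation. A self-contained sketch of that conversion step, with hypothetical buffers standing in for d_data and the VRF:

    #include <cstdint>
    #include <vector>

    // Hedged sketch of the <c0, c1> widening performed in doSmReturn():
    // each raw c1 value read from the LDS is converted (sign-/zero-
    // extended, or float-converted) to the register element type c0.
    // The buffers are stand-ins, not simulator state.
    template<typename c0, typename c1>
    void toyWriteBack(const std::vector<c1> &ldsData, std::vector<c0> &vgprLane)
    {
        for (size_t i = 0; i < ldsData.size(); ++i) {
            vgprLane[i] = static_cast<c0>(ldsData[i]);
        }
    }

    // e.g. toyWriteBack<int32_t, int16_t> sign-extends each 16-bit LDS
    // value into a 32-bit register element, matching the VT_32/M_S16 case.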
void
LocalMemPipeline::regStats()
{
    loadVrfBankConflictCycles
        .name(name() + ".load_vrf_bank_conflict_cycles")
        .desc("total number of cycles LDS data are delayed before updating "
              "the VRF")
        ;
}
98 src/gpu-compute/local_memory_pipeline.hh (new file)
@@ -0,0 +1,98 @@
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Sooraj Puthoor
 */

#ifndef __LOCAL_MEMORY_PIPELINE_HH__
#define __LOCAL_MEMORY_PIPELINE_HH__

#include <queue>
#include <string>

#include "gpu-compute/misc.hh"
#include "params/ComputeUnit.hh"
#include "sim/stats.hh"

/*
 * @file local_memory_pipeline.hh
 *
 * The local memory pipeline issues newly created local memory packets
 * from the pipeline to the LDS. This stage also retires previously issued
 * loads and stores that have returned from the LDS.
 */

class ComputeUnit;
class Wavefront;

class LocalMemPipeline
{
  public:
    LocalMemPipeline(const ComputeUnitParams *params);
    void init(ComputeUnit *cu);
    void exec();

    template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr m);

    std::queue<GPUDynInstPtr> &getLMReqFIFO() { return lmIssuedRequests; }
    std::queue<GPUDynInstPtr> &getLMRespFIFO() { return lmReturnedRequests; }

    bool
    isLMRespFIFOWrRdy() const
    {
        return lmReturnedRequests.size() < lmQueueSize;
    }

    bool
    isLMReqFIFOWrRdy(uint32_t pendReqs=0) const
    {
        return (lmIssuedRequests.size() + pendReqs) < lmQueueSize;
    }

    const std::string& name() const { return _name; }
    void regStats();

  private:
    ComputeUnit *computeUnit;
    std::string _name;
    int lmQueueSize;
    Stats::Scalar loadVrfBankConflictCycles;
    // Local Memory Request FIFO: all shared memory requests
    // are issued to this FIFO from the memory pipelines
    std::queue<GPUDynInstPtr> lmIssuedRequests;

    // Local Memory Response FIFO: all responses to shared memory
    // requests are sent to this FIFO from the LDS
    std::queue<GPUDynInstPtr> lmReturnedRequests;
};

#endif // __LOCAL_MEMORY_PIPELINE_HH__
162 src/gpu-compute/misc.hh (new file)
@@ -0,0 +1,162 @@
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */

#ifndef __MISC_HH__
#define __MISC_HH__

#include <bitset>
#include <memory>

#include "base/misc.hh"

class GPUDynInst;

// wavefront size of the machine
static const int VSZ = 64;

/*
  This check is necessary because std::bitset only provides conversion to
  unsigned long or unsigned long long via to_ulong() or to_ullong(). There
  are a few places in the code where to_ullong() is used; if VSZ is larger
  than the host can support, bitset will throw a runtime exception.

  We should remove all use of to_ulong() or to_ullong() so we can have VSZ
  greater than 64b; until that is done this assert is required.
 */
static_assert(VSZ <= sizeof(unsigned long long) * 8,
              "VSZ is larger than the host can support");

typedef std::bitset<VSZ> VectorMask;
typedef std::shared_ptr<GPUDynInst> GPUDynInstPtr;

class WaitClass
{
  public:
    WaitClass() : nxtAvail(0), lookAheadAvail(0), tcnt(0) { }
    void init(uint64_t *_tcnt, uint32_t _numStages=0)
    {
        tcnt = _tcnt;
        numStages = _numStages;
    }

    void set(uint32_t i)
    {
        fatal_if(nxtAvail > *tcnt,
                 "can't allocate resource because it is busy");
        nxtAvail = *tcnt + i;
    }
    void preset(uint32_t delay)
    {
        lookAheadAvail = std::max(lookAheadAvail, delay + (*tcnt) - numStages);
    }
    bool rdy() const { return *tcnt >= nxtAvail; }
    bool prerdy() const { return *tcnt >= lookAheadAvail; }

  private:
    // timestamp indicating when the resource will be available
    uint64_t nxtAvail;
    // timestamp indicating when the resource will be available, including
    // pending uses of the resource (when there is a cycle gap between
    // rdy() and set())
    uint64_t lookAheadAvail;
    // current timestamp
    uint64_t *tcnt;
    // number of stages between checking whether a resource is ready and
    // setting the resource's utilization
    uint32_t numStages;
};
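A hedged sketch of how WaitClass gates a shared resource against a monotonically advancing timestamp; the counter name and delays are illustrative, not simulator defaults.

    // Hedged sketch: a WaitClass is bound to a shared counter; set(n)
    // books the resource for the next n time units, and rdy() reports
    // whether the counter has caught up to that booking.
    uint64_t cycles = 0;     // shared timestamp, advanced by the pipeline
    WaitClass bus;
    bus.init(&cycles);

    bus.set(4);              // resource booked until cycles == 4
    bool free0 = bus.rdy();  // false: 0 < 4
    cycles = 4;              // time advances
    bool free1 = bus.rdy();  // true: the resource is available again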
class Float16
{
  public:
    uint16_t val;

    Float16() { val = 0; }

    Float16(const Float16 &x) : val(x.val) { }

    Float16(float x)
    {
        // reinterpret the IEEE-754 single-precision bits
        uint32_t ai = *(uint32_t *)&x;

        uint32_t s = (ai >> 31) & 0x1;        // sign
        uint32_t exp = (ai >> 23) & 0xff;     // biased exponent
        uint32_t mant = (ai >> 0) & 0x7fffff; // mantissa

        if (exp == 0 || exp <= 0x70) {
            // zero, denormals, and values below the half-precision
            // normal range: flush to zero
            exp = 0;
            mant = 0;
        } else if (exp == 0xff) {
            // infinity/NaN
            exp = 0x1f;
        } else if (exp >= 0x8f) {
            // too large for half precision: saturate to infinity
            exp = 0x1f;
            mant = 0;
        } else {
            // rebias the exponent from single (127) to half (15)
            exp = exp - 0x7f + 0x0f;
        }

        // truncate the mantissa from 23 to 10 bits
        mant = mant >> 13;

        val = 0;
        val |= (s << 15);
        val |= (exp << 10);
        val |= (mant << 0);
    }

    operator float() const
    {
        uint32_t s = (val >> 15) & 0x1;
        uint32_t exp = (val >> 10) & 0x1f;
        uint32_t mant = (val >> 0) & 0x3ff;

        if (!exp) {
            exp = 0;
            mant = 0;
        } else if (exp == 0x1f) {
            exp = 0xff;
        } else {
            // rebias the exponent from half (15) to single (127)
            exp = exp - 0x0f + 0x7f;
        }

        uint32_t val1 = 0;
        val1 |= (s << 31);
        val1 |= (exp << 23);
        val1 |= (mant << 13);

        return *(float*)&val1;
    }
};

#endif // __MISC_HH__
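A brief round-trip example of the Float16 converter above. Since the mantissa is truncated from 23 to 10 bits, values not exactly representable in half precision lose their low-order bits; the sample values are illustrative.

    // Hedged usage sketch: pack a float into 16 bits and expand it back.
    Float16 h(1.5f);       // exactly representable: s=0, exp=15, mant=0x200
    float back = h;        // == 1.5f
    Float16 tiny(1e-8f);   // below the half-precision normal range
    float flushed = tiny;  // == 0.0f, flushed to zero by the constructor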
70 src/gpu-compute/ndrange.hh (new file)
@@ -0,0 +1,70 @@
/*
 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */

#ifndef __NDRANGE_HH__
#define __NDRANGE_HH__

#include "base/types.hh"
#include "gpu-compute/qstruct.hh"

struct NDRange
{
    // copy of the queue entry provided at dispatch
    HsaQueueEntry q;

    // the current workgroup id (3 dimensions)
    int wgId[3];
    // the number of workgroups in each dimension
    int numWg[3];
    // the total number of workgroups
    int numWgTotal;

    // the number of completed workgroups
    int numWgCompleted;
    // the global workgroup ID
    uint32_t globalWgId;

    // flag indicating whether all workgroups have been launched
    bool wg_disp_rem;
    // flag indicating the kernel has completed
    bool execDone;
    bool userDoorBellSet;
    volatile bool *addrToNotify;
    volatile uint32_t *numDispLeft;
    int dispatchId;
    int curTid; // current thread id
};

#endif // __NDRANGE_HH__
76 src/gpu-compute/of_scheduling_policy.cc (new file)
@@ -0,0 +1,76 @@
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Sooraj Puthoor
 */

#include "gpu-compute/of_scheduling_policy.hh"

#include "gpu-compute/wavefront.hh"

Wavefront*
OFSchedulingPolicy::chooseWave()
{
    // set when the policy chooses a wave to schedule
    bool waveChosen = false;
    Wavefront *selectedWave = nullptr;
    int selectedWaveID = -1;
    uint32_t selectedPosition = 0;

    for (int position = 0; position < scheduleList->size(); ++position) {
        Wavefront *curWave = scheduleList->at(position);
        uint32_t curWaveID = curWave->wfDynId;

        // choose the wave with the lowest wave ID
        if (selectedWaveID == -1 || curWaveID < selectedWaveID) {
            waveChosen = true;
            selectedWaveID = curWaveID;
            selectedWave = curWave;
            selectedPosition = position;
        }
    }

    // check to make sure the ready list had at least one schedulable wave
    if (waveChosen) {
        scheduleList->erase(scheduleList->begin() + selectedPosition);
    } else {
        panic("Empty ready list");
    }

    return selectedWave;
}
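A hedged usage sketch of the oldest-first policy: bind it to a caller-owned ready list and repeatedly pull the wave with the smallest dynamic ID. The wave pointers below are hypothetical.

    // Hedged sketch: chooseWave() returns (and removes from the bound
    // list) the wave with the lowest wfDynId, i.e. the oldest wave.
    // Assume waveA->wfDynId < waveB->wfDynId; both are hypothetical.
    std::vector<Wavefront*> readyList = {waveB, waveA};
    OFSchedulingPolicy oldestFirst;
    oldestFirst.bindList(&readyList);
    Wavefront *next = oldestFirst.chooseWave(); // returns waveA, erases it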

void
OFSchedulingPolicy::bindList(std::vector<Wavefront*> *list)
{
    scheduleList = list;
}
61 src/gpu-compute/of_scheduling_policy.hh (new file)
@@ -0,0 +1,61 @@
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Sooraj Puthoor
 */

#ifndef __OF_SCHEDULING_POLICY_HH__
#define __OF_SCHEDULING_POLICY_HH__

#include <cstddef>
#include <vector>

#include "base/misc.hh"

class Wavefront;

// Oldest First, where age is marked by the wave id
class OFSchedulingPolicy
{
  public:
    OFSchedulingPolicy() : scheduleList(nullptr) { }

    Wavefront* chooseWave();
    void bindList(std::vector<Wavefront*> *list);

  private:
    // List of waves which are participating in scheduling.
    // This scheduler selects the oldest wave from this list
    std::vector<Wavefront*> *scheduleList;
};

#endif // __OF_SCHEDULING_POLICY_HH__
42 src/gpu-compute/pool_manager.cc (new file)
@@ -0,0 +1,42 @@
/*
 * Copyright (c) 2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: John Kalamatianos
 */

#include "gpu-compute/pool_manager.hh"

PoolManager::PoolManager(uint32_t minAlloc, uint32_t poolSize)
    : _minAllocation(minAlloc), _poolSize(poolSize)
{
    assert(poolSize > 0);
}
66
src/gpu-compute/pool_manager.hh
Normal file
@@ -0,0 +1,66 @@
/*
 * Copyright (c) 2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: John Kalamatianos
 */

#ifndef __POOL_MANAGER_HH__
#define __POOL_MANAGER_HH__

#include <cassert>
#include <cstdint>
#include <string>
#include <utility>

// Pool Manager Logic
class PoolManager
{
  public:
    PoolManager(uint32_t minAlloc, uint32_t poolSize);
    uint32_t minAllocation() { return _minAllocation; }
    virtual std::string printRegion() = 0;
    virtual uint32_t regionSize(std::pair<uint32_t, uint32_t> &region) = 0;
    virtual bool canAllocate(uint32_t numRegions, uint32_t size) = 0;

    virtual uint32_t allocateRegion(const uint32_t size,
                                    uint32_t *reserved) = 0;

    virtual void freeRegion(uint32_t firstIdx, uint32_t lastIdx) = 0;
    uint32_t poolSize() { return _poolSize; }

  private:
    // minimum size that can be reserved per allocation
    uint32_t _minAllocation;
    // pool size in number of elements
    uint32_t _poolSize;
};

#endif // __POOL_MANAGER_HH__

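Since the header only fixes the interface, a small hypothetical subclass may help show how the pure virtuals compose. This is a sketch of a bump-pointer manager for illustration only; it is not the concrete pool manager shipped with the model, and the region convention (inclusive [first, last] index pairs) is an assumption.

    #include <sstream>
    #include <string>
    #include <utility>

    #include "gpu-compute/pool_manager.hh"

    // Hypothetical bump-pointer pool manager (illustration only).
    class BumpPoolManager : public PoolManager
    {
      public:
        BumpPoolManager(uint32_t minAlloc, uint32_t size)
            : PoolManager(minAlloc, size), nextIdx(0) { }

        uint32_t
        regionSize(std::pair<uint32_t, uint32_t> &region) override
        {
            // Assumes regions are inclusive [first, last] index pairs.
            return region.second - region.first + 1;
        }

        bool
        canAllocate(uint32_t numRegions, uint32_t size) override
        {
            return nextIdx + numRegions * size <= poolSize();
        }

        uint32_t
        allocateRegion(const uint32_t size, uint32_t *reserved) override
        {
            uint32_t start = nextIdx;
            nextIdx += size;
            *reserved = size;
            return start;
        }

        void
        freeRegion(uint32_t firstIdx, uint32_t lastIdx) override
        {
            // A pure bump allocator can only free in LIFO order.
            if (lastIdx + 1 == nextIdx)
                nextIdx = firstIdx;
        }

        std::string
        printRegion() override
        {
            std::ostringstream ss;
            ss << "bump next=" << nextIdx << "/" << poolSize();
            return ss.str();
        }

      private:
        uint32_t nextIdx;
    };
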
201
src/gpu-compute/qstruct.hh
Normal file
@@ -0,0 +1,201 @@
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Brad Beckmann, Marc Orr
 */

#ifndef __Q_STRUCT_HH__
#define __Q_STRUCT_HH__

#include <bitset>
#include <cstdint>

// Maximum number of arguments
static const int KER_NUM_ARGS = 32;
// Kernel argument buffer size
static const int KER_ARGS_LENGTH = 512;

class LdsChunk;
struct NDRange;

// Be very careful of alignment in this structure. The structure
// must compile to the same layout in both 32-bit and 64-bit mode.
struct HsaQueueEntry
{
    // Base pointer for array of instruction pointers
    uint64_t code_ptr;
    // Grid Size (3 dimensions)
    uint32_t gdSize[3];
    // Workgroup Size (3 dimensions)
    uint32_t wgSize[3];
    uint16_t sRegCount;
    uint16_t dRegCount;
    uint16_t cRegCount;
    uint64_t privMemStart;
    uint32_t privMemPerItem;
    uint32_t privMemTotal;
    uint64_t spillMemStart;
    uint32_t spillMemPerItem;
    uint32_t spillMemTotal;
    uint64_t roMemStart;
    uint32_t roMemTotal;
    // Size (in bytes) of LDS
    uint32_t ldsSize;
    // Virtual Memory Id (unused right now)
    uint32_t vmId;

    // Pointer to dependency chain (unused now)
    uint64_t depends;

    // pointer to bool
    uint64_t addrToNotify;
    // pointer to uint32_t
    uint64_t numDispLeft;

    // variables to pass arguments when running in standalone mode;
    // will be removed when run.py and sh.cpp have been updated to
    // use args and offset arrays
    uint64_t arg1;
    uint64_t arg2;
    uint64_t arg3;
    uint64_t arg4;

    // variables to pass arguments when running in cpu+gpu mode
    uint8_t args[KER_ARGS_LENGTH];
    uint16_t offsets[KER_NUM_ARGS];
    uint16_t num_args;
};

// State used to start (or restart) a WF
struct WFContext
{
    // 32 bit values
    // barrier state
    int bar_cnt[VSZ];

    // id (which WF in the WG)
    int cnt;

    // more barrier state
    int max_bar_cnt;
    int old_barrier_cnt;
    int barrier_cnt;

    // More Program Counter Stuff
    uint32_t pc;

    // Program counter of the immediate post-dominator instruction
    uint32_t rpc;

    // WG wide state (I don't see how to avoid redundancy here)
    int cu_id;
    uint32_t wg_id;
    uint32_t barrier_id;

    // 64 bit values (these values depend on the wavefront size)
    // masks
    uint64_t init_mask;
    uint64_t exec_mask;

    // private memory
    Addr privBase;
    Addr spillBase;

    LdsChunk *ldsChunk;

    /*
     * Kernel wide state
     * This is a hack. This state should be moved through simulated memory
     * during a yield. Though not much is being used here, so it's probably
     * not a big deal.
     *
     * Just to add to this comment... The ndr is derived from simulated
     * memory when the cl-runtime allocates an HsaQueueEntry and populates it
     * for a kernel launch. So in theory the runtime should be able to keep
     * that state around. Then a WF can reference it upon restart to derive
     * kernel wide state. The runtime can deallocate the state when the
     * kernel completes.
     */
    NDRange *ndr;
};

// State that needs to be passed between the simulation and simulated app; a
// pointer to this struct can be passed through the depends field in the
// HsaQueueEntry struct
struct HostState
{
    // cl_event* has original HsaQueueEntry for init
    uint64_t event;
};

// Total number of HSA queues
static const int HSAQ_NQUEUES = 8;

// These values will eventually live in memory mapped registers
// and be settable by the kernel mode driver.

// Number of entries in each HSA queue
static const int HSAQ_SIZE = 64;
// Address of first HSA queue index
static const int HSAQ_INDX_BASE = 0x10000ll;
// Address of first HSA queue
static const int HSAQ_BASE = 0x11000ll;
// Suggested start of HSA code
static const int HSA_CODE_BASE = 0x18000ll;

// These are shortcuts for deriving the address of a specific
// HSA queue or queue index
#define HSAQ(n) (HSAQ_BASE + HSAQ_SIZE * sizeof(struct fsaQueue) * n)
#define HSAQE(n,i) (HSAQ_BASE + (HSAQ_SIZE * n + i) * sizeof(struct fsaQueue))
#define HSAQ_RI(n) (HSAQ_INDX_BASE + sizeof(int) * (n * 3 + 0))
#define HSAQ_WI(n) (HSAQ_INDX_BASE + sizeof(int) * (n * 3 + 1))
#define HSAQ_CI(n) (HSAQ_INDX_BASE + sizeof(int) * (n * 3 + 2))

/*
 * Example code for writing to a queue
 *
 * void
 * ToQueue(int n,struct fsaQueue *val)
 * {
 *     int wi = *(int*)HSAQ_WI(n);
 *     int ri = *(int*)HSAQ_RI(n);
 *     int ci = *(int*)HSAQ_CI(n);
 *
 *     if (ci - ri < HSAQ_SIZE) {
 *         (*(int*)HSAQ_CI(n))++;
 *         *(HsaQueueEntry*)(HSAQE(n, (wi % HSAQ_SIZE))) = *val;
 *         (*(int*)HSAQ_WI(n))++;
 *     }
 * }
 */

#endif // __Q_STRUCT_HH__

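To round out the ToQueue example in the header above, here is a hypothetical consumer-side counterpart written against the same HSAQ macros. It is a sketch, not part of the committed file; in particular the assumption that an entry is consumable once the write index has advanced past the read index comes from the producer example, not from documented queue semantics.

    // Hypothetical reader for queue n; returns true if an entry was popped.
    bool
    FromQueue(int n, struct HsaQueueEntry *val)
    {
        int wi = *(int*)HSAQ_WI(n);
        int ri = *(int*)HSAQ_RI(n);

        // An entry is available only after the producer has advanced the
        // write index past the read index.
        if (ri < wi) {
            *val = *(HsaQueueEntry*)(HSAQE(n, (ri % HSAQ_SIZE)));
            (*(int*)HSAQ_RI(n))++;
            return true;
        }
        return false;
    }
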
67
src/gpu-compute/rr_scheduling_policy.cc
Normal file
@@ -0,0 +1,67 @@
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Sooraj Puthoor
 */

#include "gpu-compute/rr_scheduling_policy.hh"

#include "gpu-compute/wavefront.hh"

Wavefront*
RRSchedulingPolicy::chooseWave()
{
    Wavefront *selectedWave = nullptr;

    // Check to make sure the ready list has at least one schedulable wave
    if (scheduleList->size()) {
        // For the RR policy, select the wave at the front of the list.
        // The selected wave is popped from the schedule list immediately
        // after selection to avoid starvation. It is the responsibility
        // of the module invoking the RR scheduler to make sure
        // scheduling-eligible waves are added to the back of the
        // schedule list.
        selectedWave = scheduleList->front();
        scheduleList->erase(scheduleList->begin());
    } else {
        panic("Empty ready list");
    }

    return selectedWave;
}

void
RRSchedulingPolicy::bindList(std::vector<Wavefront*> *list)
{
    scheduleList = list;
}

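The starvation-avoidance contract in the comment above is easy to get wrong, so here is a hedged sketch of the caller's side of the protocol. Names are illustrative (the issue() hook is hypothetical); in the model, the schedule stage plays this role.

    RRSchedulingPolicy rr;
    std::vector<Wavefront*> ready;
    rr.bindList(&ready);

    // Per cycle: the policy pops the front wave; fairness depends on the
    // caller appending a wave to the back of the list whenever it
    // becomes schedulable again.
    while (!ready.empty()) {
        Wavefront *wf = rr.chooseWave();  // removes ready.front()
        issue(wf);                        // hypothetical issue hook
        // later, once wf is eligible again: ready.push_back(wf);
    }
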
65
src/gpu-compute/rr_scheduling_policy.hh
Normal file
@@ -0,0 +1,65 @@
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Sooraj Puthoor
 */

#ifndef __RR_SCHEDULING_POLICY_HH__
#define __RR_SCHEDULING_POLICY_HH__

#include <inttypes.h>

#include <cstddef>
#include <utility>
#include <vector>

#include "base/misc.hh"

class Wavefront;

// Round-Robin pick among the list of ready waves
class RRSchedulingPolicy
{
  public:
    RRSchedulingPolicy() : scheduleList(nullptr) { }

    Wavefront* chooseWave();
    void bindList(std::vector<Wavefront*> *list);

  private:
    // List of waves which are participating in scheduling.
    // This scheduler selects one wave from this list based on
    // a round robin policy.
    std::vector<Wavefront*> *scheduleList;
};

#endif // __RR_SCHEDULING_POLICY_HH__

151
src/gpu-compute/schedule_stage.cc
Normal file
@@ -0,0 +1,151 @@
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Sooraj Puthoor
 */

#include "gpu-compute/schedule_stage.hh"

#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"

ScheduleStage::ScheduleStage(const ComputeUnitParams *p)
    : numSIMDs(p->num_SIMDs),
      numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes)
{
    for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
        Scheduler newScheduler(p);
        scheduler.push_back(newScheduler);
    }
}

ScheduleStage::~ScheduleStage()
{
    scheduler.clear();
    waveStatusList.clear();
}

void
ScheduleStage::init(ComputeUnit *cu)
{
    computeUnit = cu;
    _name = computeUnit->name() + ".ScheduleStage";

    for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
        scheduler[j].bindList(&computeUnit->readyList[j]);
    }

    for (int j = 0; j < numSIMDs; ++j) {
        waveStatusList.push_back(&computeUnit->waveStatusList[j]);
    }

    dispatchList = &computeUnit->dispatchList;
}

void
ScheduleStage::arbitrate()
{
    // iterate over all memory pipelines
    for (int j = numSIMDs; j < numSIMDs + numMemUnits; ++j) {
        if (dispatchList->at(j).first) {
            Wavefront *waveToMemPipe = dispatchList->at(j).first;
            // iterate over all execution pipelines
            for (int i = 0; i < numSIMDs + numMemUnits; ++i) {
                if ((i != j) && (dispatchList->at(i).first)) {
                    Wavefront *waveToExePipe = dispatchList->at(i).first;
                    // if the two selected wavefronts are mapped to the same
                    // SIMD unit then they share the VRF
                    if (waveToMemPipe->simdId == waveToExePipe->simdId) {
                        int simdId = waveToMemPipe->simdId;
                        // Read VRF port arbitration:
                        // If there is a read VRF port conflict between a
                        // memory instruction and another instruction, we
                        // drop the other instruction. We don't need to
                        // check for write VRF port conflicts because the
                        // memory instruction either does not need to write
                        // to the VRF (store) or will write to the VRF when
                        // the data comes back (load), in which case the
                        // arbiter of the memory pipes will resolve any
                        // conflicts
                        if (computeUnit->vrf[simdId]->
                                isReadConflict(waveToMemPipe->wfSlotId,
                                               waveToExePipe->wfSlotId)) {
                            // FIXME: The "second" member variable is never
                            // used in the model. I am setting it to READY
                            // simply to follow the protocol of setting it
                            // when the WF has an instruction ready to issue
                            waveStatusList[simdId]->at(waveToExePipe->wfSlotId)
                                .second = READY;

                            dispatchList->at(i).first = nullptr;
                            dispatchList->at(i).second = EMPTY;
                            break;
                        }
                    }
                }
            }
        }
    }
}

void
ScheduleStage::exec()
{
    for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
        uint32_t readyListSize = computeUnit->readyList[j].size();

        // If no wave is ready to be scheduled on the execution resource
        // then skip scheduling for this execution resource
        if (!readyListSize) {
            continue;
        }

        Wavefront *waveToBeDispatched = scheduler[j].chooseWave();
        dispatchList->at(j).first = waveToBeDispatched;
        waveToBeDispatched->updateResources();
        dispatchList->at(j).second = FILLED;

        waveStatusList[waveToBeDispatched->simdId]->at(
                waveToBeDispatched->wfSlotId).second = BLOCKED;

        assert(computeUnit->readyList[j].size() == readyListSize - 1);
    }
    // arbitrate over all shared resources among instructions being issued
    // simultaneously
    arbitrate();
}

void
ScheduleStage::regStats()
{
}

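In outline, the arbitration rule above is: for every memory-pipe pick, drop any co-scheduled pick whose wave lives on the same SIMD and has a VRF read-port conflict with it. A stand-alone sketch of just that rule follows; the stub types and the readConflict() oracle are illustrative (the model asks the SIMD's VectorRegisterFile instead), and dispatch slots are flattened into one vector.

    #include <cstddef>
    #include <vector>

    // Stand-ins for the simulator types; only the fields the rule needs.
    struct WaveStub { int simdId; int wfSlotId; };

    // Hypothetical conflict oracle standing in for
    // VectorRegisterFile::isReadConflict.
    bool readConflict(int simdId, int slotA, int slotB);

    // Keep the memory pick at index memIdx; null out conflicting picks.
    void
    arbitrateReads(std::vector<WaveStub*> &picks, std::size_t memIdx)
    {
        WaveStub *mem = picks[memIdx];
        for (std::size_t i = 0; i < picks.size(); ++i) {
            if (i == memIdx || !picks[i])
                continue;
            if (picks[i]->simdId == mem->simdId &&
                readConflict(mem->simdId, mem->wfSlotId,
                             picks[i]->wfSlotId)) {
                picks[i] = nullptr;  // dropped; rescheduled a later cycle
            }
        }
    }
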
95
src/gpu-compute/schedule_stage.hh
Normal file
@@ -0,0 +1,95 @@
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Sooraj Puthoor
 */

#ifndef __SCHEDULE_STAGE_HH__
#define __SCHEDULE_STAGE_HH__

#include <string>
#include <utility>
#include <vector>

#include "gpu-compute/exec_stage.hh"
#include "gpu-compute/scheduler.hh"
#include "gpu-compute/scoreboard_check_stage.hh"

// Schedule or execution arbitration stage.
// From the pool of ready waves in the ready list,
// one wave is selected for each execution resource.
// The selection is made based on a scheduling policy.

class ComputeUnit;
class Wavefront;

struct ComputeUnitParams;

class ScheduleStage
{
  public:
    ScheduleStage(const ComputeUnitParams *params);
    ~ScheduleStage();
    void init(ComputeUnit *cu);
    void exec();
    void arbitrate();
    // Stats related variables and methods
    std::string name() { return _name; }
    void regStats();

  private:
    ComputeUnit *computeUnit;
    uint32_t numSIMDs;
    uint32_t numMemUnits;

    // Each execution resource will have its own
    // scheduler and a dispatch list
    std::vector<Scheduler> scheduler;

    // Stores the status of waves. A READY implies the
    // wave is ready to be scheduled this cycle and
    // is already present in the readyList
    std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>*>
        waveStatusList;

    // List of waves which will be dispatched to
    // each execution resource. A FILLED implies the
    // dispatch list is non-empty and the
    // execution unit has something to execute
    // this cycle. Currently, the dispatch list of
    // an execution resource can hold only one wave because
    // an execution resource can execute only one wave in a cycle.
    std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList;

    std::string _name;
};

#endif // __SCHEDULE_STAGE_HH__

71
src/gpu-compute/scheduler.cc
Normal file
@@ -0,0 +1,71 @@
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Sooraj Puthoor
 */

#include "gpu-compute/scheduler.hh"

Scheduler::Scheduler(const ComputeUnitParams *p)
{
    if (p->execPolicy == "OLDEST-FIRST") {
        schedPolicy = SCHED_POLICY::OF_POLICY;
    } else if (p->execPolicy == "ROUND-ROBIN") {
        schedPolicy = SCHED_POLICY::RR_POLICY;
    } else {
        fatal("Unimplemented scheduling policy");
    }
}

Wavefront*
Scheduler::chooseWave()
{
    if (schedPolicy == SCHED_POLICY::OF_POLICY) {
        return OFSchedPolicy.chooseWave();
    } else if (schedPolicy == SCHED_POLICY::RR_POLICY) {
        return RRSchedPolicy.chooseWave();
    } else {
        fatal("Unimplemented scheduling policy");
    }
}

void
Scheduler::bindList(std::vector<Wavefront*> *list)
{
    if (schedPolicy == SCHED_POLICY::OF_POLICY) {
        OFSchedPolicy.bindList(list);
    } else if (schedPolicy == SCHED_POLICY::RR_POLICY) {
        RRSchedPolicy.bindList(list);
    } else {
        fatal("Unimplemented scheduling policy");
    }
}

63
src/gpu-compute/scheduler.hh
Normal file
@@ -0,0 +1,63 @@
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Sooraj Puthoor
 */

#ifndef __SCHEDULER_HH__
#define __SCHEDULER_HH__

#include "gpu-compute/of_scheduling_policy.hh"
#include "gpu-compute/rr_scheduling_policy.hh"
#include "gpu-compute/scheduling_policy.hh"
#include "params/ComputeUnit.hh"

enum SCHED_POLICY
{
    OF_POLICY = 0,
    RR_POLICY
};

class Scheduler
{
  public:
    Scheduler(const ComputeUnitParams *params);
    Wavefront *chooseWave();
    void bindList(std::vector<Wavefront*> *list);

  private:
    SCHED_POLICY schedPolicy;
    SchedulingPolicy<RRSchedulingPolicy> RRSchedPolicy;
    SchedulingPolicy<OFSchedulingPolicy> OFSchedPolicy;
};

#endif // __SCHEDULER_HH__

57
src/gpu-compute/scheduling_policy.hh
Normal file
@@ -0,0 +1,57 @@
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Sooraj Puthoor
 */

#ifndef __SCHEDULING_POLICY_HH__
#define __SCHEDULING_POLICY_HH__

#include <vector>

template<typename Impl>
class SchedulingPolicy
{
  public:
    Wavefront* chooseWave() { return policyImpl.chooseWave(); }

    void
    bindList(std::vector<Wavefront*> *list)
    {
        return policyImpl.bindList(list);
    }

  private:
    Impl policyImpl;
};

#endif // __SCHEDULING_POLICY_HH__

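The template above gives static polymorphism: Scheduler holds one SchedulingPolicy<RRSchedulingPolicy> and one SchedulingPolicy<OFSchedulingPolicy> member (see scheduler.hh above), so the per-cycle dispatch is a branch on schedPolicy rather than a virtual call. As a sketch of why this composes, any type with the same interface shape can be wrapped; the LIFO stub below is hypothetical and only meant to show the duck-typed contract.

    #include <vector>

    class Wavefront;  // as forward-declared in the policy headers

    // Hypothetical extra policy: any type exposing chooseWave()/bindList()
    // can be wrapped by SchedulingPolicy without modifying the template.
    class LIFOPolicyStub
    {
      public:
        Wavefront*
        chooseWave()
        {
            Wavefront *w = scheduleList->back();
            scheduleList->pop_back();  // pick the youngest ready wave
            return w;
        }

        void bindList(std::vector<Wavefront*> *list) { scheduleList = list; }

      private:
        std::vector<Wavefront*> *scheduleList = nullptr;
    };

    // Usage: SchedulingPolicy<LIFOPolicyStub> lifo; dispatch stays static.
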
173
src/gpu-compute/scoreboard_check_stage.cc
Normal file
@@ -0,0 +1,173 @@
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Sooraj Puthoor
 */

#include "gpu-compute/scoreboard_check_stage.hh"

#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
#include "params/ComputeUnit.hh"

ScoreboardCheckStage::ScoreboardCheckStage(const ComputeUnitParams *p)
    : numSIMDs(p->num_SIMDs),
      numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes),
      numGlbMemPipes(p->num_global_mem_pipes),
      numShrMemPipes(p->num_shared_mem_pipes),
      vectorAluInstAvail(nullptr),
      lastGlbMemSimd(-1),
      lastShrMemSimd(-1), glbMemInstAvail(nullptr),
      shrMemInstAvail(nullptr)
{
}

ScoreboardCheckStage::~ScoreboardCheckStage()
{
    readyList.clear();
    waveStatusList.clear();
    shrMemInstAvail = nullptr;
    glbMemInstAvail = nullptr;
}

void
ScoreboardCheckStage::init(ComputeUnit *cu)
{
    computeUnit = cu;
    _name = computeUnit->name() + ".ScoreboardCheckStage";

    for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) {
        readyList.push_back(&computeUnit->readyList[unitId]);
    }

    for (int unitId = 0; unitId < numSIMDs; ++unitId) {
        waveStatusList.push_back(&computeUnit->waveStatusList[unitId]);
    }

    vectorAluInstAvail = &computeUnit->vectorAluInstAvail;
    glbMemInstAvail = &computeUnit->glbMemInstAvail;
    shrMemInstAvail = &computeUnit->shrMemInstAvail;
}

void
ScoreboardCheckStage::initStatistics()
{
    lastGlbMemSimd = -1;
    lastShrMemSimd = -1;
    *glbMemInstAvail = 0;
    *shrMemInstAvail = 0;

    for (int unitId = 0; unitId < numSIMDs; ++unitId)
        vectorAluInstAvail->at(unitId) = false;
}

void
ScoreboardCheckStage::collectStatistics(Wavefront *curWave, int unitId)
{
    if (curWave->instructionBuffer.empty())
        return;

    // track which vector SIMD unit has at least one WV with a vector
    // ALU as the oldest instruction in its instruction buffer
    vectorAluInstAvail->at(unitId) = vectorAluInstAvail->at(unitId) ||
                                     curWave->isOldestInstALU();

    // track how many vector SIMD units have at least one WV with a
    // vector global memory instruction as the oldest instruction
    // in its instruction buffer
    if ((curWave->isOldestInstGMem() || curWave->isOldestInstPrivMem() ||
         curWave->isOldestInstFlatMem()) && lastGlbMemSimd != unitId &&
        *glbMemInstAvail <= 1) {
        (*glbMemInstAvail)++;
        lastGlbMemSimd = unitId;
    }

    // track how many vector SIMD units have at least one WV with a
    // vector shared memory (LDS) instruction as the oldest instruction
    // in its instruction buffer
    // TODO: parametrize the limit of the LDS units
    if (curWave->isOldestInstLMem() && (*shrMemInstAvail <= numShrMemPipes) &&
        lastShrMemSimd != unitId) {
        (*shrMemInstAvail)++;
        lastShrMemSimd = unitId;
    }
}

void
ScoreboardCheckStage::exec()
{
    initStatistics();

    // reset the ready list for all execution units; it will be
    // constructed every cycle since resource availability may change
    for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) {
        readyList[unitId]->clear();
    }

    // iterate over the wavefronts of all SIMD units
    for (int unitId = 0; unitId < numSIMDs; ++unitId) {
        for (int wvId = 0; wvId < computeUnit->shader->n_wf; ++wvId) {
            // reset the ready status of each wavefront
            waveStatusList[unitId]->at(wvId).second = BLOCKED;
            Wavefront *curWave = waveStatusList[unitId]->at(wvId).first;
            collectStatistics(curWave, unitId);

            if (curWave->ready(Wavefront::I_ALU)) {
                readyList[unitId]->push_back(curWave);
                waveStatusList[unitId]->at(wvId).second = READY;
            } else if (curWave->ready(Wavefront::I_GLOBAL)) {
                if (computeUnit->cedeSIMD(unitId, wvId)) {
                    continue;
                }

                readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
                waveStatusList[unitId]->at(wvId).second = READY;
            } else if (curWave->ready(Wavefront::I_SHARED)) {
                readyList[computeUnit->ShrMemUnitId()]->push_back(curWave);
                waveStatusList[unitId]->at(wvId).second = READY;
            } else if (curWave->ready(Wavefront::I_FLAT)) {
                readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
                waveStatusList[unitId]->at(wvId).second = READY;
            } else if (curWave->ready(Wavefront::I_PRIVATE)) {
                readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
                waveStatusList[unitId]->at(wvId).second = READY;
            }
        }
    }
}

void
ScoreboardCheckStage::regStats()
{
}

106
src/gpu-compute/scoreboard_check_stage.hh
Normal file
@@ -0,0 +1,106 @@
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Sooraj Puthoor
 */

#ifndef __SCOREBOARD_CHECK_STAGE_HH__
#define __SCOREBOARD_CHECK_STAGE_HH__

#include <cstdint>
#include <string>
#include <utility>
#include <vector>

class ComputeUnit;
class Wavefront;

struct ComputeUnitParams;

enum WAVE_STATUS
{
    BLOCKED = 0,
    READY
};

/*
 * Scoreboard check stage.
 * All wavefronts are analyzed to see if they are ready
 * to be executed this cycle. Both structural and data
 * hazards are considered while marking a wave "ready"
 * for execution. After analysis, the ready waves are
 * added to readyList.
 */
class ScoreboardCheckStage
{
  public:
    ScoreboardCheckStage(const ComputeUnitParams* params);
    ~ScoreboardCheckStage();
    void init(ComputeUnit *cu);
    void exec();

    // Stats related variables and methods
    const std::string& name() const { return _name; }
    void regStats();

  private:
    void collectStatistics(Wavefront *curWave, int unitId);
    void initStatistics();
    ComputeUnit *computeUnit;
    uint32_t numSIMDs;
    uint32_t numMemUnits;
    uint32_t numGlbMemPipes;
    uint32_t numShrMemPipes;

    // flag per vector SIMD unit that is set when there is at least one
    // WF that has a vector ALU instruction as the oldest in its
    // instruction buffer
    std::vector<bool> *vectorAluInstAvail;
    int lastGlbMemSimd;
    int lastShrMemSimd;

    int *glbMemInstAvail;
    int *shrMemInstAvail;
    // List of waves which are ready to be scheduled.
    // Each execution resource has a ready list
    std::vector<std::vector<Wavefront*>*> readyList;

    // Stores the status of waves. A READY implies the
    // wave is ready to be scheduled this cycle and
    // is already present in the readyList
    std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>*>
        waveStatusList;

    std::string _name;
};

#endif // __SCOREBOARD_CHECK_STAGE_HH__

412
src/gpu-compute/shader.cc
Normal file
412
src/gpu-compute/shader.cc
Normal file
|
@ -0,0 +1,412 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* For use for simulation and test purposes only
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*
|
||||||
|
* Author: Steve Reinhardt
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "gpu-compute/shader.hh"
|
||||||
|
|
||||||
|
#include <limits>
|
||||||
|
|
||||||
|
#include "arch/x86/linux/linux.hh"
|
||||||
|
#include "base/chunk_generator.hh"
|
||||||
|
#include "debug/GPUDisp.hh"
|
||||||
|
#include "debug/GPUMem.hh"
|
||||||
|
#include "debug/HSAIL.hh"
|
||||||
|
#include "gpu-compute/dispatcher.hh"
|
||||||
|
#include "gpu-compute/gpu_static_inst.hh"
|
||||||
|
#include "gpu-compute/qstruct.hh"
|
||||||
|
#include "gpu-compute/wavefront.hh"
|
||||||
|
#include "mem/packet.hh"
|
||||||
|
#include "mem/ruby/system/RubySystem.hh"
|
||||||
|
#include "sim/sim_exit.hh"
|
||||||
|
|
||||||
|
Shader::Shader(const Params *p) : SimObject(p),
|
||||||
|
clock(p->clk_domain->clockPeriod()), cpuThread(nullptr), gpuTc(nullptr),
|
||||||
|
cpuPointer(p->cpu_pointer), tickEvent(this), timingSim(p->timing),
|
||||||
|
hsail_mode(SIMT), impl_kern_boundary_sync(p->impl_kern_boundary_sync),
|
||||||
|
separate_acquire_release(p->separate_acquire_release), coissue_return(1),
|
||||||
|
trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf),
|
||||||
|
globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0),
|
||||||
|
box_tick_cnt(0), start_tick_cnt(0)
|
||||||
|
{
|
||||||
|
|
||||||
|
cuList.resize(n_cu);
|
||||||
|
|
||||||
|
for (int i = 0; i < n_cu; ++i) {
|
||||||
|
cuList[i] = p->CUs[i];
|
||||||
|
assert(i == cuList[i]->cu_id);
|
||||||
|
cuList[i]->shader = this;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Addr
Shader::mmap(int length)
{
    Addr start;

    // round up length to the next page
    length = roundUp(length, TheISA::PageBytes);

    if (X86Linux64::mmapGrowsDown()) {
        DPRINTF(HSAIL, "GROWS DOWN");
        start = gpuTc->getProcessPtr()->mmap_end - length;
        gpuTc->getProcessPtr()->mmap_end = start;
    } else {
        DPRINTF(HSAIL, "GROWS UP");
        start = gpuTc->getProcessPtr()->mmap_end;
        gpuTc->getProcessPtr()->mmap_end += length;

        // assertion to make sure we don't overwrite the stack (it grows down)
        assert(gpuTc->getProcessPtr()->mmap_end <
               gpuTc->getProcessPtr()->stack_base -
               gpuTc->getProcessPtr()->max_stack_size);
    }

    DPRINTF(HSAIL, "Shader::mmap start = %#x, length = %#x\n", start, length);

    gpuTc->getProcessPtr()->allocateMem(start, length);

    return start;
}
void
Shader::init()
{
    // grab the threadContext of the thread running on the CPU
    assert(cpuPointer);
    gpuTc = cpuPointer->getContext(0);
    assert(gpuTc);
}

Shader::~Shader()
{
    for (int j = 0; j < n_cu; ++j)
        delete cuList[j];
}
void
Shader::updateThreadContext(int tid)
{
    // thread context of the thread which dispatched work
    assert(cpuPointer);
    gpuTc = cpuPointer->getContext(tid);
    assert(gpuTc);
}
void
Shader::hostWakeUp(BaseCPU *cpu)
{
    if (cpuPointer == cpu) {
        if (gpuTc->status() == ThreadContext::Suspended)
            cpu->activateContext(gpuTc->threadId());
    } else {
        // Make sure both the dispatcher and the shader are trying to
        // wake up the same host. This is a hack to enable kernel
        // launches from multiple CPUs.
        panic("Dispatcher wants to wakeup a different host");
    }
}
Shader*
ShaderParams::create()
{
    return new Shader(this);
}
void
Shader::exec()
{
    tick_cnt = curTick();
    box_tick_cnt = curTick() - start_tick_cnt;

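    // sa_val/sa_when/sa_x are parallel arrays implementing the scheduled
    // add queue (see ScheduleAdd()): entry i says to add sa_x[i] to
    // *sa_val[i] once the current tick reaches sa_when[i]. Note the --i
    // below: erase() shifts the remaining entries down, so the same index
    // must be examined again on the next iteration.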
    // apply any scheduled adds
    for (int i = 0; i < sa_n; ++i) {
        if (sa_when[i] <= tick_cnt) {
            *sa_val[i] += sa_x[i];
            sa_val.erase(sa_val.begin() + i);
            sa_x.erase(sa_x.begin() + i);
            sa_when.erase(sa_when.begin() + i);
            --sa_n;
            --i;
        }
    }

    // clock all of the CUs
    for (int i = 0; i < n_cu; ++i)
        cuList[i]->exec();
}
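// Round-robin a ready workgroup of the NDRange onto each compute unit that
// can accept one, starting from nextSchedCu. Workgroup IDs advance in
// x-fastest order (x, then y, then z); once the last workgroup in the grid
// has been handed out, wg_disp_rem is cleared. Returns true if at least
// one workgroup was dispatched.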
bool
Shader::dispatch_workgroups(NDRange *ndr)
{
    bool scheduledSomething = false;
    int cuCount = 0;
    int curCu = nextSchedCu;

    while (cuCount < n_cu) {
        // every time we try a CU, update nextSchedCu
        nextSchedCu = (nextSchedCu + 1) % n_cu;

        // dispatch a workgroup iff the following two conditions are met:
        // (a) wg_disp_rem is true - there are unassigned workgroups in the grid
        // (b) there are enough free slots in cuList[curCu] for this wg
        if (ndr->wg_disp_rem && cuList[curCu]->ReadyWorkgroup(ndr)) {
            scheduledSomething = true;
            DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d\n", curCu);

            // ticks() member function translates cycles to simulation ticks.
            if (!tickEvent.scheduled()) {
                schedule(tickEvent, curTick() + this->ticks(1));
            }

            cuList[curCu]->StartWorkgroup(ndr);
            ndr->wgId[0]++;
            ndr->globalWgId++;
            if (ndr->wgId[0] * ndr->q.wgSize[0] >= ndr->q.gdSize[0]) {
                ndr->wgId[0] = 0;
                ndr->wgId[1]++;

                if (ndr->wgId[1] * ndr->q.wgSize[1] >= ndr->q.gdSize[1]) {
                    ndr->wgId[1] = 0;
                    ndr->wgId[2]++;

                    if (ndr->wgId[2] * ndr->q.wgSize[2] >= ndr->q.gdSize[2]) {
                        ndr->wg_disp_rem = false;
                        break;
                    }
                }
            }
        }

        ++cuCount;
        curCu = nextSchedCu;
    }

    return scheduledSomething;
}
void
Shader::handshake(GpuDispatcher *_dispatcher)
{
    dispatcher = _dispatcher;
}
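// Perform a functional (timeless) memory access on behalf of a compute
// unit or the dispatcher. The request's virtual address is translated
// through the functional TLB port, and an access that straddles a Ruby
// cache-line boundary is split in two (req1 covering the lower addresses,
// req2 the upper ones) so each functional access stays within one line.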
void
Shader::doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data,
                           bool suppress_func_errors, int cu_id)
{
    unsigned block_size = RubySystem::getBlockSizeBytes();
    unsigned size = req->getSize();

    Addr tmp_addr;
    BaseTLB::Mode trans_mode;

    if (cmd == MemCmd::ReadReq) {
        trans_mode = BaseTLB::Read;
    } else if (cmd == MemCmd::WriteReq) {
        trans_mode = BaseTLB::Write;
    } else {
        fatal("unexpected MemCmd\n");
    }

    tmp_addr = req->getVaddr();
    Addr split_addr = roundDown(tmp_addr + size - 1, block_size);

    assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);

    // Misaligned access
    if (split_addr > tmp_addr) {
        RequestPtr req1, req2;
        req->splitOnVaddr(split_addr, req1, req2);

        PacketPtr pkt1 = new Packet(req1, cmd);
        PacketPtr pkt2 = new Packet(req2, cmd);

        functionalTLBAccess(pkt1, cu_id, trans_mode);
        functionalTLBAccess(pkt2, cu_id, trans_mode);

        PacketPtr new_pkt1 = new Packet(pkt1->req, cmd);
        PacketPtr new_pkt2 = new Packet(pkt2->req, cmd);

        new_pkt1->dataStatic(data);
        new_pkt2->dataStatic((uint8_t*)data + req1->getSize());

        if (suppress_func_errors) {
            new_pkt1->setSuppressFuncError();
            new_pkt2->setSuppressFuncError();
        }

        // fixme: this should be cuList[cu_id] if cu_id != n_cu
        // The latter requires a memPort in the dispatcher
        cuList[0]->memPort[0]->sendFunctional(new_pkt1);
        cuList[0]->memPort[0]->sendFunctional(new_pkt2);

        delete new_pkt1;
        delete new_pkt2;
        delete pkt1;
        delete pkt2;
    } else {
        PacketPtr pkt = new Packet(req, cmd);
        functionalTLBAccess(pkt, cu_id, trans_mode);
        PacketPtr new_pkt = new Packet(pkt->req, cmd);
        new_pkt->dataStatic(data);

        if (suppress_func_errors) {
            new_pkt->setSuppressFuncError();
        }

        // fixme: this should be cuList[cu_id] if cu_id != n_cu
        // The latter requires a memPort in the dispatcher
        cuList[0]->memPort[0]->sendFunctional(new_pkt);

        delete new_pkt;
        delete pkt;
    }
}
bool
Shader::busy()
{
    for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
        if (!cuList[i_cu]->isDone()) {
            return true;
        }
    }

    return false;
}
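// Queue a deferred increment: 'when' ticks from now (relative to the tick
// count captured at the start of the current exec() pass), add 'x' to
// *val. A hypothetical caller might use this to decrement some outstanding
// counter a few shader cycles in the future, e.g.:
//
//     shader->ScheduleAdd(&counter, shader->ticks(4), -1);
//
// (illustrative only; 'counter' is not a real field in this codebase)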
void
Shader::ScheduleAdd(uint32_t *val, Tick when, int x)
{
    sa_val.push_back(val);
    sa_when.push_back(tick_cnt + when);
    sa_x.push_back(x);
    ++sa_n;
}
Shader::TickEvent::TickEvent(Shader *_shader)
    : Event(CPU_Tick_Pri), shader(_shader)
{
}

void
Shader::TickEvent::process()
{
    if (shader->busy()) {
        shader->exec();
        shader->schedule(this, curTick() + shader->ticks(1));
    }
}

const char*
Shader::TickEvent::description() const
{
    return "Shader tick";
}
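// Walk the buffer in Ruby-cache-line-sized chunks so that no single
// functional request crosses a line boundary; each chunk is issued as its
// own request through doFunctionalAccess().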
void
Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                  MemCmd cmd, bool suppress_func_errors)
{
    uint8_t *data_buf = (uint8_t*)ptr;

    for (ChunkGenerator gen(address, size, RubySystem::getBlockSizeBytes());
         !gen.done(); gen.next()) {
        Request *req = new Request(0, gen.addr(), gen.size(), 0,
                                   cuList[0]->masterId(), 0, 0, 0);

        doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id);
        data_buf += gen.size();
        delete req;
    }
}
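// ReadMem/WriteMem are thin convenience wrappers around AccessMem. For
// example, a dispatcher-side caller could functionally fetch an 8-byte
// kernel argument with (hypothetical variable names):
//
//     uint64_t arg;
//     shader->ReadMem(arg_vaddr, &arg, sizeof(arg), cu_id);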
void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false);
}

void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq,
              suppress_func_errors);
}

void
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false);
}

void
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                 bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq,
              suppress_func_errors);
}
/*
 * Send a packet through the appropriate TLB functional port.
 * If cu_id == n_cu, then this is the dispatcher's TLB.
 * Otherwise it's the TLB of the cu_id compute unit.
 */
void
Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
{
    // update senderState. Need to know the gpuTc and the TLB mode
    pkt->senderState =
        new TheISA::GpuTLB::TranslationState(mode, gpuTc, false);

    if (cu_id == n_cu) {
        dispatcher->tlbPort->sendFunctional(pkt);
    } else {
        // Even when the perLaneTLB flag is turned on it's OK to send
        // all accesses through lane 0, since the lane number is not
        // known here. This doesn't matter for functional accesses.
        cuList[cu_id]->tlbPort[0]->sendFunctional(pkt);
    }

    /* safe_cast the senderState */
    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    delete sender_state->tlbEntry;
    delete pkt->senderState;
}
212 src/gpu-compute/shader.hh Normal file
@@ -0,0 +1,212 @@
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */
#ifndef __SHADER_HH__
#define __SHADER_HH__

#include <functional>
#include <string>

#include "arch/isa.hh"
#include "arch/isa_traits.hh"
#include "base/types.hh"
#include "cpu/simple/atomic.hh"
#include "cpu/simple/timing.hh"
#include "cpu/simple_thread.hh"
#include "cpu/thread_context.hh"
#include "cpu/thread_state.hh"
#include "enums/MemOpType.hh"
#include "enums/MemType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_tlb.hh"
#include "gpu-compute/lds_state.hh"
#include "gpu-compute/qstruct.hh"
#include "mem/page_table.hh"
#include "mem/port.hh"
#include "mem/request.hh"
#include "params/Shader.hh"
#include "sim/faults.hh"
#include "sim/process.hh"
#include "sim/sim_object.hh"

class BaseTLB;
class GpuDispatcher;

namespace TheISA
{
    class GpuTLB;
}
static const int LDS_SIZE = 65536;

// Class Shader: This describes a single shader instance. Most
// configurations will only have a single shader.

class Shader : public SimObject
{
  protected:
    // Shader's clock period in terms of number of ticks of curTick(),
    // aka the global simulation clock
    Tick clock;
  public:
    typedef ShaderParams Params;
    enum hsail_mode_e {SIMT, VECTOR_SCALAR};

    // clock-related functions: map to and from
    // simulation ticks and shader cycles
    Tick frequency() const { return SimClock::Frequency / clock; }

    Tick ticks(int numCycles) const { return (Tick)clock * numCycles; }

    Tick getClock() const { return clock; }
    Tick curCycle() const { return curTick() / clock; }
    Tick tickToCycles(Tick val) const { return val / clock; }
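    // Example: with gem5's default 1 ps tick, a 1 GHz shader clock makes
    // 'clock' 1000 ticks, so ticks(4) == 4000 and
    // frequency() == SimClock::Frequency / 1000 == 1 GHz.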

    SimpleThread *cpuThread;
    ThreadContext *gpuTc;
    BaseCPU *cpuPointer;

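    // Fires once per shader cycle while any compute unit is still busy;
    // TickEvent::process() (in shader.cc) re-runs exec() and reschedules
    // itself until busy() returns false.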
    class TickEvent : public Event
    {
      private:
        Shader *shader;

      public:
        TickEvent(Shader*);
        void process();
        const char* description() const;
    };

    TickEvent tickEvent;
    // Is this simulation going to be timing mode in the memory?
    bool timingSim;
    hsail_mode_e hsail_mode;

    // If set, issue an acquire packet at kernel launch
    int impl_kern_boundary_sync;
    // If set, generate a separate packet for acquire/release on
    // ld_acquire/st_release/atomic operations
    int separate_acquire_release;
    // If set, fetch returns may be coissued with instructions
    int coissue_return;
    // If set, always dump all 64 VGPRs to the trace
    int trace_vgpr_all;
    // Number of compute units (CUs) in the shader
    int n_cu;
    // Number of wavefront slots per CU
    int n_wf;
    // The size of global memory
    int globalMemSize;

    /*
     * Bytes/work-item for call instruction
     * The number of arguments for an HSAIL function will
     * vary. We simply determine the maximum # of arguments
     * required by any HSAIL function up front before the
     * simulation (during parsing of the BRIG) and record
     * that number here.
     */
    int funcargs_size;

    // Tracks which CU the round-robin dispatcher should try to
    // schedule next
    int nextSchedCu;
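    // sa_n/sa_val/sa_when/sa_x below form a parallel-array queue: entry i
    // of each vector describes one pending increment (see ScheduleAdd()
    // and the drain loop in exec()).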
    // Size of the scheduled add queue
    uint32_t sa_n;

    // Pointers to the values to be incremented
    std::vector<uint32_t*> sa_val;
    // When to do the increment
    std::vector<uint64_t> sa_when;
    // Amount to increment by
    std::vector<int32_t> sa_x;

    // List of Compute Units (CUs)
    std::vector<ComputeUnit*> cuList;
    uint64_t tick_cnt;
    uint64_t box_tick_cnt;
    uint64_t start_tick_cnt;

    GpuDispatcher *dispatcher;

    Shader(const Params *p);
    ~Shader();
    virtual void init();

    // Run shader
    void exec();

    // Check to see if shader is busy
    bool busy();

    // Schedule a 32-bit value to be incremented some time in the future
    void ScheduleAdd(uint32_t *val, Tick when, int x);
    bool processTimingPacket(PacketPtr pkt);

    void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                   MemCmd cmd, bool suppress_func_errors);

    void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id);

    void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id,
                 bool suppress_func_errors);

    void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id);

    void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id,
                  bool suppress_func_errors);

    void doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data,
                            bool suppress_func_errors, int cu_id);

    void
    registerCU(int cu_id, ComputeUnit *compute_unit)
    {
        cuList[cu_id] = compute_unit;
    }

    void handshake(GpuDispatcher *dispatcher);
    bool dispatch_workgroups(NDRange *ndr);
    Addr mmap(int length);
    void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode);
    void updateThreadContext(int tid);
    void hostWakeUp(BaseCPU *cpu);
};

#endif // __SHADER_HH__
108 src/gpu-compute/simple_pool_manager.cc Normal file
@@ -0,0 +1,108 @@
/*
 * Copyright (c) 2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: John Kalamatianos
 */

#include "gpu-compute/simple_pool_manager.hh"

#include "base/misc.hh"
// return the min number of elements that the manager can reserve given
// a request for "size" elements
uint32_t
SimplePoolManager::minAllocatedElements(uint32_t size)
{
    fatal_if(size == 0 || size > poolSize(), "Illegal VGPR region size=%d\n",
             size);
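    // Round the request up to the next multiple of minAllocation(). For
    // example, with minAllocation() == 4, a request for 10 elements
    // yields (4 - (10 % 4)) + 10 == 12.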
    return size % minAllocation() > 0 ?
        (minAllocation() - (size % minAllocation())) + size : size;
}
std::string
SimplePoolManager::printRegion()
{
    std::string _cout;

    if (_reservedGroups == 0) {
        _cout = "VRF is empty\n";
    } else {
        uint32_t reservedEntries = _reservedGroups * _regionSize;
        _cout = "VRF reserves " + std::to_string(reservedEntries) + " VGPRs\n";
    }

    return _cout;
}
bool
SimplePoolManager::canAllocate(uint32_t numRegions, uint32_t size)
{
    assert(numRegions * minAllocatedElements(size) <= poolSize());

    return _reservedGroups == 0;
}

void
SimplePoolManager::freeRegion(uint32_t firstIdx, uint32_t lastIdx)
{
    assert(_reservedGroups > 0);
    --_reservedGroups;

    if (!_reservedGroups)
        _nxtFreeIdx = 0;
}
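// The allocator below is a simple bump pointer: each allocation advances
// _nxtFreeIdx by the (rounded-up) region size and bumps _reservedGroups.
// freeRegion() ignores its index arguments and only rewinds the bump
// pointer once every reserved group has been released.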
uint32_t
SimplePoolManager::allocateRegion(const uint32_t size,
                                  uint32_t *reservedPoolSize)
{
    uint32_t actualSize = minAllocatedElements(size);
    uint32_t startIdx = _nxtFreeIdx;
    _nxtFreeIdx += actualSize;
    _regionSize = actualSize;
    assert(_nxtFreeIdx < poolSize());
    *reservedPoolSize = actualSize;
    ++_reservedGroups;

    return startIdx;
}
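// A region is an inclusive [first, last] index pair that may wrap around
// the end of the pool. For example, with poolSize() == 16 the region
// (14, 1) wraps and covers indices {14, 15, 0, 1}, so its size is
// 1 + 16 - 14 + 1 == 4.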
uint32_t
SimplePoolManager::regionSize(std::pair<uint32_t, uint32_t> &region)
{
    bool wrapAround = (region.first > region.second);

    if (!wrapAround) {
        return region.second - region.first + 1;
    } else {
        return region.second + poolSize() - region.first + 1;
    }
}
72 src/gpu-compute/simple_pool_manager.hh Normal file
@@ -0,0 +1,72 @@
/*
 * Copyright (c) 2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: John Kalamatianos
 */

#ifndef __SIMPLE_POOL_MANAGER_HH__
#define __SIMPLE_POOL_MANAGER_HH__

#include <cassert>
#include <cstdint>

#include "gpu-compute/pool_manager.hh"
// Simple Pool Manager: allows one region per pool. No region merging is
// supported.
class SimplePoolManager : public PoolManager
{
  public:
    SimplePoolManager(uint32_t minAlloc, uint32_t poolSize)
        : PoolManager(minAlloc, poolSize), _regionSize(0), _nxtFreeIdx(0),
          _reservedGroups(0)
    {
    }
    uint32_t minAllocatedElements(uint32_t size);
    std::string printRegion();
    bool canAllocate(uint32_t numRegions, uint32_t size);
    uint32_t allocateRegion(const uint32_t size, uint32_t *reservedPoolSize);
    void freeRegion(uint32_t firstIdx, uint32_t lastIdx);
    uint32_t regionSize(std::pair<uint32_t, uint32_t> &region);
  private:
    // actual size of a region (normalized to the minimum size that can
    // be reserved)
    uint32_t _regionSize;
    // next index at which to allocate a region
    uint32_t _nxtFreeIdx;
    // number of groups that reserve a region
    uint32_t _reservedGroups;
};
#endif // __SIMPLE_POOL_MANAGER_HH__

Some files were not shown because too many files have changed in this diff.