gpu-compute: AMD's baseline GPU model

Tony Gutierrez 2016-01-19 14:28:22 -05:00
parent 28e353e040
commit 1a7d3f9fcb
191 changed files with 95286 additions and 92 deletions

@@ -1065,7 +1065,9 @@ main = conf.Finish()
# Define the universe of supported ISAs
all_isa_list = [ ]
all_gpu_isa_list = [ ]
Export('all_isa_list')
Export('all_gpu_isa_list')
class CpuModel(object):
'''The CpuModel class encapsulates everything the ISA parser needs to
@@ -1121,9 +1123,11 @@ for bdir in [ base_dir ] + extras_dir_list:
SConscript(joinpath(root, 'SConsopts'))
all_isa_list.sort()
all_gpu_isa_list.sort()
sticky_vars.AddVariables(
EnumVariable('TARGET_ISA', 'Target ISA', 'alpha', all_isa_list),
EnumVariable('TARGET_GPU_ISA', 'Target GPU ISA', 'hsail', all_gpu_isa_list),
ListVariable('CPU_MODELS', 'CPU models',
sorted(n for n,m in CpuModel.dict.iteritems() if m.default),
sorted(CpuModel.dict.keys())),
@@ -1139,6 +1143,7 @@ sticky_vars.AddVariables(
BoolVariable('USE_FENV', 'Use <fenv.h> IEEE mode control', have_fenv),
BoolVariable('CP_ANNOTATE', 'Enable critical path annotation capability', False),
BoolVariable('USE_KVM', 'Enable hardware virtualized (KVM) CPU models', have_kvm),
BoolVariable('BUILD_GPU', 'Build the compute-GPU model', False),
EnumVariable('PROTOCOL', 'Coherence protocol for Ruby', 'None',
all_protocols),
EnumVariable('BACKTRACE_IMPL', 'Post-mortem dump implementation',
@@ -1146,9 +1151,9 @@ sticky_vars.AddVariables(
)
# These variables get exported to #defines in config/*.hh (see src/SConscript).
export_vars += ['USE_FENV', 'SS_COMPATIBLE_FP', 'TARGET_ISA', 'CP_ANNOTATE',
'USE_POSIX_CLOCK', 'USE_KVM', 'PROTOCOL', 'HAVE_PROTOBUF',
'HAVE_PERF_ATTR_EXCLUDE_HOST']
export_vars += ['USE_FENV', 'SS_COMPATIBLE_FP', 'TARGET_ISA', 'TARGET_GPU_ISA',
'CP_ANNOTATE', 'USE_POSIX_CLOCK', 'USE_KVM', 'PROTOCOL',
'HAVE_PROTOBUF', 'HAVE_PERF_ATTR_EXCLUDE_HOST']
###################################################
#
@@ -1226,6 +1231,7 @@ main.SConscript('ext/nomali/SConscript',
###################################################
main['ALL_ISA_LIST'] = all_isa_list
main['ALL_GPU_ISA_LIST'] = all_gpu_isa_list
all_isa_deps = {}
def make_switching_dir(dname, switch_headers, env):
# Generate the header. target[0] is the full path of the output
@@ -1258,6 +1264,35 @@ def make_switching_dir(dname, switch_headers, env):
Export('make_switching_dir')
def make_gpu_switching_dir(dname, switch_headers, env):
# Generate the header. target[0] is the full path of the output
# header to generate. 'source' is a dummy variable, since we get the
# list of ISAs from env['ALL_ISA_LIST'].
def gen_switch_hdr(target, source, env):
fname = str(target[0])
isa = env['TARGET_GPU_ISA'].lower()
try:
f = open(fname, 'w')
print >>f, '#include "%s/%s/%s"' % (dname, isa, basename(fname))
f.close()
except IOError:
print "Failed to create %s" % fname
raise
# Build SCons Action object. 'varlist' specifies env vars that this
# action depends on; when env['ALL_ISA_LIST'] changes these actions
# should get re-executed.
switch_hdr_action = MakeAction(gen_switch_hdr,
Transform("GENERATE"), varlist=['ALL_ISA_GPU_LIST'])
# Instantiate actions for each header
for hdr in switch_headers:
env.Command(hdr, [], switch_hdr_action)
Export('make_gpu_switching_dir')
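For reference, a standalone sketch of what gen_switch_hdr writes; the directory name 'arch' and the header name 'gpu_isa.hh' below are illustrative placeholders, not names taken from this diff:
from os.path import basename

def gen_switch_hdr_sketch(target_path, dname, gpu_isa):
    # Write a one-line forwarding header that pulls in the ISA-specific copy.
    with open(target_path, 'w') as f:
        f.write('#include "%s/%s/%s"\n' % (dname, gpu_isa, basename(target_path)))

# gen_switch_hdr_sketch('gpu_isa.hh', 'arch', 'hsail') writes the single line:
#     #include "arch/hsail/gpu_isa.hh"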
# all-isas -> all-deps -> all-environs -> all_targets
main.Alias('#all-isas', [])
main.Alias('#all-deps', '#all-isas')

build_opts/HSAIL_X86 Normal file

@@ -0,0 +1,5 @@
PROTOCOL = 'GPU_RfO'
TARGET_ISA = 'x86'
TARGET_GPU_ISA = 'hsail'
BUILD_GPU = True
CPU_MODELS = 'AtomicSimpleCPU,O3CPU,TimingSimpleCPU'
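With this file in place, the matching binary is built under build/HSAIL_X86 (for example, a gem5.opt target), following gem5's usual convention of naming the build directory after the build_opts file; the sticky variables above provide that build's default configuration.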

@@ -0,0 +1,3 @@
PROTOCOL = 'MOESI_AMD_Base'
TARGET_ISA = 'x86'
CPU_MODELS = 'AtomicSimpleCPU,O3CPU,TimingSimpleCPU'

@@ -0,0 +1,203 @@
#
# Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Lisa Hsu
#
# Configure the TLB hierarchy
# Places that would probably need to be modified if you
# want a different hierarchy are marked with a <Modify here ...>
# comment.
import m5
from m5.objects import *
def TLB_constructor(level):
constructor_call = "X86GPUTLB(size = options.L%(level)dTLBentries, \
assoc = options.L%(level)dTLBassoc, \
hitLatency = options.L%(level)dAccessLatency,\
missLatency2 = options.L%(level)dMissLatency,\
maxOutstandingReqs = options.L%(level)dMaxOutstandingReqs,\
accessDistance = options.L%(level)dAccessDistanceStat,\
clk_domain = SrcClockDomain(\
clock = options.GPUClock,\
voltage_domain = VoltageDomain(\
voltage = options.gpu_voltage)))" % locals()
return constructor_call
def Coalescer_constructor(level):
constructor_call = "TLBCoalescer(probesPerCycle = \
options.L%(level)dProbesPerCycle, \
coalescingWindow = options.L%(level)dCoalescingWindow,\
disableCoalescing = options.L%(level)dDisableCoalescing,\
clk_domain = SrcClockDomain(\
clock = options.GPUClock,\
voltage_domain = VoltageDomain(\
voltage = options.gpu_voltage)))" % locals()
return constructor_call
def create_TLB_Coalescer(options, my_level, my_index, TLB_name, Coalescer_name):
# arguments: options, TLB level, number of private structures for this Level,
# TLB name and Coalescer name
for i in xrange(my_index):
TLB_name.append(eval(TLB_constructor(my_level)))
Coalescer_name.append(eval(Coalescer_constructor(my_level)))
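An eval-free sketch of the same construction, assuming the option names used in the format strings above and the X86GPUTLB, SrcClockDomain and VoltageDomain objects imported from m5.objects:
def make_tlb_sketch(options, level):
    # Look up the per-level option, e.g. L1TLBentries, L2AccessLatency, ...
    opt = lambda name: getattr(options, 'L%d%s' % (level, name))
    return X86GPUTLB(size = opt('TLBentries'),
                     assoc = opt('TLBassoc'),
                     hitLatency = opt('AccessLatency'),
                     missLatency2 = opt('MissLatency'),
                     maxOutstandingReqs = opt('MaxOutstandingReqs'),
                     accessDistance = opt('AccessDistanceStat'),
                     clk_domain = SrcClockDomain(
                         clock = options.GPUClock,
                         voltage_domain = VoltageDomain(
                             voltage = options.gpu_voltage)))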
def config_tlb_hierarchy(options, system, shader_idx):
n_cu = options.num_compute_units
# Make this configurable, instead of using a hard-coded value. The dispatcher
# is always the last item in the system.cpu list.
dispatcher_idx = len(system.cpu) - 1
if options.TLB_config == "perLane":
num_TLBs = 64 * n_cu
elif options.TLB_config == "mono":
num_TLBs = 1
elif options.TLB_config == "perCU":
num_TLBs = n_cu
elif options.TLB_config == "2CU":
num_TLBs = n_cu >> 1
else:
print "Bad option for TLB Configuration."
sys.exit(1)
#----------------------------------------------------------------------------------------
# A visual representation of the TLB hierarchy
# for ease of configuration
# < Modify here the width and the number of levels if you want a different configuration >
# width is the number of TLBs of the given type (i.e., D-TLB, I-TLB etc) for this level
L1 = [{'name': 'sqc', 'width': options.num_sqc, 'TLBarray': [], 'CoalescerArray': []},
{'name': 'dispatcher', 'width': 1, 'TLBarray': [], 'CoalescerArray': []},
{'name': 'l1', 'width': num_TLBs, 'TLBarray': [], 'CoalescerArray': []}]
L2 = [{'name': 'l2', 'width': 1, 'TLBarray': [], 'CoalescerArray': []}]
L3 = [{'name': 'l3', 'width': 1, 'TLBarray': [], 'CoalescerArray': []}]
TLB_hierarchy = [L1, L2, L3]
#----------------------------------------------------------------------------------------
# Create the hierarchy
# Call the appropriate constructors and add objects to the system
for i in xrange(len(TLB_hierarchy)):
hierarchy_level = TLB_hierarchy[i]
level = i+1
for TLB_type in hierarchy_level:
TLB_index = TLB_type['width']
TLB_array = TLB_type['TLBarray']
Coalescer_array = TLB_type['CoalescerArray']
# If the sim calls for a fixed L1 TLB size across CUs,
# override the TLB entries option
if options.tot_L1TLB_size:
options.L1TLBentries = options.tot_L1TLB_size / num_TLBs
if options.L1TLBassoc > options.L1TLBentries:
options.L1TLBassoc = options.L1TLBentries
# call the constructors for the TLB and the Coalescer
create_TLB_Coalescer(options, level, TLB_index,\
TLB_array, Coalescer_array)
system_TLB_name = TLB_type['name'] + '_tlb'
system_Coalescer_name = TLB_type['name'] + '_coalescer'
# add the different TLB levels to the system
# Modify here if you want to make the TLB hierarchy a child of
# the shader.
exec('system.%s = TLB_array' % system_TLB_name)
exec('system.%s = Coalescer_array' % system_Coalescer_name)
#===========================================================
# Specify the TLB hierarchy (i.e., port connections)
# All TLBs but the last level TLB need to have a memSidePort (master)
#===========================================================
# Each TLB is connected with its Coalescer through a single port.
# There is a one-to-one mapping of TLBs to Coalescers at a given level
# This won't be modified no matter what the hierarchy looks like.
for i in xrange(len(TLB_hierarchy)):
hierarchy_level = TLB_hierarchy[i]
level = i+1
for TLB_type in hierarchy_level:
name = TLB_type['name']
for index in range(TLB_type['width']):
exec('system.%s_coalescer[%d].master[0] = \
system.%s_tlb[%d].slave[0]' % \
(name, index, name, index))
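As a concrete example, the exec() above for name = 'l1' and index = 0 performs system.l1_coalescer[0].master[0] = system.l1_tlb[0].slave[0]; a hedged, exec-free equivalent would be:
def connect_tlb_to_coalescer(system, name, index):
    # Same wiring as the exec() above, without building Python source text.
    coalescers = getattr(system, name + '_coalescer')
    tlbs = getattr(system, name + '_tlb')
    coalescers[index].master[0] = tlbs[index].slave[0]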
# Connect the cpuSidePort (slave) of all the coalescers in level 1
# < Modify here if you want a different configuration >
for TLB_type in L1:
name = TLB_type['name']
num_TLBs = TLB_type['width']
if name == 'l1': # L1 D-TLBs
tlb_per_cu = num_TLBs / n_cu
for cu_idx in range(n_cu):
if tlb_per_cu:
for tlb in range(tlb_per_cu):
exec('system.cpu[%d].CUs[%d].translation_port[%d] = \
system.l1_coalescer[%d].slave[%d]' % \
(shader_idx, cu_idx, tlb, cu_idx*tlb_per_cu+tlb, 0))
else:
exec('system.cpu[%d].CUs[%d].translation_port[%d] = \
system.l1_coalescer[%d].slave[%d]' % \
(shader_idx, cu_idx, tlb_per_cu, cu_idx / (n_cu / num_TLBs), cu_idx % (n_cu / num_TLBs)))
elif name == 'dispatcher': # Dispatcher TLB
for index in range(TLB_type['width']):
exec('system.cpu[%d].translation_port = \
system.dispatcher_coalescer[%d].slave[0]' % \
(dispatcher_idx, index))
elif name == 'sqc': # I-TLB
for index in range(n_cu):
sqc_tlb_index = index / options.cu_per_sqc
sqc_tlb_port_id = index % options.cu_per_sqc
exec('system.cpu[%d].CUs[%d].sqc_tlb_port = \
system.sqc_coalescer[%d].slave[%d]' % \
(shader_idx, index, sqc_tlb_index, sqc_tlb_port_id))
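A quick numeric check of the shared-TLB index arithmetic in the 'l1' branch above (values are illustrative; the script runs under Python 2, where '/' on integers already floors, so '//' is used here only to make that explicit):
n_cu, num_TLBs = 4, 2          # e.g. --TLB-config=2CU with four CUs
mapping = [(cu, cu // (n_cu // num_TLBs), cu % (n_cu // num_TLBs))
           for cu in range(n_cu)]
# mapping == [(0, 0, 0), (1, 0, 1), (2, 1, 0), (3, 1, 1)],
# i.e. CUs 0-1 share l1_coalescer[0] and CUs 2-3 share l1_coalescer[1].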
# Connect the memSidePorts (masters) of all the TLBs with the
# cpuSidePorts (slaves) of the Coalescers of the next level
# < Modify here if you want a different configuration >
# L1 <-> L2
l2_coalescer_index = 0
for TLB_type in L1:
name = TLB_type['name']
for index in range(TLB_type['width']):
exec('system.%s_tlb[%d].master[0] = \
system.l2_coalescer[0].slave[%d]' % \
(name, index, l2_coalescer_index))
l2_coalescer_index += 1
# L2 <-> L3
system.l2_tlb[0].master[0] = system.l3_coalescer[0].slave[0]
return system

@@ -0,0 +1,109 @@
#
# Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Myrto Papadopoulou
#
def tlb_options(parser):
#===================================================================
# TLB Configuration
#===================================================================
parser.add_option("--TLB-config", type="string", default="perCU",
help="Options are: perCU (default), mono, 2CU, or perLane")
#===================================================================
# L1 TLB Options (D-TLB, I-TLB, Dispatcher-TLB)
#===================================================================
parser.add_option("--L1TLBentries", type='int', default="32")
parser.add_option("--L1TLBassoc", type='int', default="32")
parser.add_option("--L1AccessLatency", type='int', default="1",
help="latency in gpu cycles")
parser.add_option("--L1MissLatency", type='int', default="750",
help="latency (in gpu cycles) of a page walk, "
"if this is a last level TLB")
parser.add_option("--L1MaxOutstandingReqs", type='int', default="64")
parser.add_option("--L1AccessDistanceStat", action="store_true")
parser.add_option("--tot-L1TLB-size", type="int", default="0")
#===================================================================
# L2 TLB Options
#===================================================================
parser.add_option("--L2TLBentries", type='int', default="4096")
parser.add_option("--L2TLBassoc", type='int', default="32")
parser.add_option("--L2AccessLatency", type='int', default="69",
help="latency in gpu cycles")
parser.add_option("--L2MissLatency", type='int', default="750",
help="latency (in gpu cycles) of a page walk, "
"if this is a last level TLB")
parser.add_option("--L2MaxOutstandingReqs", type='int', default="64")
parser.add_option("--L2AccessDistanceStat", action="store_true")
#===================================================================
# L3 TLB Options
#===================================================================
parser.add_option("--L3TLBentries", type='int', default="8192")
parser.add_option("--L3TLBassoc", type='int', default="32")
parser.add_option("--L3AccessLatency", type='int', default="150",
help="latency in gpu cycles")
parser.add_option("--L3MissLatency", type='int', default="750",
help="latency (in gpu cycles) of a page walk")
parser.add_option("--L3MaxOutstandingReqs", type='int', default="64")
parser.add_option("--L3AccessDistanceStat", action="store_true")
#===================================================================
# L1 TLBCoalescer Options
#===================================================================
parser.add_option("--L1ProbesPerCycle", type='int', default="2")
parser.add_option("--L1CoalescingWindow", type='int', default="1")
parser.add_option("--L1DisableCoalescing", action="store_true")
#===================================================================
# L2 TLBCoalescer Options
#===================================================================
parser.add_option("--L2ProbesPerCycle", type='int', default="2")
parser.add_option("--L2CoalescingWindow", type='int', default="1")
parser.add_option("--L2DisableCoalescing", action="store_true")
#===================================================================
# L3 TLBCoalescer Options
#===================================================================
parser.add_option("--L3ProbesPerCycle", type='int', default="2")
parser.add_option("--L3CoalescingWindow", type='int', default="1")
parser.add_option("--L3DisableCoalescing", action="store_true")

configs/example/apu_se.py Normal file

@@ -0,0 +1,499 @@
#
# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Sooraj Puthoor
#
import optparse, os, re
import math
import glob
import inspect
import m5
from m5.objects import *
from m5.util import addToPath
addToPath('../ruby')
addToPath('../common')
addToPath('../topologies')
import Options
import Ruby
import Simulation
import GPUTLBOptions, GPUTLBConfig
########################## Script Options ########################
def setOption(parser, opt_str, value = 1):
# check to make sure the option actually exists
if not parser.has_option(opt_str):
raise Exception("cannot find %s in list of possible options" % opt_str)
opt = parser.get_option(opt_str)
# set the value
exec("parser.values.%s = %s" % (opt.dest, value))
def getOption(parser, opt_str):
# check to make sure the option actually exists
if not parser.has_option(opt_str):
raise Exception("cannot find %s in list of possible options" % opt_str)
opt = parser.get_option(opt_str)
# get the value
exec("return_value = parser.values.%s" % opt.dest)
return return_value
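A hedged, exec-free sketch of the same two helpers: optparse exposes each option's destination attribute via get_option(opt_str).dest, and parsed values live on parser.values, so getattr/setattr suffice:
def setOptionAlt(parser, opt_str, value = 1):
    if not parser.has_option(opt_str):
        raise Exception("cannot find %s in list of possible options" % opt_str)
    setattr(parser.values, parser.get_option(opt_str).dest, value)

def getOptionAlt(parser, opt_str):
    if not parser.has_option(opt_str):
        raise Exception("cannot find %s in list of possible options" % opt_str)
    return getattr(parser.values, parser.get_option(opt_str).dest)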
# Adding script options
parser = optparse.OptionParser()
Options.addCommonOptions(parser)
Options.addSEOptions(parser)
parser.add_option("--cpu-only-mode", action="store_true", default=False,
help="APU mode. Used to take care of problems in "\
"Ruby.py while running APU protocols")
parser.add_option("-k", "--kernel-files",
help="file(s) containing GPU kernel code (colon separated)")
parser.add_option("-u", "--num-compute-units", type="int", default=1,
help="number of GPU compute units"),
parser.add_option("--num-cp", type="int", default=0,
help="Number of GPU Command Processors (CP)")
parser.add_option("--benchmark-root", help="Root of benchmark directory tree")
# not super important now, but to avoid putting the number 4 everywhere, make
# it an option/knob
parser.add_option("--cu-per-sqc", type="int", default=4, help="number of CUs " \
"sharing an SQC (icache, and thus icache TLB)")
parser.add_option("--simds-per-cu", type="int", default=4, help="SIMD units " \
"per CU")
parser.add_option("--wf-size", type="int", default=64,
help="Wavefront size (in workitems)")
parser.add_option("--sp-bypass-path-length", type="int", default=4, \
help="Number of stages of bypass path in vector ALU for Single Precision ops")
parser.add_option("--dp-bypass-path-length", type="int", default=4, \
help="Number of stages of bypass path in vector ALU for Double Precision ops")
# issue period per SIMD unit: number of cycles before issuing another vector
parser.add_option("--issue-period", type="int", default=4, \
help="Number of cycles per vector instruction issue period")
parser.add_option("--glbmem-wr-bus-width", type="int", default=32, \
help="VGPR to Coalescer (Global Memory) data bus width in bytes")
parser.add_option("--glbmem-rd-bus-width", type="int", default=32, \
help="Coalescer to VGPR (Global Memory) data bus width in bytes")
# Currently we only support 1 local memory pipe
parser.add_option("--shr-mem-pipes-per-cu", type="int", default=1, \
help="Number of Shared Memory pipelines per CU")
# Currently we only support 1 global memory pipe
parser.add_option("--glb-mem-pipes-per-cu", type="int", default=1, \
help="Number of Global Memory pipelines per CU")
parser.add_option("--wfs-per-simd", type="int", default=10, help="Number of " \
"WF slots per SIMD")
parser.add_option("--vreg-file-size", type="int", default=2048,
help="number of physical vector registers per SIMD")
parser.add_option("--bw-scalor", type="int", default=0,
help="bandwidth scalor for scalability analysis")
parser.add_option("--CPUClock", type="string", default="2GHz",
help="CPU clock")
parser.add_option("--GPUClock", type="string", default="1GHz",
help="GPU clock")
parser.add_option("--cpu-voltage", action="store", type="string",
default='1.0V',
help = """CPU voltage domain""")
parser.add_option("--gpu-voltage", action="store", type="string",
default='1.0V',
help = """GPU voltage domain""")
parser.add_option("--CUExecPolicy", type="string", default="OLDEST-FIRST",
help="WF exec policy (OLDEST-FIRST, ROUND-ROBIN)")
parser.add_option("--xact-cas-mode", action="store_true",
help="enable load_compare mode (transactional CAS)")
parser.add_option("--SegFaultDebug",action="store_true",
help="checks for GPU seg fault before TLB access")
parser.add_option("--FunctionalTLB",action="store_true",
help="Assumes TLB has no latency")
parser.add_option("--LocalMemBarrier",action="store_true",
help="Barrier does not wait for writethroughs to complete")
parser.add_option("--countPages", action="store_true",
help="Count Page Accesses and output in per-CU output files")
parser.add_option("--TLB-prefetch", type="int", help = "prefetch depth for "\
"TLBs")
parser.add_option("--pf-type", type="string", help="type of prefetch: "\
"PF_CU, PF_WF, PF_PHASE, PF_STRIDE")
parser.add_option("--pf-stride", type="int", help="set prefetch stride")
parser.add_option("--numLdsBanks", type="int", default=32,
help="number of physical banks per LDS module")
parser.add_option("--ldsBankConflictPenalty", type="int", default=1,
help="number of cycles per LDS bank conflict")
Ruby.define_options(parser)
#add TLB options to the parser
GPUTLBOptions.tlb_options(parser)
(options, args) = parser.parse_args()
# The GPU cache coherence protocols only work with the backing store
setOption(parser, "--access-backing-store")
# if benchmark root is specified explicitly, that overrides the search path
if options.benchmark_root:
benchmark_path = [options.benchmark_root]
else:
# Set default benchmark search path to current dir
benchmark_path = ['.']
########################## Sanity Check ########################
# Currently the gpu model requires ruby
if buildEnv['PROTOCOL'] == 'None':
fatal("GPU model requires ruby")
# Currently the gpu model requires only timing or detailed CPU
if not (options.cpu_type == "timing" or
options.cpu_type == "detailed"):
fatal("GPU model requires timing or detailed CPU")
# This file can support multiple compute units
assert(options.num_compute_units >= 1)
# Currently, the SQC (the GPU's I-cache) is shared by
# multiple compute units (CUs). The protocol works just fine
# even if the SQC is not shared. We override this option here
# so that the user need not set it explicitly (assuming a
# shared SQC is the common usage).
n_cu = options.num_compute_units
num_sqc = int(math.ceil(float(n_cu) / options.cu_per_sqc))
options.num_sqc = num_sqc # pass this to Ruby
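A small numeric check with illustrative values; the ceiling computed here matches the integer form used by the GPU random-test script later in this commit:
import math
n_cu, cu_per_sqc = 6, 4        # six CUs sharing SQCs in groups of four
assert int(math.ceil(float(n_cu) / cu_per_sqc)) == \
       (n_cu + cu_per_sqc - 1) // cu_per_sqc == 2   # two SQCs are needed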
########################## Creating the GPU system ########################
# shader is the GPU
shader = Shader(n_wf = options.wfs_per_simd,
clk_domain = SrcClockDomain(
clock = options.GPUClock,
voltage_domain = VoltageDomain(
voltage = options.gpu_voltage)))
# GPU_RfO (Read For Ownership) implements an SC/TSO memory model, while
# the other GPU protocols implement release consistency on the GPU side.
# All GPU protocols other than GPU_RfO must therefore make their writes
# visible to global memory, and read from global memory, at kernel
# boundaries. The pipeline initiates (or does not initiate) the
# acquire/release operation depending on the impl_kern_boundary_sync
# flag: when the flag is true, the pipeline initiates an acquire/release
# operation at each kernel boundary.
if buildEnv['PROTOCOL'] == 'GPU_RfO':
shader.impl_kern_boundary_sync = False
else:
shader.impl_kern_boundary_sync = True
# Switching off per-lane TLB by default
per_lane = False
if options.TLB_config == "perLane":
per_lane = True
# List of compute units; one GPU can have multiple compute units
compute_units = []
for i in xrange(n_cu):
compute_units.append(ComputeUnit(cu_id = i, perLaneTLB = per_lane,
num_SIMDs = options.simds_per_cu,
wfSize = options.wf_size,
spbypass_pipe_length = options.sp_bypass_path_length,
dpbypass_pipe_length = options.dp_bypass_path_length,
issue_period = options.issue_period,
coalescer_to_vrf_bus_width = \
options.glbmem_rd_bus_width,
vrf_to_coalescer_bus_width = \
options.glbmem_wr_bus_width,
num_global_mem_pipes = \
options.glb_mem_pipes_per_cu,
num_shared_mem_pipes = \
options.shr_mem_pipes_per_cu,
n_wf = options.wfs_per_simd,
execPolicy = options.CUExecPolicy,
xactCasMode = options.xact_cas_mode,
debugSegFault = options.SegFaultDebug,
functionalTLB = options.FunctionalTLB,
localMemBarrier = options.LocalMemBarrier,
countPages = options.countPages,
localDataStore = \
LdsState(banks = options.numLdsBanks,
bankConflictPenalty = \
options.ldsBankConflictPenalty)))
wavefronts = []
vrfs = []
for j in xrange(options.simds_per_cu):
for k in xrange(shader.n_wf):
wavefronts.append(Wavefront(simdId = j, wf_slot_id = k))
vrfs.append(VectorRegisterFile(simd_id=j,
num_regs_per_simd=options.vreg_file_size))
compute_units[-1].wavefronts = wavefronts
compute_units[-1].vector_register_file = vrfs
if options.TLB_prefetch:
compute_units[-1].prefetch_depth = options.TLB_prefetch
compute_units[-1].prefetch_prev_type = options.pf_type
# attach the LDS and the CU to the bus (actually a Bridge)
compute_units[-1].ldsPort = compute_units[-1].ldsBus.slave
compute_units[-1].ldsBus.master = compute_units[-1].localDataStore.cuPort
# Attach compute units to GPU
shader.CUs = compute_units
########################## Creating the CPU system ########################
options.num_cpus = options.num_cpus
# The shader core will be whatever is after the CPU cores are accounted for
shader_idx = options.num_cpus
# The command processor will be whatever is after the shader is accounted for
cp_idx = shader_idx + 1
cp_list = []
# List of CPUs
cpu_list = []
# We only support timing mode for shader and memory
shader.timing = True
mem_mode = 'timing'
# create the cpus
for i in range(options.num_cpus):
cpu = None
if options.cpu_type == "detailed":
cpu = DerivO3CPU(cpu_id=i,
clk_domain = SrcClockDomain(
clock = options.CPUClock,
voltage_domain = VoltageDomain(
voltage = options.cpu_voltage)))
elif options.cpu_type == "timing":
cpu = TimingSimpleCPU(cpu_id=i,
clk_domain = SrcClockDomain(
clock = options.CPUClock,
voltage_domain = VoltageDomain(
voltage = options.cpu_voltage)))
else:
fatal("Atomic CPU not supported/tested")
cpu_list.append(cpu)
# create the command processors
for i in xrange(options.num_cp):
cp = None
if options.cpu_type == "detailed":
cp = DerivO3CPU(cpu_id = options.num_cpus + i,
clk_domain = SrcClockDomain(
clock = options.CPUClock,
voltage_domain = VoltageDomain(
voltage = options.cpu_voltage)))
elif options.cpu_type == 'timing':
cp = TimingSimpleCPU(cpu_id=options.num_cpus + i,
clk_domain = SrcClockDomain(
clock = options.CPUClock,
voltage_domain = VoltageDomain(
voltage = options.cpu_voltage)))
else:
fatal("Atomic CPU not supported/tested")
cp_list = cp_list + [cp]
########################## Creating the GPU dispatcher ########################
# Dispatcher dispatches work from host CPU to GPU
host_cpu = cpu_list[0]
dispatcher = GpuDispatcher()
########################## Create and assign the workload ########################
# Check for rel_path in elements of base_list using test, returning
# the first full path that satisfies test
def find_path(base_list, rel_path, test):
for base in base_list:
if not base:
# base could be None if environment var not set
continue
full_path = os.path.join(base, rel_path)
if test(full_path):
return full_path
fatal("%s not found in %s" % (rel_path, base_list))
def find_file(base_list, rel_path):
return find_path(base_list, rel_path, os.path.isfile)
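A hedged usage sketch of the two helpers (the paths are made up):
# bases = ['.', '/opt/benchmarks']
# find_path(bases, 'bin/mytest', os.path.isfile) returns './bin/mytest' if
# that file exists, otherwise '/opt/benchmarks/bin/mytest', and calls
# fatal() if the relative path is found under neither base directory.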
executable = find_path(benchmark_path, options.cmd, os.path.exists)
# it's common for a benchmark to be in a directory with the same
# name as the executable, so we handle that automatically
if os.path.isdir(executable):
benchmark_path = [executable]
executable = find_file(benchmark_path, options.cmd)
if options.kernel_files:
kernel_files = [find_file(benchmark_path, f)
for f in options.kernel_files.split(':')]
else:
# if kernel_files is not set, see if there's a unique .asm file
# in the same directory as the executable
kernel_path = os.path.dirname(executable)
kernel_files = glob.glob(os.path.join(kernel_path, '*.asm'))
if kernel_files:
print "Using GPU kernel code file(s)", ",".join(kernel_files)
else:
fatal("Can't locate kernel code (.asm) in " + kernel_path)
# OpenCL driver
driver = ClDriver(filename="hsa", codefile=kernel_files)
for cpu in cpu_list:
cpu.workload = LiveProcess(executable = executable,
cmd = [options.cmd] + options.options.split(),
drivers = [driver])
for cp in cp_list:
cp.workload = host_cpu.workload
########################## Create the overall system ########################
# Full list of processing cores in the system. Note that
# dispatcher is also added to cpu_list although it is
# not a processing element
cpu_list = cpu_list + [shader] + cp_list + [dispatcher]
# creating the overall system
# notice the cpu list is explicitly added as a parameter to System
system = System(cpu = cpu_list,
mem_ranges = [AddrRange(options.mem_size)],
cache_line_size = options.cacheline_size,
mem_mode = mem_mode)
system.voltage_domain = VoltageDomain(voltage = options.sys_voltage)
system.clk_domain = SrcClockDomain(clock = options.sys_clock,
voltage_domain = system.voltage_domain)
# configure the TLB hierarchy
GPUTLBConfig.config_tlb_hierarchy(options, system, shader_idx)
# create Ruby system
system.piobus = IOXBar(width=32, response_latency=0,
frontend_latency=0, forward_latency=0)
Ruby.create_system(options, None, system)
system.ruby.clk_domain = SrcClockDomain(clock = options.ruby_clock,
voltage_domain = system.voltage_domain)
# attach the CPU ports to Ruby
for i in range(options.num_cpus):
ruby_port = system.ruby._cpu_ports[i]
# Create interrupt controller
system.cpu[i].createInterruptController()
# Connect cache port's to ruby
system.cpu[i].icache_port = ruby_port.slave
system.cpu[i].dcache_port = ruby_port.slave
ruby_port.mem_master_port = system.piobus.slave
if buildEnv['TARGET_ISA'] == "x86":
system.cpu[i].interrupts[0].pio = system.piobus.master
system.cpu[i].interrupts[0].int_master = system.piobus.slave
system.cpu[i].interrupts[0].int_slave = system.piobus.master
# attach CU ports to Ruby
# Because of the peculiarities of the CP core, you may have 1 CPU but 2
# sequencers and thus 2 _cpu_ports created. Your GPUs shouldn't be
# hooked up until after the CP. To make this script generic, figure out
# the index as below, but note that this assumes there is one sequencer
# per compute unit and one sequencer per SQC for the math to work out
# correctly.
gpu_port_idx = len(system.ruby._cpu_ports) \
- options.num_compute_units - options.num_sqc
gpu_port_idx = gpu_port_idx - options.num_cp * 2
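A quick sanity check of the port-index arithmetic, with illustrative counts (2 CPU sequencer ports, 4 CUs, 1 SQC, 1 CP) and assuming the CPU sequencers come first in ruby._cpu_ports, as the loops below rely on:
num_ports = 2 + 4 + 1 + 1 * 2       # CPU seqs + CU seqs + SQC seqs + 2 per CP
gpu_port_idx = num_ports - 4 - 1    # strip the CU and SQC sequencers
gpu_port_idx = gpu_port_idx - 1 * 2 # strip the two CP sequencers
assert gpu_port_idx == 2            # CU/SQC ports start right after the CPUs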
wavefront_size = options.wf_size
for i in xrange(n_cu):
# The pipeline issues wavefront_size number of uncoalesced requests
# in one GPU issue cycle. Hence wavefront_size mem ports.
for j in xrange(wavefront_size):
system.cpu[shader_idx].CUs[i].memory_port[j] = \
system.ruby._cpu_ports[gpu_port_idx].slave[j]
gpu_port_idx += 1
for i in xrange(n_cu):
if i > 0 and not i % options.cu_per_sqc:
print "incrementing idx on ", i
gpu_port_idx += 1
system.cpu[shader_idx].CUs[i].sqc_port = \
system.ruby._cpu_ports[gpu_port_idx].slave
gpu_port_idx = gpu_port_idx + 1
# attach CP ports to Ruby
for i in xrange(options.num_cp):
system.cpu[cp_idx].createInterruptController()
system.cpu[cp_idx].dcache_port = \
system.ruby._cpu_ports[gpu_port_idx + i * 2].slave
system.cpu[cp_idx].icache_port = \
system.ruby._cpu_ports[gpu_port_idx + i * 2 + 1].slave
system.cpu[cp_idx].interrupts[0].pio = system.piobus.master
system.cpu[cp_idx].interrupts[0].int_master = system.piobus.slave
system.cpu[cp_idx].interrupts[0].int_slave = system.piobus.master
cp_idx = cp_idx + 1
# connect dispatcher to the system.piobus
dispatcher.pio = system.piobus.master
dispatcher.dma = system.piobus.slave
################# Connect the CPU and GPU via GPU Dispatcher ###################
# The CPU rings the GPU doorbell through this interface to notify it of a
# pending task, and the GPU uses the same interface to notify the CPU of
# task completion. The communication happens through the emulated driver.
# Note this implicit setting of the cpu_pointer, shader_pointer and tlb array
# parameters must be after the explicit setting of the System cpu list
shader.cpu_pointer = host_cpu
dispatcher.cpu = host_cpu
dispatcher.shader_pointer = shader
dispatcher.cl_driver = driver
########################## Start simulation ########################
root = Root(system=system, full_system=False)
m5.ticks.setGlobalFrequency('1THz')
if options.abs_max_tick:
maxtick = options.abs_max_tick
else:
maxtick = m5.MaxTick
# Benchmarks support work item annotations
Simulation.setWorkCountOptions(system, options)
# Checkpointing is not supported by APU model
if (options.checkpoint_dir != None or
options.checkpoint_restore != None):
fatal("Checkpointing not supported by apu model")
checkpoint_dir = None
m5.instantiate(checkpoint_dir)
# Map workload to this address space
host_cpu.workload[0].map(0x10000000, 0x200000000, 4096)
exit_event = m5.simulate(maxtick)
print "Ticks:", m5.curTick()
print 'Exiting because ', exit_event.getCause()
sys.exit(exit_event.getCode())

@@ -0,0 +1,187 @@
#
# Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Brad Beckmann
#
import m5
from m5.objects import *
from m5.defines import buildEnv
from m5.util import addToPath
import os, optparse, sys
addToPath('../common')
addToPath('../ruby')
addToPath('../topologies')
import Options
import Ruby
# Get paths we might need.
config_path = os.path.dirname(os.path.abspath(__file__))
config_root = os.path.dirname(config_path)
m5_root = os.path.dirname(config_root)
parser = optparse.OptionParser()
Options.addCommonOptions(parser)
parser.add_option("--maxloads", metavar="N", default=100,
help="Stop after N loads")
parser.add_option("-f", "--wakeup_freq", metavar="N", default=10,
help="Wakeup every N cycles")
parser.add_option("-u", "--num-compute-units", type="int", default=1,
help="number of compute units in the GPU")
parser.add_option("--numCPs", type="int", default=0,
help="Number of GPU Command Processors (CP)")
# not super important now, but to avoid putting the number 4 everywhere, make
# it an option/knob
parser.add_option("--cu-per-sqc", type="int", default=4, help="number of CUs \
sharing an SQC (icache, and thus icache TLB)")
parser.add_option("--simds-per-cu", type="int", default=4, help="SIMD units " \
"per CU")
parser.add_option("--wf-size", type="int", default=64,
help="Wavefront size(in workitems)")
parser.add_option("--wfs-per-simd", type="int", default=10, help="Number of " \
"WF slots per SIMD")
#
# Add the ruby specific and protocol specific options
#
Ruby.define_options(parser)
execfile(os.path.join(config_root, "common", "Options.py"))
(options, args) = parser.parse_args()
#
# Set the default cache size and associativity to be very small to encourage
# races between requests and writebacks.
#
options.l1d_size="256B"
options.l1i_size="256B"
options.l2_size="512B"
options.l3_size="1kB"
options.l1d_assoc=2
options.l1i_assoc=2
options.l2_assoc=2
options.l3_assoc=2
# This file can support multiple compute units
assert(options.num_compute_units >= 1)
n_cu = options.num_compute_units
options.num_sqc = int((n_cu + options.cu_per_sqc - 1) / options.cu_per_sqc)
if args:
print "Error: script doesn't take any positional arguments"
sys.exit(1)
#
# Create the ruby random tester
#
# Check for the GPU_RfO protocol. Other GPU protocols are non-SC and will
# not work with the Ruby random tester.
assert(buildEnv['PROTOCOL'] == 'GPU_RfO')
# The GPU_RfO protocol does not support cache flushes
check_flush = False
tester = RubyTester(check_flush=check_flush,
checks_to_complete=options.maxloads,
wakeup_frequency=options.wakeup_freq,
deadlock_threshold=1000000)
#
# Create the M5 system. Note that the Memory Object isn't
# actually used by the rubytester, but is included to support the
# M5 memory size == Ruby memory size checks
#
system = System(cpu=tester, mem_ranges=[AddrRange(options.mem_size)])
# Create a top-level voltage domain and clock domain
system.voltage_domain = VoltageDomain(voltage=options.sys_voltage)
system.clk_domain = SrcClockDomain(clock=options.sys_clock,
voltage_domain=system.voltage_domain)
Ruby.create_system(options, False, system)
# Create a separate clock domain for Ruby
system.ruby.clk_domain = SrcClockDomain(clock=options.ruby_clock,
voltage_domain=system.voltage_domain)
tester.num_cpus = len(system.ruby._cpu_ports)
#
# The tester is most effective when randomization is turned on and
# artificial delay is randomly inserted on messages
#
system.ruby.randomization = True
for ruby_port in system.ruby._cpu_ports:
#
# Tie the ruby tester ports to the ruby cpu read and write ports
#
if ruby_port.support_data_reqs and ruby_port.support_inst_reqs:
tester.cpuInstDataPort = ruby_port.slave
elif ruby_port.support_data_reqs:
tester.cpuDataPort = ruby_port.slave
elif ruby_port.support_inst_reqs:
tester.cpuInstPort = ruby_port.slave
# Do not automatically retry stalled Ruby requests
ruby_port.no_retry_on_stall = True
#
# Tell each sequencer this is the ruby tester so that it
# copies the subblock back to the checker
#
ruby_port.using_ruby_tester = True
# -----------------------
# run simulation
# -----------------------
root = Root( full_system = False, system = system )
root.system.mem_mode = 'timing'
# Not much point in this being higher than the L1 latency
m5.ticks.setGlobalFrequency('1ns')
# instantiate configuration
m5.instantiate()
# simulate until program terminates
exit_event = m5.simulate(options.abs_max_tick)
print 'Exiting @ tick', m5.curTick(), 'because', exit_event.getCause()

@@ -0,0 +1,134 @@
#
# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Sooraj Puthoor, Lisa Hsu
#
import math
import m5
from m5.objects import *
from m5.defines import buildEnv
from m5.util import convert
from CntrlBase import *
from Cluster import Cluster
#
# Note: the L1 Cache latency is only used by the sequencer on fast path hits
#
class L1Cache(RubyCache):
latency = 1
resourceStalls = False
def create(self, size, assoc, options):
self.size = MemorySize(size)
self.assoc = assoc
self.replacement_policy = PseudoLRUReplacementPolicy()
#
# Note: the L2 Cache latency is not currently used
#
class L2Cache(RubyCache):
latency = 10
resourceStalls = False
def create(self, size, assoc, options):
self.size = MemorySize(size)
self.assoc = assoc
self.replacement_policy = PseudoLRUReplacementPolicy()
class CPCntrl(AMD_Base_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.cntrl_id = self.cntrlCount()
self.L1Icache = L1Cache()
self.L1Icache.create(options.l1i_size, options.l1i_assoc, options)
self.L1D0cache = L1Cache()
self.L1D0cache.create(options.l1d_size, options.l1d_assoc, options)
self.L1D1cache = L1Cache()
self.L1D1cache.create(options.l1d_size, options.l1d_assoc, options)
self.L2cache = L2Cache()
self.L2cache.create(options.l2_size, options.l2_assoc, options)
self.sequencer = RubySequencer()
self.sequencer.version = self.seqCount()
self.sequencer.icache = self.L1Icache
self.sequencer.dcache = self.L1D0cache
self.sequencer.ruby_system = ruby_system
self.sequencer.coreid = 0
self.sequencer.is_cpu_sequencer = True
self.sequencer1 = RubySequencer()
self.sequencer1.version = self.seqCount()
self.sequencer1.icache = self.L1Icache
self.sequencer1.dcache = self.L1D1cache
self.sequencer1.ruby_system = ruby_system
self.sequencer1.coreid = 1
self.sequencer1.is_cpu_sequencer = True
self.issue_latency = options.cpu_to_dir_latency
self.send_evictions = send_evicts(options)
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
def define_options(parser):
parser.add_option("--cpu-to-dir-latency", type="int", default=15)
def construct(options, system, ruby_system):
if (buildEnv['PROTOCOL'] != 'GPU_VIPER' or
buildEnv['PROTOCOL'] != 'GPU_VIPER_Region' or
buildEnv['PROTOCOL'] != 'GPU_VIPER_Baseline'):
panic("This script requires VIPER based protocols \
to be built.")
cpu_sequencers = []
cpuCluster = None
cpuCluster = Cluster(name="CPU Cluster", extBW = 8, intBW=8) # 16 GB/s
for i in xrange((options.num_cpus + 1) / 2):
cp_cntrl = CPCntrl()
cp_cntrl.create(options, ruby_system, system)
# Connect the CP controllers to the ruby network
cp_cntrl.requestFromCore = ruby_system.network.slave
cp_cntrl.responseFromCore = ruby_system.network.slave
cp_cntrl.unblockFromCore = ruby_system.network.slave
cp_cntrl.probeToCore = ruby_system.network.master
cp_cntrl.responseToCore = ruby_system.network.master
exec("system.cp_cntrl%d = cp_cntrl" % i)
#
# Add controllers and sequencers to the appropriate lists
#
cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1])
cpuCluster.add(cp_cntrl)
return cpu_sequencers, cpuCluster

configs/ruby/GPU_RfO.py Normal file

@@ -0,0 +1,751 @@
#
# Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Lisa Hsu
#
import math
import m5
from m5.objects import *
from m5.defines import buildEnv
from Ruby import create_topology
from Ruby import send_evicts
from Cluster import Cluster
from Crossbar import Crossbar
class CntrlBase:
_seqs = 0
@classmethod
def seqCount(cls):
# Use SeqCount not class since we need global count
CntrlBase._seqs += 1
return CntrlBase._seqs - 1
_cntrls = 0
@classmethod
def cntrlCount(cls):
# Use CntlCount not class since we need global count
CntrlBase._cntrls += 1
return CntrlBase._cntrls - 1
_version = 0
@classmethod
def versionCount(cls):
cls._version += 1 # Use count for this particular type
return cls._version - 1
class TccDirCache(RubyCache):
size = "512kB"
assoc = 16
resourceStalls = False
def create(self, options):
self.size = MemorySize(options.tcc_size)
self.size.value += (options.num_compute_units *
(MemorySize(options.tcp_size).value) *
options.tcc_dir_factor) / long(options.num_tccs)
self.start_index_bit = math.log(options.cacheline_size, 2) + \
math.log(options.num_tccs, 2)
self.replacement_policy = PseudoLRUReplacementPolicy()
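Numerically, with the option defaults defined in define_options() below (tcc_size = 256kB, tcp_size = 16kB, tcc_dir_factor = 4, num_tccs = 1) and an assumed four compute units, the directory is sized to track the TCC plus tcc_dir_factor times the aggregate TCP capacity:
tcc, tcp = 256 * 1024, 16 * 1024    # bytes
cus, factor, tccs = 4, 4, 1
dir_bytes = tcc + (cus * tcp * factor) // tccs
assert dir_bytes == 512 * 1024      # 512kB of TCC-directory reach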
class L1DCache(RubyCache):
resourceStalls = False
def create(self, options):
self.size = MemorySize(options.l1d_size)
self.assoc = options.l1d_assoc
self.replacement_policy = PseudoLRUReplacementPolicy()
class L1ICache(RubyCache):
resourceStalls = False
def create(self, options):
self.size = MemorySize(options.l1i_size)
self.assoc = options.l1i_assoc
self.replacement_policy = PseudoLRUReplacementPolicy()
class L2Cache(RubyCache):
resourceStalls = False
def create(self, options):
self.size = MemorySize(options.l2_size)
self.assoc = options.l2_assoc
self.replacement_policy = PseudoLRUReplacementPolicy()
class CPCntrl(CorePair_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.L1Icache = L1ICache()
self.L1Icache.create(options)
self.L1D0cache = L1DCache()
self.L1D0cache.create(options)
self.L1D1cache = L1DCache()
self.L1D1cache.create(options)
self.L2cache = L2Cache()
self.L2cache.create(options)
self.sequencer = RubySequencer()
self.sequencer.icache_hit_latency = 2
self.sequencer.dcache_hit_latency = 2
self.sequencer.version = self.seqCount()
self.sequencer.icache = self.L1Icache
self.sequencer.dcache = self.L1D0cache
self.sequencer.ruby_system = ruby_system
self.sequencer.coreid = 0
self.sequencer.is_cpu_sequencer = True
self.sequencer1 = RubySequencer()
self.sequencer1.version = self.seqCount()
self.sequencer1.icache = self.L1Icache
self.sequencer1.dcache = self.L1D1cache
self.sequencer1.icache_hit_latency = 2
self.sequencer1.dcache_hit_latency = 2
self.sequencer1.ruby_system = ruby_system
self.sequencer1.coreid = 1
self.sequencer1.is_cpu_sequencer = True
self.issue_latency = options.cpu_to_dir_latency
self.send_evictions = send_evicts(options)
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
class TCPCache(RubyCache):
assoc = 8
dataArrayBanks = 16
tagArrayBanks = 4
dataAccessLatency = 4
tagAccessLatency = 1
def create(self, options):
self.size = MemorySize(options.tcp_size)
self.replacement_policy = PseudoLRUReplacementPolicy()
class TCPCntrl(TCP_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.L1cache = TCPCache(tagAccessLatency = options.TCP_latency)
self.L1cache.resourceStalls = options.no_resource_stalls
self.L1cache.create(options)
self.coalescer = RubyGPUCoalescer()
self.coalescer.version = self.seqCount()
self.coalescer.icache = self.L1cache
self.coalescer.dcache = self.L1cache
self.coalescer.ruby_system = ruby_system
self.coalescer.support_inst_reqs = False
self.coalescer.is_cpu_sequencer = False
self.coalescer.max_outstanding_requests = options.simds_per_cu * \
options.wfs_per_simd * \
options.wf_size
self.sequencer = RubySequencer()
self.sequencer.version = self.seqCount()
self.sequencer.icache = self.L1cache
self.sequencer.dcache = self.L1cache
self.sequencer.ruby_system = ruby_system
self.sequencer.is_cpu_sequencer = True
self.use_seq_not_coal = False
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
def createCP(self, options, ruby_system, system):
self.version = self.versionCount()
self.L1cache = TCPCache(tagAccessLatency = options.TCP_latency)
self.L1cache.resourceStalls = options.no_resource_stalls
self.L1cache.create(options)
self.coalescer = RubyGPUCoalescer()
self.coalescer.version = self.seqCount()
self.coalescer.icache = self.L1cache
self.coalescer.dcache = self.L1cache
self.coalescer.ruby_system = ruby_system
self.coalescer.support_inst_reqs = False
self.coalescer.is_cpu_sequencer = False
self.sequencer = RubySequencer()
self.sequencer.version = self.seqCount()
self.sequencer.icache = self.L1cache
self.sequencer.dcache = self.L1cache
self.sequencer.ruby_system = ruby_system
self.sequencer.is_cpu_sequencer = True
self.use_seq_not_coal = True
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
class SQCCache(RubyCache):
size = "32kB"
assoc = 8
dataArrayBanks = 16
tagArrayBanks = 4
dataAccessLatency = 4
tagAccessLatency = 1
def create(self, options):
self.replacement_policy = PseudoLRUReplacementPolicy()
class SQCCntrl(SQC_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.L1cache = SQCCache()
self.L1cache.create(options)
self.L1cache.resourceStalls = options.no_resource_stalls
self.sequencer = RubySequencer()
self.sequencer.version = self.seqCount()
self.sequencer.icache = self.L1cache
self.sequencer.dcache = self.L1cache
self.sequencer.ruby_system = ruby_system
self.sequencer.support_data_reqs = False
self.sequencer.is_cpu_sequencer = False
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
def createCP(self, options, ruby_system, system):
self.version = self.versionCount()
self.L1cache = SQCCache()
self.L1cache.create(options)
self.L1cache.resourceStalls = options.no_resource_stalls
self.sequencer = RubySequencer()
self.sequencer.version = self.seqCount()
self.sequencer.icache = self.L1cache
self.sequencer.dcache = self.L1cache
self.sequencer.ruby_system = ruby_system
self.sequencer.support_data_reqs = False
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
class TCC(RubyCache):
assoc = 16
dataAccessLatency = 8
tagAccessLatency = 2
resourceStalls = True
def create(self, options):
self.size = MemorySize(options.tcc_size)
self.size = self.size / options.num_tccs
self.dataArrayBanks = 256 / options.num_tccs #number of data banks
self.tagArrayBanks = 256 / options.num_tccs #number of tag banks
if ((self.size.value / long(self.assoc)) < 128):
self.size.value = long(128 * self.assoc)
self.start_index_bit = math.log(options.cacheline_size, 2) + \
math.log(options.num_tccs, 2)
self.replacement_policy = PseudoLRUReplacementPolicy()
class TCCCntrl(TCC_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.L2cache = TCC()
self.L2cache.create(options)
self.l2_response_latency = options.TCC_latency
self.number_of_TBEs = 2048
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
def connectWireBuffers(self, req_to_tccdir, resp_to_tccdir,
tcc_unblock_to_tccdir, req_to_tcc,
probe_to_tcc, resp_to_tcc):
self.w_reqToTCCDir = req_to_tccdir
self.w_respToTCCDir = resp_to_tccdir
self.w_TCCUnblockToTCCDir = tcc_unblock_to_tccdir
self.w_reqToTCC = req_to_tcc
self.w_probeToTCC = probe_to_tcc
self.w_respToTCC = resp_to_tcc
class TCCDirCntrl(TCCdir_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.directory = TccDirCache()
self.directory.create(options)
self.number_of_TBEs = 1024
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
def connectWireBuffers(self, req_to_tccdir, resp_to_tccdir,
tcc_unblock_to_tccdir, req_to_tcc,
probe_to_tcc, resp_to_tcc):
self.w_reqToTCCDir = req_to_tccdir
self.w_respToTCCDir = resp_to_tccdir
self.w_TCCUnblockToTCCDir = tcc_unblock_to_tccdir
self.w_reqToTCC = req_to_tcc
self.w_probeToTCC = probe_to_tcc
self.w_respToTCC = resp_to_tcc
class L3Cache(RubyCache):
assoc = 8
dataArrayBanks = 256
tagArrayBanks = 256
def create(self, options, ruby_system, system):
self.size = MemorySize(options.l3_size)
self.size.value /= options.num_dirs
self.dataArrayBanks /= options.num_dirs
self.tagArrayBanks /= options.num_dirs
self.dataArrayBanks /= options.num_dirs
self.tagArrayBanks /= options.num_dirs
self.dataAccessLatency = options.l3_data_latency
self.tagAccessLatency = options.l3_tag_latency
self.resourceStalls = options.no_resource_stalls
self.replacement_policy = PseudoLRUReplacementPolicy()
class L3Cntrl(L3Cache_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.L3cache = L3Cache()
self.L3cache.create(options, ruby_system, system)
self.l3_response_latency = max(self.L3cache.dataAccessLatency,
self.L3cache.tagAccessLatency)
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
req_to_l3, probe_to_l3, resp_to_l3):
self.reqToDir = req_to_dir
self.respToDir = resp_to_dir
self.l3UnblockToDir = l3_unblock_to_dir
self.reqToL3 = req_to_l3
self.probeToL3 = probe_to_l3
self.respToL3 = resp_to_l3
class DirMem(RubyDirectoryMemory, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
phys_mem_size = AddrRange(options.mem_size).size()
mem_module_size = phys_mem_size / options.num_dirs
dir_size = MemorySize('0B')
dir_size.value = mem_module_size
self.size = dir_size
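# Rough example of the directory sizing above (illustrative): with
# --mem-size=512MB and --num-dirs=2, phys_mem_size is 512MB, mem_module_size
# is 256MB, so each RubyDirectoryMemory instance covers a 256MB slice of the
# physical address space.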
class DirCntrl(Directory_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.response_latency = 30
self.directory = DirMem()
self.directory.create(options, ruby_system, system)
self.L3CacheMemory = L3Cache()
self.L3CacheMemory.create(options, ruby_system, system)
self.l3_hit_latency = max(self.L3CacheMemory.dataAccessLatency,
self.L3CacheMemory.tagAccessLatency)
self.number_of_TBEs = options.num_tbes
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
req_to_l3, probe_to_l3, resp_to_l3):
self.reqToDir = req_to_dir
self.respToDir = resp_to_dir
self.l3UnblockToDir = l3_unblock_to_dir
self.reqToL3 = req_to_l3
self.probeToL3 = probe_to_l3
self.respToL3 = resp_to_l3
def define_options(parser):
parser.add_option("--num-subcaches", type="int", default=4)
parser.add_option("--l3-data-latency", type="int", default=20)
parser.add_option("--l3-tag-latency", type="int", default=15)
parser.add_option("--cpu-to-dir-latency", type="int", default=15)
parser.add_option("--gpu-to-dir-latency", type="int", default=160)
parser.add_option("--no-resource-stalls", action="store_false",
default=True)
parser.add_option("--num-tbes", type="int", default=256)
parser.add_option("--l2-latency", type="int", default=50) # load to use
parser.add_option("--num-tccs", type="int", default=1,
help="number of TCC directories and banks in the GPU")
parser.add_option("--TCP_latency", type="int", default=4,
help="TCP latency")
parser.add_option("--TCC_latency", type="int", default=16,
help="TCC latency")
parser.add_option("--tcc-size", type='string', default='256kB',
help="agregate tcc size")
parser.add_option("--tcp-size", type='string', default='16kB',
help="tcp size")
parser.add_option("--tcc-dir-factor", type='int', default=4,
help="TCCdir size = factor *(TCPs + TCC)")
def create_system(options, full_system, system, dma_devices, ruby_system):
if buildEnv['PROTOCOL'] != 'GPU_RfO':
panic("This script requires the GPU_RfO protocol to be built.")
cpu_sequencers = []
#
# The ruby network creation expects the list of nodes in the system to be
# consistent with the NetDest list. Therefore the l1 controller nodes
# must be listed before the directory nodes and directory nodes before
# dma nodes, etc.
#
cp_cntrl_nodes = []
tcp_cntrl_nodes = []
sqc_cntrl_nodes = []
tcc_cntrl_nodes = []
tccdir_cntrl_nodes = []
dir_cntrl_nodes = []
l3_cntrl_nodes = []
#
# Must create the individual controllers before the network to ensure the
# controller constructors are called before the network constructor
#
TCC_bits = int(math.log(options.num_tccs, 2))
# This is the base crossbar that connects the L3s, Dirs, and cpu/gpu
# Clusters
mainCluster = Cluster(extBW = 512, intBW = 512) # 1 TB/s
for i in xrange(options.num_dirs):
dir_cntrl = DirCntrl(TCC_select_num_bits = TCC_bits)
dir_cntrl.create(options, ruby_system, system)
dir_cntrl.number_of_TBEs = 2560 * options.num_compute_units
#Enough TBEs for all TCP TBEs
# Connect the Directory controller to the ruby network
dir_cntrl.requestFromCores = MessageBuffer(ordered = True)
dir_cntrl.requestFromCores.slave = ruby_system.network.master
dir_cntrl.responseFromCores = MessageBuffer()
dir_cntrl.responseFromCores.slave = ruby_system.network.master
dir_cntrl.unblockFromCores = MessageBuffer()
dir_cntrl.unblockFromCores.slave = ruby_system.network.master
dir_cntrl.probeToCore = MessageBuffer()
dir_cntrl.probeToCore.master = ruby_system.network.slave
dir_cntrl.responseToCore = MessageBuffer()
dir_cntrl.responseToCore.master = ruby_system.network.slave
dir_cntrl.triggerQueue = MessageBuffer(ordered = True)
dir_cntrl.L3triggerQueue = MessageBuffer(ordered = True)
dir_cntrl.responseFromMemory = MessageBuffer()
exec("system.dir_cntrl%d = dir_cntrl" % i)
dir_cntrl_nodes.append(dir_cntrl)
mainCluster.add(dir_cntrl)
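# Wiring convention used throughout this file (descriptive note): buffers
# named *FromCores/*FromTCP carry traffic into a controller, so their slave
# side binds to the network's master port, while *ToCore/*ToNB buffers send
# traffic out and bind their master side to the network's slave port. Every
# controller below follows the same pattern.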
# For an odd number of CPUs, still create the right number of controllers
cpuCluster = Cluster(extBW = 512, intBW = 512) # 1 TB/s
for i in xrange((options.num_cpus + 1) / 2):
cp_cntrl = CPCntrl()
cp_cntrl.create(options, ruby_system, system)
exec("system.cp_cntrl%d = cp_cntrl" % i)
#
# Add controllers and sequencers to the appropriate lists
#
cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1])
# Connect the CP controllers and the network
cp_cntrl.requestFromCore = MessageBuffer()
cp_cntrl.requestFromCore.master = ruby_system.network.slave
cp_cntrl.responseFromCore = MessageBuffer()
cp_cntrl.responseFromCore.master = ruby_system.network.slave
cp_cntrl.unblockFromCore = MessageBuffer()
cp_cntrl.unblockFromCore.master = ruby_system.network.slave
cp_cntrl.probeToCore = MessageBuffer()
cp_cntrl.probeToCore.slave = ruby_system.network.master
cp_cntrl.responseToCore = MessageBuffer()
cp_cntrl.responseToCore.slave = ruby_system.network.master
cp_cntrl.mandatoryQueue = MessageBuffer()
cp_cntrl.triggerQueue = MessageBuffer(ordered = True)
cpuCluster.add(cp_cntrl)
gpuCluster = Cluster(extBW = 512, intBW = 512) # 1 TB/s
for i in xrange(options.num_compute_units):
tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
number_of_TBEs = 2560) # max outstanding requests
tcp_cntrl.create(options, ruby_system, system)
exec("system.tcp_cntrl%d = tcp_cntrl" % i)
#
# Add controllers and sequencers to the appropriate lists
#
cpu_sequencers.append(tcp_cntrl.coalescer)
tcp_cntrl_nodes.append(tcp_cntrl)
# Connect the TCP controller to the ruby network
tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True)
tcp_cntrl.requestFromTCP.master = ruby_system.network.slave
tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True)
tcp_cntrl.responseFromTCP.master = ruby_system.network.slave
tcp_cntrl.unblockFromCore = MessageBuffer(ordered = True)
tcp_cntrl.unblockFromCore.master = ruby_system.network.slave
tcp_cntrl.probeToTCP = MessageBuffer(ordered = True)
tcp_cntrl.probeToTCP.slave = ruby_system.network.master
tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
tcp_cntrl.responseToTCP.slave = ruby_system.network.master
tcp_cntrl.mandatoryQueue = MessageBuffer()
gpuCluster.add(tcp_cntrl)
for i in xrange(options.num_sqc):
sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
sqc_cntrl.create(options, ruby_system, system)
exec("system.sqc_cntrl%d = sqc_cntrl" % i)
#
# Add controllers and sequencers to the appropriate lists
#
cpu_sequencers.append(sqc_cntrl.sequencer)
# Connect the SQC controller to the ruby network
sqc_cntrl.requestFromSQC = MessageBuffer(ordered = True)
sqc_cntrl.requestFromSQC.master = ruby_system.network.slave
sqc_cntrl.responseFromSQC = MessageBuffer(ordered = True)
sqc_cntrl.responseFromSQC.master = ruby_system.network.slave
sqc_cntrl.unblockFromCore = MessageBuffer(ordered = True)
sqc_cntrl.unblockFromCore.master = ruby_system.network.slave
sqc_cntrl.probeToSQC = MessageBuffer(ordered = True)
sqc_cntrl.probeToSQC.slave = ruby_system.network.master
sqc_cntrl.responseToSQC = MessageBuffer(ordered = True)
sqc_cntrl.responseToSQC.slave = ruby_system.network.master
sqc_cntrl.mandatoryQueue = MessageBuffer()
# SQC also in GPU cluster
gpuCluster.add(sqc_cntrl)
for i in xrange(options.numCPs):
tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
number_of_TBEs = 2560) # max outstanding requests
tcp_cntrl.createCP(options, ruby_system, system)
exec("system.tcp_cntrl%d = tcp_cntrl" % (options.num_compute_units + i))
#
# Add controllers and sequencers to the appropriate lists
#
cpu_sequencers.append(tcp_cntrl.sequencer)
tcp_cntrl_nodes.append(tcp_cntrl)
# Connect the TCP controller to the ruby network
tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True)
tcp_cntrl.requestFromTCP.master = ruby_system.network.slave
tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True)
tcp_cntrl.responseFromTCP.master = ruby_system.network.slave
tcp_cntrl.unblockFromCore = MessageBuffer(ordered = True)
tcp_cntrl.unblockFromCore.master = ruby_system.network.slave
tcp_cntrl.probeToTCP = MessageBuffer(ordered = True)
tcp_cntrl.probeToTCP.slave = ruby_system.network.master
tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
tcp_cntrl.responseToTCP.slave = ruby_system.network.master
tcp_cntrl.mandatoryQueue = MessageBuffer()
gpuCluster.add(tcp_cntrl)
sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
sqc_cntrl.createCP(options, ruby_system, system)
exec("system.sqc_cntrl%d = sqc_cntrl" % (options.num_compute_units + i))
#
# Add controllers and sequencers to the appropriate lists
#
cpu_sequencers.append(sqc_cntrl.sequencer)
# Connect the SQC controller to the ruby network
sqc_cntrl.requestFromSQC = MessageBuffer(ordered = True)
sqc_cntrl.requestFromSQC.master = ruby_system.network.slave
sqc_cntrl.responseFromSQC = MessageBuffer(ordered = True)
sqc_cntrl.responseFromSQC.master = ruby_system.network.slave
sqc_cntrl.unblockFromCore = MessageBuffer(ordered = True)
sqc_cntrl.unblockFromCore.master = ruby_system.network.slave
sqc_cntrl.probeToSQC = MessageBuffer(ordered = True)
sqc_cntrl.probeToSQC.slave = ruby_system.network.master
sqc_cntrl.responseToSQC = MessageBuffer(ordered = True)
sqc_cntrl.responseToSQC.slave = ruby_system.network.master
sqc_cntrl.mandatoryQueue = MessageBuffer()
# SQC also in GPU cluster
gpuCluster.add(sqc_cntrl)
for i in xrange(options.num_tccs):
tcc_cntrl = TCCCntrl(TCC_select_num_bits = TCC_bits,
number_of_TBEs = options.num_compute_units * 2560)
#Enough TBEs for all TCP TBEs
tcc_cntrl.create(options, ruby_system, system)
tcc_cntrl_nodes.append(tcc_cntrl)
tccdir_cntrl = TCCDirCntrl(TCC_select_num_bits = TCC_bits,
number_of_TBEs = options.num_compute_units * 2560)
#Enough TBEs for all TCP TBEs
tccdir_cntrl.create(options, ruby_system, system)
tccdir_cntrl_nodes.append(tccdir_cntrl)
exec("system.tcc_cntrl%d = tcc_cntrl" % i)
exec("system.tccdir_cntrl%d = tccdir_cntrl" % i)
# connect all of the wire buffers between L3 and dirs up
req_to_tccdir = RubyWireBuffer()
resp_to_tccdir = RubyWireBuffer()
tcc_unblock_to_tccdir = RubyWireBuffer()
req_to_tcc = RubyWireBuffer()
probe_to_tcc = RubyWireBuffer()
resp_to_tcc = RubyWireBuffer()
tcc_cntrl.connectWireBuffers(req_to_tccdir, resp_to_tccdir,
tcc_unblock_to_tccdir, req_to_tcc,
probe_to_tcc, resp_to_tcc)
tccdir_cntrl.connectWireBuffers(req_to_tccdir, resp_to_tccdir,
tcc_unblock_to_tccdir, req_to_tcc,
probe_to_tcc, resp_to_tcc)
# Connect the TCC controller to the ruby network
tcc_cntrl.responseFromTCC = MessageBuffer(ordered = True)
tcc_cntrl.responseFromTCC.master = ruby_system.network.slave
tcc_cntrl.responseToTCC = MessageBuffer(ordered = True)
tcc_cntrl.responseToTCC.slave = ruby_system.network.master
# Connect the TCC Dir controller to the ruby network
tccdir_cntrl.requestFromTCP = MessageBuffer(ordered = True)
tccdir_cntrl.requestFromTCP.slave = ruby_system.network.master
tccdir_cntrl.responseFromTCP = MessageBuffer(ordered = True)
tccdir_cntrl.responseFromTCP.slave = ruby_system.network.master
tccdir_cntrl.unblockFromTCP = MessageBuffer(ordered = True)
tccdir_cntrl.unblockFromTCP.slave = ruby_system.network.master
tccdir_cntrl.probeToCore = MessageBuffer(ordered = True)
tccdir_cntrl.probeToCore.master = ruby_system.network.slave
tccdir_cntrl.responseToCore = MessageBuffer(ordered = True)
tccdir_cntrl.responseToCore.master = ruby_system.network.slave
tccdir_cntrl.probeFromNB = MessageBuffer()
tccdir_cntrl.probeFromNB.slave = ruby_system.network.master
tccdir_cntrl.responseFromNB = MessageBuffer()
tccdir_cntrl.responseFromNB.slave = ruby_system.network.master
tccdir_cntrl.requestToNB = MessageBuffer()
tccdir_cntrl.requestToNB.master = ruby_system.network.slave
tccdir_cntrl.responseToNB = MessageBuffer()
tccdir_cntrl.responseToNB.master = ruby_system.network.slave
tccdir_cntrl.unblockToNB = MessageBuffer()
tccdir_cntrl.unblockToNB.master = ruby_system.network.slave
tccdir_cntrl.triggerQueue = MessageBuffer(ordered = True)
# TCC cntrls added to the GPU cluster
gpuCluster.add(tcc_cntrl)
gpuCluster.add(tccdir_cntrl)
# Assuming no DMA devices
assert(len(dma_devices) == 0)
# Add cpu/gpu clusters to main cluster
mainCluster.add(cpuCluster)
mainCluster.add(gpuCluster)
ruby_system.network.number_of_virtual_networks = 10
return (cpu_sequencers, dir_cntrl_nodes, mainCluster)

674 configs/ruby/GPU_VIPER.py Normal file
View file

@ -0,0 +1,674 @@
#
# Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Lisa Hsu
#
import math
import m5
from m5.objects import *
from m5.defines import buildEnv
from Ruby import create_topology
from Ruby import send_evicts
from Cluster import Cluster
from Crossbar import Crossbar
class CntrlBase:
_seqs = 0
@classmethod
def seqCount(cls):
# Use SeqCount not class since we need global count
CntrlBase._seqs += 1
return CntrlBase._seqs - 1
_cntrls = 0
@classmethod
def cntrlCount(cls):
# Use CntlCount not class since we need global count
CntrlBase._cntrls += 1
return CntrlBase._cntrls - 1
_version = 0
@classmethod
def versionCount(cls):
cls._version += 1 # Use count for this particular type
return cls._version - 1
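# Illustrative behaviour of the counters above (not executed here):
# versionCount() increments a per-subclass counter, so two TCPCntrl instances
# receive versions 0 and 1 while the first SQCCntrl still starts at 0;
# seqCount() and cntrlCount() bump CntrlBase-wide counters, keeping sequencer
# versions unique across every controller type in the system.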
class L1Cache(RubyCache):
resourceStalls = False
dataArrayBanks = 2
tagArrayBanks = 2
dataAccessLatency = 1
tagAccessLatency = 1
def create(self, size, assoc, options):
self.size = MemorySize(size)
self.assoc = assoc
self.replacement_policy = PseudoLRUReplacementPolicy()
class L2Cache(RubyCache):
resourceStalls = False
assoc = 16
dataArrayBanks = 16
tagArrayBanks = 16
def create(self, size, assoc, options):
self.size = MemorySize(size)
self.assoc = assoc
self.replacement_policy = PseudoLRUReplacementPolicy()
class CPCntrl(CorePair_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.L1Icache = L1Cache()
self.L1Icache.create(options.l1i_size, options.l1i_assoc, options)
self.L1D0cache = L1Cache()
self.L1D0cache.create(options.l1d_size, options.l1d_assoc, options)
self.L1D1cache = L1Cache()
self.L1D1cache.create(options.l1d_size, options.l1d_assoc, options)
self.L2cache = L2Cache()
self.L2cache.create(options.l2_size, options.l2_assoc, options)
self.sequencer = RubySequencer()
self.sequencer.version = self.seqCount()
self.sequencer.icache = self.L1Icache
self.sequencer.dcache = self.L1D0cache
self.sequencer.ruby_system = ruby_system
self.sequencer.coreid = 0
self.sequencer.is_cpu_sequencer = True
self.sequencer1 = RubySequencer()
self.sequencer1.version = self.seqCount()
self.sequencer1.icache = self.L1Icache
self.sequencer1.dcache = self.L1D1cache
self.sequencer1.ruby_system = ruby_system
self.sequencer1.coreid = 1
self.sequencer1.is_cpu_sequencer = True
self.issue_latency = options.cpu_to_dir_latency
self.send_evictions = send_evicts(options)
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
class TCPCache(RubyCache):
size = "16kB"
assoc = 16
dataArrayBanks = 16 #number of data banks
tagArrayBanks = 16 #number of tag banks
dataAccessLatency = 4
tagAccessLatency = 1
def create(self, options):
self.size = MemorySize(options.tcp_size)
self.assoc = options.tcp_assoc
self.resourceStalls = options.no_tcc_resource_stalls
self.replacement_policy = PseudoLRUReplacementPolicy()
class TCPCntrl(TCP_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.L1cache = TCPCache(tagAccessLatency = options.TCP_latency,
dataAccessLatency = options.TCP_latency)
self.L1cache.resourceStalls = options.no_resource_stalls
self.L1cache.create(options)
self.issue_latency = 1
self.coalescer = VIPERCoalescer()
self.coalescer.version = self.seqCount()
self.coalescer.icache = self.L1cache
self.coalescer.dcache = self.L1cache
self.coalescer.ruby_system = ruby_system
self.coalescer.support_inst_reqs = False
self.coalescer.is_cpu_sequencer = False
self.sequencer = RubySequencer()
self.sequencer.version = self.seqCount()
self.sequencer.icache = self.L1cache
self.sequencer.dcache = self.L1cache
self.sequencer.ruby_system = ruby_system
self.sequencer.is_cpu_sequencer = True
self.use_seq_not_coal = False
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
def createCP(self, options, ruby_system, system):
self.version = self.versionCount()
self.L1cache = TCPCache(tagAccessLatency = options.TCP_latency,
dataAccessLatency = options.TCP_latency)
self.L1cache.resourceStalls = options.no_resource_stalls
self.L1cache.create(options)
self.issue_latency = 1
self.coalescer = VIPERCoalescer()
self.coalescer.version = self.seqCount()
self.coalescer.icache = self.L1cache
self.coalescer.dcache = self.L1cache
self.coalescer.ruby_system = ruby_system
self.coalescer.support_inst_reqs = False
self.coalescer.is_cpu_sequencer = False
self.sequencer = RubySequencer()
self.sequencer.version = self.seqCount()
self.sequencer.icache = self.L1cache
self.sequencer.dcache = self.L1cache
self.sequencer.ruby_system = ruby_system
self.sequencer.is_cpu_sequencer = True
self.use_seq_not_coal = True
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
class SQCCache(RubyCache):
dataArrayBanks = 8
tagArrayBanks = 8
dataAccessLatency = 1
tagAccessLatency = 1
def create(self, options):
self.size = MemorySize(options.sqc_size)
self.assoc = options.sqc_assoc
self.replacement_policy = PseudoLRUReplacementPolicy()
class SQCCntrl(SQC_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.L1cache = SQCCache()
self.L1cache.create(options)
self.L1cache.resourceStalls = options.no_resource_stalls
self.sequencer = RubySequencer()
self.sequencer.version = self.seqCount()
self.sequencer.icache = self.L1cache
self.sequencer.dcache = self.L1cache
self.sequencer.ruby_system = ruby_system
self.sequencer.support_data_reqs = False
self.sequencer.is_cpu_sequencer = False
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
class TCC(RubyCache):
size = MemorySize("256kB")
assoc = 16
dataAccessLatency = 8
tagAccessLatency = 2
resourceStalls = True
def create(self, options):
self.assoc = options.tcc_assoc
if hasattr(options, 'bw_scalor') and options.bw_scalor > 0:
s = options.num_compute_units
tcc_size = s * 128
tcc_size = str(tcc_size)+'kB'
self.size = MemorySize(tcc_size)
self.dataArrayBanks = 64
self.tagArrayBanks = 64
else:
self.size = MemorySize(options.tcc_size)
self.dataArrayBanks = 256 / options.num_tccs #number of data banks
self.tagArrayBanks = 256 / options.num_tccs #number of tag banks
self.size.value = self.size.value / options.num_tccs
if ((self.size.value / long(self.assoc)) < 128):
self.size.value = long(128 * self.assoc)
self.start_index_bit = math.log(options.cacheline_size, 2) + \
math.log(options.num_tccs, 2)
self.replacement_policy = PseudoLRUReplacementPolicy()
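# Worked example of the sizing above (illustrative): with bw_scalor > 0 and
# --num-compute-units=64, the aggregate TCC becomes 64 * 128kB = 8192kB over
# 64 data/tag banks; without bw_scalor the --tcc-size default of 256kB is
# divided across --num-tccs banks, matching the GPU_RfO script above.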
class TCCCntrl(TCC_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.L2cache = TCC()
self.L2cache.create(options)
self.L2cache.resourceStalls = options.no_tcc_resource_stalls
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
class L3Cache(RubyCache):
dataArrayBanks = 16
tagArrayBanks = 16
def create(self, options, ruby_system, system):
self.size = MemorySize(options.l3_size)
self.size.value /= options.num_dirs
self.assoc = options.l3_assoc
self.dataArrayBanks /= options.num_dirs
self.tagArrayBanks /= options.num_dirs
self.dataArrayBanks /= options.num_dirs
self.tagArrayBanks /= options.num_dirs
self.dataAccessLatency = options.l3_data_latency
self.tagAccessLatency = options.l3_tag_latency
self.resourceStalls = False
self.replacement_policy = PseudoLRUReplacementPolicy()
class L3Cntrl(L3Cache_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.L3cache = L3Cache()
self.L3cache.create(options, ruby_system, system)
self.l3_response_latency = max(self.L3cache.dataAccessLatency, self.L3cache.tagAccessLatency)
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
req_to_l3, probe_to_l3, resp_to_l3):
self.reqToDir = req_to_dir
self.respToDir = resp_to_dir
self.l3UnblockToDir = l3_unblock_to_dir
self.reqToL3 = req_to_l3
self.probeToL3 = probe_to_l3
self.respToL3 = resp_to_l3
class DirMem(RubyDirectoryMemory, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
phys_mem_size = AddrRange(options.mem_size).size()
mem_module_size = phys_mem_size / options.num_dirs
dir_size = MemorySize('0B')
dir_size.value = mem_module_size
self.size = dir_size
class DirCntrl(Directory_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.response_latency = 30
self.directory = DirMem()
self.directory.create(options, ruby_system, system)
self.L3CacheMemory = L3Cache()
self.L3CacheMemory.create(options, ruby_system, system)
self.l3_hit_latency = max(self.L3CacheMemory.dataAccessLatency,
self.L3CacheMemory.tagAccessLatency)
self.number_of_TBEs = options.num_tbes
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
req_to_l3, probe_to_l3, resp_to_l3):
self.reqToDir = req_to_dir
self.respToDir = resp_to_dir
self.l3UnblockToDir = l3_unblock_to_dir
self.reqToL3 = req_to_l3
self.probeToL3 = probe_to_l3
self.respToL3 = resp_to_l3
def define_options(parser):
parser.add_option("--num-subcaches", type = "int", default = 4)
parser.add_option("--l3-data-latency", type = "int", default = 20)
parser.add_option("--l3-tag-latency", type = "int", default = 15)
parser.add_option("--cpu-to-dir-latency", type = "int", default = 120)
parser.add_option("--gpu-to-dir-latency", type = "int", default = 120)
parser.add_option("--no-resource-stalls", action = "store_false",
default = True)
parser.add_option("--no-tcc-resource-stalls", action = "store_false",
default = True)
parser.add_option("--use-L3-on-WT", action = "store_true", default = False)
parser.add_option("--num-tbes", type = "int", default = 256)
parser.add_option("--l2-latency", type = "int", default = 50) # load to use
parser.add_option("--num-tccs", type = "int", default = 1,
help = "number of TCC banks in the GPU")
parser.add_option("--sqc-size", type = 'string', default = '32kB',
help = "SQC cache size")
parser.add_option("--sqc-assoc", type = 'int', default = 8,
help = "SQC cache assoc")
parser.add_option("--WB_L1", action = "store_true", default = False,
help = "writeback L1")
parser.add_option("--WB_L2", action = "store_true", default = False,
help = "writeback L2")
parser.add_option("--TCP_latency", type = "int", default = 4,
help = "TCP latency")
parser.add_option("--TCC_latency", type = "int", default = 16,
help = "TCC latency")
parser.add_option("--tcc-size", type = 'string', default = '256kB',
help = "agregate tcc size")
parser.add_option("--tcc-assoc", type = 'int', default = 16,
help = "tcc assoc")
parser.add_option("--tcp-size", type = 'string', default = '16kB',
help = "tcp size")
parser.add_option("--tcp-assoc", type = 'int', default = 16,
help = "tcp assoc")
parser.add_option("--noL1", action = "store_true", default = False,
help = "bypassL1")
def create_system(options, full_system, system, dma_devices, ruby_system):
if buildEnv['PROTOCOL'] != 'GPU_VIPER':
panic("This script requires the GPU_VIPER protocol to be built.")
cpu_sequencers = []
#
# The ruby network creation expects the list of nodes in the system to be
# consistent with the NetDest list. Therefore the l1 controller nodes
# must be listed before the directory nodes and directory nodes before
# dma nodes, etc.
#
cp_cntrl_nodes = []
tcp_cntrl_nodes = []
sqc_cntrl_nodes = []
tcc_cntrl_nodes = []
dir_cntrl_nodes = []
l3_cntrl_nodes = []
#
# Must create the individual controllers before the network to ensure the
# controller constructors are called before the network constructor
#
# For an odd number of CPUs, still create the right number of controllers
TCC_bits = int(math.log(options.num_tccs, 2))
# This is the base crossbar that connects the L3s, Dirs, and cpu/gpu
# Clusters
crossbar_bw = None
mainCluster = None
if hasattr(options, 'bw_scalor') and options.bw_scalor > 0:
#Assuming a 2GHz clock
crossbar_bw = 16 * options.num_compute_units * options.bw_scalor
mainCluster = Cluster(intBW=crossbar_bw)
else:
mainCluster = Cluster(intBW=8) # 16 GB/s
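# Bandwidth arithmetic behind these numbers (illustrative, assuming the 2GHz
# clock noted above): intBW/extBW appear to be bytes per cycle, so intBW=8 is
# about 16 GB/s; with bw_scalor=1 and 64 compute units, crossbar_bw =
# 16 * 64 = 1024 bytes/cycle, roughly 2 TB/s of crossbar bandwidth.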
for i in xrange(options.num_dirs):
dir_cntrl = DirCntrl(noTCCdir = True, TCC_select_num_bits = TCC_bits)
dir_cntrl.create(options, ruby_system, system)
dir_cntrl.number_of_TBEs = options.num_tbes
dir_cntrl.useL3OnWT = options.use_L3_on_WT
# the number_of_TBEs is inclusive of TBEs below
# Connect the Directory controller to the ruby network
dir_cntrl.requestFromCores = MessageBuffer(ordered = True)
dir_cntrl.requestFromCores.slave = ruby_system.network.master
dir_cntrl.responseFromCores = MessageBuffer()
dir_cntrl.responseFromCores.slave = ruby_system.network.master
dir_cntrl.unblockFromCores = MessageBuffer()
dir_cntrl.unblockFromCores.slave = ruby_system.network.master
dir_cntrl.probeToCore = MessageBuffer()
dir_cntrl.probeToCore.master = ruby_system.network.slave
dir_cntrl.responseToCore = MessageBuffer()
dir_cntrl.responseToCore.master = ruby_system.network.slave
dir_cntrl.triggerQueue = MessageBuffer(ordered = True)
dir_cntrl.L3triggerQueue = MessageBuffer(ordered = True)
dir_cntrl.responseFromMemory = MessageBuffer()
exec("ruby_system.dir_cntrl%d = dir_cntrl" % i)
dir_cntrl_nodes.append(dir_cntrl)
mainCluster.add(dir_cntrl)
cpuCluster = None
if hasattr(options, 'bw_scalor') and options.bw_scalor > 0:
cpuCluster = Cluster(extBW = crossbar_bw, intBW = crossbar_bw)
else:
cpuCluster = Cluster(extBW = 8, intBW = 8) # 16 GB/s
for i in xrange((options.num_cpus + 1) / 2):
cp_cntrl = CPCntrl()
cp_cntrl.create(options, ruby_system, system)
exec("ruby_system.cp_cntrl%d = cp_cntrl" % i)
#
# Add controllers and sequencers to the appropriate lists
#
cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1])
# Connect the CP controllers and the network
cp_cntrl.requestFromCore = MessageBuffer()
cp_cntrl.requestFromCore.master = ruby_system.network.slave
cp_cntrl.responseFromCore = MessageBuffer()
cp_cntrl.responseFromCore.master = ruby_system.network.slave
cp_cntrl.unblockFromCore = MessageBuffer()
cp_cntrl.unblockFromCore.master = ruby_system.network.slave
cp_cntrl.probeToCore = MessageBuffer()
cp_cntrl.probeToCore.slave = ruby_system.network.master
cp_cntrl.responseToCore = MessageBuffer()
cp_cntrl.responseToCore.slave = ruby_system.network.master
cp_cntrl.mandatoryQueue = MessageBuffer()
cp_cntrl.triggerQueue = MessageBuffer(ordered = True)
cpuCluster.add(cp_cntrl)
gpuCluster = None
if hasattr(options, 'bw_scalor') and options.bw_scalor > 0:
gpuCluster = Cluster(extBW = crossbar_bw, intBW = crossbar_bw)
else:
gpuCluster = Cluster(extBW = 8, intBW = 8) # 16 GB/s
for i in xrange(options.num_compute_units):
tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
issue_latency = 1,
number_of_TBEs = 2560)
# TBEs set to max outstanding requests
tcp_cntrl.create(options, ruby_system, system)
tcp_cntrl.WB = options.WB_L1
tcp_cntrl.disableL1 = options.noL1
tcp_cntrl.L1cache.tagAccessLatency = options.TCP_latency
tcp_cntrl.L1cache.dataAccessLatency = options.TCP_latency
exec("ruby_system.tcp_cntrl%d = tcp_cntrl" % i)
#
# Add controllers and sequencers to the appropriate lists
#
cpu_sequencers.append(tcp_cntrl.coalescer)
tcp_cntrl_nodes.append(tcp_cntrl)
# Connect the TCP controller to the ruby network
tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True)
tcp_cntrl.requestFromTCP.master = ruby_system.network.slave
tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True)
tcp_cntrl.responseFromTCP.master = ruby_system.network.slave
tcp_cntrl.unblockFromCore = MessageBuffer()
tcp_cntrl.unblockFromCore.master = ruby_system.network.slave
tcp_cntrl.probeToTCP = MessageBuffer(ordered = True)
tcp_cntrl.probeToTCP.slave = ruby_system.network.master
tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
tcp_cntrl.responseToTCP.slave = ruby_system.network.master
tcp_cntrl.mandatoryQueue = MessageBuffer()
gpuCluster.add(tcp_cntrl)
for i in xrange(options.num_sqc):
sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
sqc_cntrl.create(options, ruby_system, system)
exec("ruby_system.sqc_cntrl%d = sqc_cntrl" % i)
#
# Add controllers and sequencers to the appropriate lists
#
cpu_sequencers.append(sqc_cntrl.sequencer)
# Connect the SQC controller to the ruby network
sqc_cntrl.requestFromSQC = MessageBuffer(ordered = True)
sqc_cntrl.requestFromSQC.master = ruby_system.network.slave
sqc_cntrl.probeToSQC = MessageBuffer(ordered = True)
sqc_cntrl.probeToSQC.slave = ruby_system.network.master
sqc_cntrl.responseToSQC = MessageBuffer(ordered = True)
sqc_cntrl.responseToSQC.slave = ruby_system.network.master
sqc_cntrl.mandatoryQueue = MessageBuffer()
# SQC also in GPU cluster
gpuCluster.add(sqc_cntrl)
for i in xrange(options.numCPs):
tcp_ID = options.num_compute_units + i
sqc_ID = options.num_sqc + i
tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
issue_latency = 1,
number_of_TBEs = 2560)
# TBEs set to max outstanding requests
tcp_cntrl.createCP(options, ruby_system, system)
tcp_cntrl.WB = options.WB_L1
tcp_cntrl.disableL1 = options.noL1
tcp_cntrl.L1cache.tagAccessLatency = options.TCP_latency
tcp_cntrl.L1cache.dataAccessLatency = options.TCP_latency
exec("ruby_system.tcp_cntrl%d = tcp_cntrl" % tcp_ID)
#
# Add controllers and sequencers to the appropriate lists
#
cpu_sequencers.append(tcp_cntrl.sequencer)
tcp_cntrl_nodes.append(tcp_cntrl)
# Connect the CP (TCP) controllers to the ruby network
tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True)
tcp_cntrl.requestFromTCP.master = ruby_system.network.slave
tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True)
tcp_cntrl.responseFromTCP.master = ruby_system.network.slave
tcp_cntrl.unblockFromCore = MessageBuffer(ordered = True)
tcp_cntrl.unblockFromCore.master = ruby_system.network.slave
tcp_cntrl.probeToTCP = MessageBuffer(ordered = True)
tcp_cntrl.probeToTCP.slave = ruby_system.network.master
tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
tcp_cntrl.responseToTCP.slave = ruby_system.network.master
tcp_cntrl.mandatoryQueue = MessageBuffer()
gpuCluster.add(tcp_cntrl)
sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
sqc_cntrl.create(options, ruby_system, system)
exec("ruby_system.sqc_cntrl%d = sqc_cntrl" % sqc_ID)
#
# Add controllers and sequencers to the appropriate lists
#
cpu_sequencers.append(sqc_cntrl.sequencer)
# SQC also in GPU cluster
gpuCluster.add(sqc_cntrl)
for i in xrange(options.num_tccs):
tcc_cntrl = TCCCntrl(l2_response_latency = options.TCC_latency)
tcc_cntrl.create(options, ruby_system, system)
tcc_cntrl.l2_request_latency = options.gpu_to_dir_latency
tcc_cntrl.l2_response_latency = options.TCC_latency
tcc_cntrl_nodes.append(tcc_cntrl)
tcc_cntrl.WB = options.WB_L2
tcc_cntrl.number_of_TBEs = 2560 * options.num_compute_units
# the number_of_TBEs is inclusive of TBEs below
# Connect the TCC controllers to the ruby network
tcc_cntrl.requestFromTCP = MessageBuffer(ordered = True)
tcc_cntrl.requestFromTCP.slave = ruby_system.network.master
tcc_cntrl.responseToCore = MessageBuffer(ordered = True)
tcc_cntrl.responseToCore.master = ruby_system.network.slave
tcc_cntrl.probeFromNB = MessageBuffer()
tcc_cntrl.probeFromNB.slave = ruby_system.network.master
tcc_cntrl.responseFromNB = MessageBuffer()
tcc_cntrl.responseFromNB.slave = ruby_system.network.master
tcc_cntrl.requestToNB = MessageBuffer(ordered = True)
tcc_cntrl.requestToNB.master = ruby_system.network.slave
tcc_cntrl.responseToNB = MessageBuffer()
tcc_cntrl.responseToNB.master = ruby_system.network.slave
tcc_cntrl.unblockToNB = MessageBuffer()
tcc_cntrl.unblockToNB.master = ruby_system.network.slave
tcc_cntrl.triggerQueue = MessageBuffer(ordered = True)
exec("ruby_system.tcc_cntrl%d = tcc_cntrl" % i)
# connect all of the wire buffers between L3 and dirs up
# TCC cntrls added to the GPU cluster
gpuCluster.add(tcc_cntrl)
# Assuming no DMA devices
assert(len(dma_devices) == 0)
# Add cpu/gpu clusters to main cluster
mainCluster.add(cpuCluster)
mainCluster.add(gpuCluster)
ruby_system.network.number_of_virtual_networks = 10
return (cpu_sequencers, dir_cntrl_nodes, mainCluster)

View file

@ -0,0 +1,588 @@
#
# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Sooraj Puthoor
#
import math
import m5
from m5.objects import *
from m5.defines import buildEnv
from Ruby import create_topology
from Ruby import send_evicts
from Cluster import Cluster
from Crossbar import Crossbar
class CntrlBase:
_seqs = 0
@classmethod
def seqCount(cls):
# Use SeqCount not class since we need global count
CntrlBase._seqs += 1
return CntrlBase._seqs - 1
_cntrls = 0
@classmethod
def cntrlCount(cls):
# Use CntlCount not class since we need global count
CntrlBase._cntrls += 1
return CntrlBase._cntrls - 1
_version = 0
@classmethod
def versionCount(cls):
cls._version += 1 # Use count for this particular type
return cls._version - 1
class L1Cache(RubyCache):
resourceStalls = False
dataArrayBanks = 2
tagArrayBanks = 2
dataAccessLatency = 1
tagAccessLatency = 1
def create(self, size, assoc, options):
self.size = MemorySize(size)
self.assoc = assoc
self.replacement_policy = PseudoLRUReplacementPolicy()
class L2Cache(RubyCache):
resourceStalls = False
assoc = 16
dataArrayBanks = 16
tagArrayBanks = 16
def create(self, size, assoc, options):
self.size = MemorySize(size)
self.assoc = assoc
self.replacement_policy = PseudoLRUReplacementPolicy()
class CPCntrl(CorePair_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.L1Icache = L1Cache()
self.L1Icache.create(options.l1i_size, options.l1i_assoc, options)
self.L1D0cache = L1Cache()
self.L1D0cache.create(options.l1d_size, options.l1d_assoc, options)
self.L1D1cache = L1Cache()
self.L1D1cache.create(options.l1d_size, options.l1d_assoc, options)
self.L2cache = L2Cache()
self.L2cache.create(options.l2_size, options.l2_assoc, options)
self.sequencer = RubySequencer()
self.sequencer.version = self.seqCount()
self.sequencer.icache = self.L1Icache
self.sequencer.dcache = self.L1D0cache
self.sequencer.ruby_system = ruby_system
self.sequencer.coreid = 0
self.sequencer.is_cpu_sequencer = True
self.sequencer1 = RubySequencer()
self.sequencer1.version = self.seqCount()
self.sequencer1.icache = self.L1Icache
self.sequencer1.dcache = self.L1D1cache
self.sequencer1.ruby_system = ruby_system
self.sequencer1.coreid = 1
self.sequencer1.is_cpu_sequencer = True
self.issue_latency = options.cpu_to_dir_latency
self.send_evictions = send_evicts(options)
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
class TCPCache(RubyCache):
size = "16kB"
assoc = 16
dataArrayBanks = 16
tagArrayBanks = 16
dataAccessLatency = 4
tagAccessLatency = 1
def create(self, options):
self.size = MemorySize(options.tcp_size)
self.dataArrayBanks = 16
self.tagArrayBanks = 16
self.dataAccessLatency = 4
self.tagAccessLatency = 1
self.resourceStalls = options.no_tcc_resource_stalls
self.replacement_policy = PseudoLRUReplacementPolicy()
class TCPCntrl(TCP_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.L1cache = TCPCache()
self.L1cache.create(options)
self.issue_latency = 1
self.coalescer = VIPERCoalescer()
self.coalescer.version = self.seqCount()
self.coalescer.icache = self.L1cache
self.coalescer.dcache = self.L1cache
self.coalescer.ruby_system = ruby_system
self.coalescer.support_inst_reqs = False
self.coalescer.is_cpu_sequencer = False
self.sequencer = RubySequencer()
self.sequencer.version = self.seqCount()
self.sequencer.icache = self.L1cache
self.sequencer.dcache = self.L1cache
self.sequencer.ruby_system = ruby_system
self.sequencer.is_cpu_sequencer = True
self.use_seq_not_coal = False
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
class SQCCache(RubyCache):
dataArrayBanks = 8
tagArrayBanks = 8
dataAccessLatency = 1
tagAccessLatency = 1
def create(self, options):
self.size = MemorySize(options.sqc_size)
self.assoc = options.sqc_assoc
self.replacement_policy = PseudoLRUReplacementPolicy()
class SQCCntrl(SQC_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.L1cache = SQCCache()
self.L1cache.create(options)
self.L1cache.resourceStalls = False
self.sequencer = RubySequencer()
self.sequencer.version = self.seqCount()
self.sequencer.icache = self.L1cache
self.sequencer.dcache = self.L1cache
self.sequencer.ruby_system = ruby_system
self.sequencer.support_data_reqs = False
self.sequencer.is_cpu_sequencer = False
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
class TCC(RubyCache):
size = MemorySize("256kB")
assoc = 16
dataAccessLatency = 8
tagAccessLatency = 2
resourceStalls = True
def create(self, options):
self.assoc = options.tcc_assoc
if hasattr(options, 'bw_scalor') and options.bw_scalor > 0:
s = options.num_compute_units
tcc_size = s * 128
tcc_size = str(tcc_size)+'kB'
self.size = MemorySize(tcc_size)
self.dataArrayBanks = 64
self.tagArrayBanks = 64
else:
self.size = MemorySize(options.tcc_size)
self.dataArrayBanks = 256 / options.num_tccs #number of data banks
self.tagArrayBanks = 256 / options.num_tccs #number of tag banks
self.size.value = self.size.value / options.num_tccs
if ((self.size.value / long(self.assoc)) < 128):
self.size.value = long(128 * self.assoc)
self.start_index_bit = math.log(options.cacheline_size, 2) + \
math.log(options.num_tccs, 2)
self.replacement_policy = PseudoLRUReplacementPolicy()
class TCCCntrl(TCC_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.L2cache = TCC()
self.L2cache.create(options)
self.ruby_system = ruby_system
self.L2cache.resourceStalls = options.no_tcc_resource_stalls
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
class L3Cache(RubyCache):
dataArrayBanks = 16
tagArrayBanks = 16
def create(self, options, ruby_system, system):
self.size = MemorySize(options.l3_size)
self.size.value /= options.num_dirs
self.assoc = options.l3_assoc
self.dataArrayBanks /= options.num_dirs
self.tagArrayBanks /= options.num_dirs
self.dataArrayBanks /= options.num_dirs
self.tagArrayBanks /= options.num_dirs
self.dataAccessLatency = options.l3_data_latency
self.tagAccessLatency = options.l3_tag_latency
self.resourceStalls = False
self.replacement_policy = PseudoLRUReplacementPolicy()
class ProbeFilter(RubyCache):
size = "4MB"
assoc = 16
dataArrayBanks = 256
tagArrayBanks = 256
def create(self, options, ruby_system, system):
self.block_size = "%dB" % (64 * options.blocks_per_region)
self.size = options.region_dir_entries * \
self.block_size * options.num_compute_units
self.assoc = 8
self.tagArrayBanks = 8
self.tagAccessLatency = options.dir_tag_latency
self.dataAccessLatency = 1
self.resourceStalls = options.no_resource_stalls
self.start_index_bit = 6 + int(math.log(options.blocks_per_region, 2))
self.replacement_policy = PseudoLRUReplacementPolicy()
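# Example of the probe-filter sizing above (illustrative, e.g. 4 compute
# units with the default options): block_size = 64B * blocks_per_region(1) =
# 64B, so size = 8192 region entries * 64B * 4 = 2MB, organised 8-way with
# the set index starting at bit 6 + log2(blocks_per_region) = 6.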
class L3Cntrl(L3Cache_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.L3cache = L3Cache()
self.L3cache.create(options, ruby_system, system)
self.l3_response_latency = \
max(self.L3cache.dataAccessLatency, self.L3cache.tagAccessLatency)
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
req_to_l3, probe_to_l3, resp_to_l3):
self.reqToDir = req_to_dir
self.respToDir = resp_to_dir
self.l3UnblockToDir = l3_unblock_to_dir
self.reqToL3 = req_to_l3
self.probeToL3 = probe_to_l3
self.respToL3 = resp_to_l3
class DirMem(RubyDirectoryMemory, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
phys_mem_size = AddrRange(options.mem_size).size()
mem_module_size = phys_mem_size / options.num_dirs
dir_size = MemorySize('0B')
dir_size.value = mem_module_size
self.size = dir_size
class DirCntrl(Directory_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.response_latency = 30
self.directory = DirMem()
self.directory.create(options, ruby_system, system)
self.L3CacheMemory = L3Cache()
self.L3CacheMemory.create(options, ruby_system, system)
self.ProbeFilterMemory = ProbeFilter()
self.ProbeFilterMemory.create(options, ruby_system, system)
self.l3_hit_latency = \
max(self.L3CacheMemory.dataAccessLatency,
self.L3CacheMemory.tagAccessLatency)
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
req_to_l3, probe_to_l3, resp_to_l3):
self.reqToDir = req_to_dir
self.respToDir = resp_to_dir
self.l3UnblockToDir = l3_unblock_to_dir
self.reqToL3 = req_to_l3
self.probeToL3 = probe_to_l3
self.respToL3 = resp_to_l3
def define_options(parser):
parser.add_option("--num-subcaches", type = "int", default = 4)
parser.add_option("--l3-data-latency", type = "int", default = 20)
parser.add_option("--l3-tag-latency", type = "int", default = 15)
parser.add_option("--cpu-to-dir-latency", type = "int", default = 120)
parser.add_option("--gpu-to-dir-latency", type = "int", default = 120)
parser.add_option("--no-resource-stalls", action = "store_false",
default = True)
parser.add_option("--no-tcc-resource-stalls", action = "store_false",
default = True)
parser.add_option("--num-tbes", type = "int", default = 2560)
parser.add_option("--l2-latency", type = "int", default = 50) # load to use
parser.add_option("--num-tccs", type = "int", default = 1,
help = "number of TCC banks in the GPU")
parser.add_option("--sqc-size", type = 'string', default = '32kB',
help = "SQC cache size")
parser.add_option("--sqc-assoc", type = 'int', default = 8,
help = "SQC cache assoc")
parser.add_option("--region-dir-entries", type = "int", default = 8192)
parser.add_option("--dir-tag-latency", type = "int", default = 8)
parser.add_option("--dir-tag-banks", type = "int", default = 4)
parser.add_option("--blocks-per-region", type = "int", default = 1)
parser.add_option("--use-L3-on-WT", action = "store_true", default = False)
parser.add_option("--nonInclusiveDir", action = "store_true",
default = False)
parser.add_option("--WB_L1", action = "store_true",
default = False, help = "writeback L2")
parser.add_option("--WB_L2", action = "store_true",
default = False, help = "writeback L2")
parser.add_option("--TCP_latency", type = "int",
default = 4, help = "TCP latency")
parser.add_option("--TCC_latency", type = "int",
default = 16, help = "TCC latency")
parser.add_option("--tcc-size", type = 'string', default = '2MB',
help = "agregate tcc size")
parser.add_option("--tcc-assoc", type = 'int', default = 16,
help = "tcc assoc")
parser.add_option("--tcp-size", type = 'string', default = '16kB',
help = "tcp size")
parser.add_option("--sampler-sets", type = "int", default = 1024)
parser.add_option("--sampler-assoc", type = "int", default = 16)
parser.add_option("--sampler-counter", type = "int", default = 512)
parser.add_option("--noL1", action = "store_true", default = False,
help = "bypassL1")
parser.add_option("--noL2", action = "store_true", default = False,
help = "bypassL2")
def create_system(options, full_system, system, dma_devices, ruby_system):
if buildEnv['PROTOCOL'] != 'GPU_VIPER_Baseline':
panic("This script requires the" \
"GPU_VIPER_Baseline protocol to be built.")
cpu_sequencers = []
#
# The ruby network creation expects the list of nodes in the system to be
# consistent with the NetDest list. Therefore the l1 controller nodes
# must be listed before the directory nodes and directory nodes before
# dma nodes, etc.
#
cp_cntrl_nodes = []
tcp_cntrl_nodes = []
sqc_cntrl_nodes = []
tcc_cntrl_nodes = []
dir_cntrl_nodes = []
l3_cntrl_nodes = []
#
# Must create the individual controllers before the network to ensure the
# controller constructors are called before the network constructor
#
# For an odd number of CPUs, still create the right number of controllers
TCC_bits = int(math.log(options.num_tccs, 2))
# This is the base crossbar that connects the L3s, Dirs, and cpu/gpu
# Clusters
crossbar_bw = 16 * options.num_compute_units #Assuming a 2GHz clock
mainCluster = Cluster(intBW = crossbar_bw)
for i in xrange(options.num_dirs):
dir_cntrl = DirCntrl(noTCCdir=True,TCC_select_num_bits = TCC_bits)
dir_cntrl.create(options, ruby_system, system)
dir_cntrl.number_of_TBEs = options.num_tbes
dir_cntrl.useL3OnWT = options.use_L3_on_WT
dir_cntrl.inclusiveDir = not options.nonInclusiveDir
# Connect the Directory controller to the ruby network
dir_cntrl.requestFromCores = MessageBuffer(ordered = True)
dir_cntrl.requestFromCores.slave = ruby_system.network.master
dir_cntrl.responseFromCores = MessageBuffer()
dir_cntrl.responseFromCores.slave = ruby_system.network.master
dir_cntrl.unblockFromCores = MessageBuffer()
dir_cntrl.unblockFromCores.slave = ruby_system.network.master
dir_cntrl.probeToCore = MessageBuffer()
dir_cntrl.probeToCore.master = ruby_system.network.slave
dir_cntrl.responseToCore = MessageBuffer()
dir_cntrl.responseToCore.master = ruby_system.network.slave
dir_cntrl.triggerQueue = MessageBuffer(ordered = True)
dir_cntrl.L3triggerQueue = MessageBuffer(ordered = True)
dir_cntrl.responseFromMemory = MessageBuffer()
exec("system.dir_cntrl%d = dir_cntrl" % i)
dir_cntrl_nodes.append(dir_cntrl)
mainCluster.add(dir_cntrl)
cpuCluster = Cluster(extBW = crossbar_bw, intBW=crossbar_bw)
for i in xrange((options.num_cpus + 1) / 2):
cp_cntrl = CPCntrl()
cp_cntrl.create(options, ruby_system, system)
exec("system.cp_cntrl%d = cp_cntrl" % i)
#
# Add controllers and sequencers to the appropriate lists
#
cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1])
# Connect the CP controllers and the network
cp_cntrl.requestFromCore = MessageBuffer()
cp_cntrl.requestFromCore.master = ruby_system.network.slave
cp_cntrl.responseFromCore = MessageBuffer()
cp_cntrl.responseFromCore.master = ruby_system.network.slave
cp_cntrl.unblockFromCore = MessageBuffer()
cp_cntrl.unblockFromCore.master = ruby_system.network.slave
cp_cntrl.probeToCore = MessageBuffer()
cp_cntrl.probeToCore.slave = ruby_system.network.master
cp_cntrl.responseToCore = MessageBuffer()
cp_cntrl.responseToCore.slave = ruby_system.network.master
cp_cntrl.mandatoryQueue = MessageBuffer()
cp_cntrl.triggerQueue = MessageBuffer(ordered = True)
cpuCluster.add(cp_cntrl)
gpuCluster = Cluster(extBW = crossbar_bw, intBW = crossbar_bw)
for i in xrange(options.num_compute_units):
tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
issue_latency = 1,
number_of_TBEs = 2560)
# TBEs set to max outstanding requests
tcp_cntrl.create(options, ruby_system, system)
tcp_cntrl.WB = options.WB_L1
tcp_cntrl.disableL1 = options.noL1
exec("system.tcp_cntrl%d = tcp_cntrl" % i)
#
# Add controllers and sequencers to the appropriate lists
#
cpu_sequencers.append(tcp_cntrl.coalescer)
tcp_cntrl_nodes.append(tcp_cntrl)
# Connect the CP (TCP) controllers to the ruby network
tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True)
tcp_cntrl.requestFromTCP.master = ruby_system.network.slave
tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True)
tcp_cntrl.responseFromTCP.master = ruby_system.network.slave
tcp_cntrl.unblockFromCore = MessageBuffer()
tcp_cntrl.unblockFromCore.master = ruby_system.network.slave
tcp_cntrl.probeToTCP = MessageBuffer(ordered = True)
tcp_cntrl.probeToTCP.slave = ruby_system.network.master
tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
tcp_cntrl.responseToTCP.slave = ruby_system.network.master
tcp_cntrl.mandatoryQueue = MessageBuffer()
gpuCluster.add(tcp_cntrl)
for i in xrange(options.num_sqc):
sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
sqc_cntrl.create(options, ruby_system, system)
exec("system.sqc_cntrl%d = sqc_cntrl" % i)
#
# Add controllers and sequencers to the appropriate lists
#
cpu_sequencers.append(sqc_cntrl.sequencer)
# Connect the SQC controller to the ruby network
sqc_cntrl.requestFromSQC = MessageBuffer(ordered = True)
sqc_cntrl.requestFromSQC.master = ruby_system.network.slave
sqc_cntrl.probeToSQC = MessageBuffer(ordered = True)
sqc_cntrl.probeToSQC.slave = ruby_system.network.master
sqc_cntrl.responseToSQC = MessageBuffer(ordered = True)
sqc_cntrl.responseToSQC.slave = ruby_system.network.master
sqc_cntrl.mandatoryQueue = MessageBuffer()
# SQC also in GPU cluster
gpuCluster.add(sqc_cntrl)
# Because of wire buffers, num_tccs must equal num_tccdirs
numa_bit = 6
for i in xrange(options.num_tccs):
tcc_cntrl = TCCCntrl()
tcc_cntrl.create(options, ruby_system, system)
tcc_cntrl.l2_request_latency = options.gpu_to_dir_latency
tcc_cntrl.l2_response_latency = options.TCC_latency
tcc_cntrl_nodes.append(tcc_cntrl)
tcc_cntrl.WB = options.WB_L2
tcc_cntrl.number_of_TBEs = 2560 * options.num_compute_units
# Connect the TCC controllers to the ruby network
tcc_cntrl.requestFromTCP = MessageBuffer(ordered = True)
tcc_cntrl.requestFromTCP.slave = ruby_system.network.master
tcc_cntrl.responseToCore = MessageBuffer(ordered = True)
tcc_cntrl.responseToCore.master = ruby_system.network.slave
tcc_cntrl.probeFromNB = MessageBuffer()
tcc_cntrl.probeFromNB.slave = ruby_system.network.master
tcc_cntrl.responseFromNB = MessageBuffer()
tcc_cntrl.responseFromNB.slave = ruby_system.network.master
tcc_cntrl.requestToNB = MessageBuffer(ordered = True)
tcc_cntrl.requestToNB.master = ruby_system.network.slave
tcc_cntrl.responseToNB = MessageBuffer()
tcc_cntrl.responseToNB.master = ruby_system.network.slave
tcc_cntrl.unblockToNB = MessageBuffer()
tcc_cntrl.unblockToNB.master = ruby_system.network.slave
tcc_cntrl.triggerQueue = MessageBuffer(ordered = True)
exec("system.tcc_cntrl%d = tcc_cntrl" % i)
# connect all of the wire buffers between L3 and dirs up
# TCC cntrls added to the GPU cluster
gpuCluster.add(tcc_cntrl)
# Assuming no DMA devices
assert(len(dma_devices) == 0)
# Add cpu/gpu clusters to main cluster
mainCluster.add(cpuCluster)
mainCluster.add(gpuCluster)
ruby_system.network.number_of_virtual_networks = 10
return (cpu_sequencers, dir_cntrl_nodes, mainCluster)

View file

@ -0,0 +1,758 @@
#
# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Sooraj Puthoor
#
import math
import m5
from m5.objects import *
from m5.defines import buildEnv
from Ruby import send_evicts
from Cluster import Cluster
class CntrlBase:
_seqs = 0
@classmethod
def seqCount(cls):
# Use SeqCount not class since we need global count
CntrlBase._seqs += 1
return CntrlBase._seqs - 1
_cntrls = 0
@classmethod
def cntrlCount(cls):
# Use CntlCount not class since we need global count
CntrlBase._cntrls += 1
return CntrlBase._cntrls - 1
_version = 0
@classmethod
def versionCount(cls):
cls._version += 1 # Use count for this particular type
return cls._version - 1
#
# Note: the L1 Cache latency is only used by the sequencer on fast path hits
#
class L1Cache(RubyCache):
resourceStalls = False
dataArrayBanks = 2
tagArrayBanks = 2
dataAccessLatency = 1
tagAccessLatency = 1
def create(self, size, assoc, options):
self.size = MemorySize(size)
self.assoc = assoc
self.replacement_policy = PseudoLRUReplacementPolicy()
class L2Cache(RubyCache):
resourceStalls = False
assoc = 16
dataArrayBanks = 16
tagArrayBanks = 16
def create(self, size, assoc, options):
self.size = MemorySize(size)
self.assoc = assoc
self.replacement_policy = PseudoLRUReplacementPolicy()
class CPCntrl(CorePair_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.L1Icache = L1Cache()
self.L1Icache.create(options.l1i_size, options.l1i_assoc, options)
self.L1D0cache = L1Cache()
self.L1D0cache.create(options.l1d_size, options.l1d_assoc, options)
self.L1D1cache = L1Cache()
self.L1D1cache.create(options.l1d_size, options.l1d_assoc, options)
self.L2cache = L2Cache()
self.L2cache.create(options.l2_size, options.l2_assoc, options)
self.sequencer = RubySequencer()
self.sequencer.version = self.seqCount()
self.sequencer.icache = self.L1Icache
self.sequencer.dcache = self.L1D0cache
self.sequencer.ruby_system = ruby_system
self.sequencer.coreid = 0
self.sequencer.is_cpu_sequencer = True
self.sequencer1 = RubySequencer()
self.sequencer1.version = self.seqCount()
self.sequencer1.icache = self.L1Icache
self.sequencer1.dcache = self.L1D1cache
self.sequencer1.ruby_system = ruby_system
self.sequencer1.coreid = 1
self.sequencer1.is_cpu_sequencer = True
self.issue_latency = 1
self.send_evictions = send_evicts(options)
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
class TCPCache(RubyCache):
size = "16kB"
assoc = 16
dataArrayBanks = 16
tagArrayBanks = 16
dataAccessLatency = 4
tagAccessLatency = 1
def create(self, options):
self.size = MemorySize(options.tcp_size)
self.dataArrayBanks = 16
self.tagArrayBanks = 16
self.dataAccessLatency = 4
self.tagAccessLatency = 1
self.resourceStalls = options.no_tcc_resource_stalls
self.replacement_policy = PseudoLRUReplacementPolicy(assoc = self.assoc)
class TCPCntrl(TCP_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.L1cache = TCPCache(dataAccessLatency = options.TCP_latency)
self.L1cache.create(options)
self.issue_latency = 1
self.coalescer = VIPERCoalescer()
self.coalescer.version = self.seqCount()
self.coalescer.icache = self.L1cache
self.coalescer.dcache = self.L1cache
self.coalescer.ruby_system = ruby_system
self.coalescer.support_inst_reqs = False
self.coalescer.is_cpu_sequencer = False
self.sequencer = RubySequencer()
self.sequencer.version = self.seqCount()
self.sequencer.icache = self.L1cache
self.sequencer.dcache = self.L1cache
self.sequencer.ruby_system = ruby_system
self.sequencer.is_cpu_sequencer = True
self.use_seq_not_coal = False
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
class SQCCache(RubyCache):
dataArrayBanks = 8
tagArrayBanks = 8
dataAccessLatency = 1
tagAccessLatency = 1
def create(self, options):
self.size = MemorySize(options.sqc_size)
self.assoc = options.sqc_assoc
self.replacement_policy = PseudoLRUReplacementPolicy(assoc = self.assoc)
class SQCCntrl(SQC_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.L1cache = SQCCache()
self.L1cache.create(options)
self.L1cache.resourceStalls = False
self.sequencer = RubySequencer()
self.sequencer.version = self.seqCount()
self.sequencer.icache = self.L1cache
self.sequencer.dcache = self.L1cache
self.sequencer.ruby_system = ruby_system
self.sequencer.support_data_reqs = False
self.sequencer.is_cpu_sequencer = False
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
class TCC(RubyCache):
size = MemorySize("256kB")
assoc = 16
dataAccessLatency = 8
tagAccessLatency = 2
resourceStalls = False
def create(self, options):
self.assoc = options.tcc_assoc
if hasattr(options, 'bw_scalor') and options.bw_scalor > 0:
s = options.num_compute_units
tcc_size = s * 128
tcc_size = str(tcc_size)+'kB'
self.size = MemorySize(tcc_size)
self.dataArrayBanks = 64
self.tagArrayBanks = 64
else:
self.size = MemorySize(options.tcc_size)
self.dataArrayBanks = 256 / options.num_tccs #number of data banks
self.tagArrayBanks = 256 / options.num_tccs #number of tag banks
self.size.value = self.size.value / options.num_tccs
if ((self.size.value / long(self.assoc)) < 128):
self.size.value = long(128 * self.assoc)
self.start_index_bit = math.log(options.cacheline_size, 2) + \
math.log(options.num_tccs, 2)
self.replacement_policy = PseudoLRUReplacementPolicy(assoc = self.assoc)
class TCCCntrl(TCC_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.L2cache = TCC()
self.L2cache.create(options)
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
class L3Cache(RubyCache):
dataArrayBanks = 16
tagArrayBanks = 16
def create(self, options, ruby_system, system):
self.size = MemorySize(options.l3_size)
self.size.value /= options.num_dirs
self.assoc = options.l3_assoc
self.dataArrayBanks /= options.num_dirs
self.tagArrayBanks /= options.num_dirs
self.dataArrayBanks /= options.num_dirs
self.tagArrayBanks /= options.num_dirs
self.dataAccessLatency = options.l3_data_latency
self.tagAccessLatency = options.l3_tag_latency
self.resourceStalls = False
self.replacement_policy = PseudoLRUReplacementPolicy(assoc = self.assoc)
class L3Cntrl(L3Cache_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.L3cache = L3Cache()
self.L3cache.create(options, ruby_system, system)
self.l3_response_latency = \
max(self.L3cache.dataAccessLatency, self.L3cache.tagAccessLatency)
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
req_to_l3, probe_to_l3, resp_to_l3):
self.reqToDir = req_to_dir
self.respToDir = resp_to_dir
self.l3UnblockToDir = l3_unblock_to_dir
self.reqToL3 = req_to_l3
self.probeToL3 = probe_to_l3
self.respToL3 = resp_to_l3
# Directory memory: a directory memory of infinite size, used by the
# directory controller to store the "states" of its state machine. The
# state machine is implemented per cache block.
class DirMem(RubyDirectoryMemory, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
phys_mem_size = AddrRange(options.mem_size).size()
mem_module_size = phys_mem_size / options.num_dirs
dir_size = MemorySize('0B')
dir_size.value = mem_module_size
self.size = dir_size
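# For example (hypothetical values, for illustration only): with
# --mem-size=512MB and --num-dirs=2, each directory instance would be sized
# to cover a 256MB slice of physical memory.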
# Directory controller: contains the directory memory, the L3 cache, and the
# associated state machine, which is used to accurately redirect a data
# request to the L3 cache or to memory. Permission requests do not come to
# this directory in region-based protocols, as they are handled exclusively
# by the region directory. However, the region directory controller uses this
# directory controller for sending probe requests and receiving probe
# responses.
class DirCntrl(Directory_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.response_latency = 25
self.response_latency_regionDir = 1
self.directory = DirMem()
self.directory.create(options, ruby_system, system)
self.L3CacheMemory = L3Cache()
self.L3CacheMemory.create(options, ruby_system, system)
self.l3_hit_latency = \
max(self.L3CacheMemory.dataAccessLatency,
self.L3CacheMemory.tagAccessLatency)
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
req_to_l3, probe_to_l3, resp_to_l3):
self.reqToDir = req_to_dir
self.respToDir = resp_to_dir
self.l3UnblockToDir = l3_unblock_to_dir
self.reqToL3 = req_to_l3
self.probeToL3 = probe_to_l3
self.respToL3 = resp_to_l3
# Region directory : Stores region permissions
class RegionDir(RubyCache):
def create(self, options, ruby_system, system):
self.block_size = "%dB" % (64 * options.blocks_per_region)
self.size = options.region_dir_entries * \
self.block_size * options.num_compute_units
self.assoc = 8
self.tagArrayBanks = 8
self.tagAccessLatency = options.dir_tag_latency
self.dataAccessLatency = 1
self.resourceStalls = options.no_resource_stalls
self.start_index_bit = 6 + int(math.log(options.blocks_per_region, 2))
self.replacement_policy = PseudoLRUReplacementPolicy(assoc = self.assoc)
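# Worked example (using the option defaults declared in define_options below,
# assumed here for illustration only): with --blocks-per-region=16 the region
# block_size is 64 * 16 = 1024B and start_index_bit is 6 + log2(16) = 10,
# i.e. the region index begins just above the 1kB region offset.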
# Region directory controller : Contains region directory and associated state
# machine for dealing with region coherence requests.
class RegionCntrl(RegionDir_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.cacheMemory = RegionDir()
self.cacheMemory.create(options, ruby_system, system)
self.blocksPerRegion = options.blocks_per_region
self.toDirLatency = \
max(self.cacheMemory.dataAccessLatency,
self.cacheMemory.tagAccessLatency)
self.ruby_system = ruby_system
self.always_migrate = options.always_migrate
self.sym_migrate = options.symmetric_migrate
self.asym_migrate = options.asymmetric_migrate
if self.always_migrate:
assert(not self.asym_migrate and not self.sym_migrate)
if self.sym_migrate:
assert(not self.always_migrate and not self.asym_migrate)
if self.asym_migrate:
assert(not self.always_migrate and not self.sym_migrate)
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
# Region buffer: a cache of the region directory that avoids the potentially
# long-latency region directory lookup needed to obtain region permissions.
class RegionBuffer(RubyCache):
assoc = 4
dataArrayBanks = 256
tagArrayBanks = 256
dataAccessLatency = 1
tagAccessLatency = 1
resourceStalls = True
class RBCntrl(RegionBuffer_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.cacheMemory = RegionBuffer()
self.cacheMemory.resourceStalls = options.no_tcc_resource_stalls
self.cacheMemory.dataArrayBanks = 64
self.cacheMemory.tagArrayBanks = 64
self.blocksPerRegion = options.blocks_per_region
self.cacheMemory.block_size = "%dB" % (64 * self.blocksPerRegion)
self.cacheMemory.start_index_bit = \
6 + int(math.log(self.blocksPerRegion, 2))
self.cacheMemory.size = options.region_buffer_entries * \
self.cacheMemory.block_size * options.num_compute_units
self.toDirLatency = options.gpu_to_dir_latency
self.toRegionDirLatency = options.cpu_to_dir_latency
self.noTCCdir = True
TCC_bits = int(math.log(options.num_tccs, 2))
self.TCC_select_num_bits = TCC_bits
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
self.cacheMemory.replacement_policy = \
PseudoLRUReplacementPolicy(assoc = self.cacheMemory.assoc)
def define_options(parser):
parser.add_option("--num-subcaches", type="int", default=4)
parser.add_option("--l3-data-latency", type="int", default=20)
parser.add_option("--l3-tag-latency", type="int", default=15)
parser.add_option("--cpu-to-dir-latency", type="int", default=120)
parser.add_option("--gpu-to-dir-latency", type="int", default=60)
parser.add_option("--no-resource-stalls", action="store_false",
default=True)
parser.add_option("--no-tcc-resource-stalls", action="store_false",
default=True)
parser.add_option("--num-tbes", type="int", default=32)
parser.add_option("--l2-latency", type="int", default=50) # load to use
parser.add_option("--num-tccs", type="int", default=1,
help="number of TCC banks in the GPU")
parser.add_option("--sqc-size", type='string', default='32kB',
help="SQC cache size")
parser.add_option("--sqc-assoc", type='int', default=8,
help="SQC cache assoc")
parser.add_option("--WB_L1", action="store_true",
default=False, help="L2 Writeback Cache")
parser.add_option("--WB_L2", action="store_true",
default=False, help="L2 Writeback Cache")
parser.add_option("--TCP_latency",
type="int", default=4, help="TCP latency")
parser.add_option("--TCC_latency",
type="int", default=16, help="TCC latency")
parser.add_option("--tcc-size", type='string', default='2MB',
help="agregate tcc size")
parser.add_option("--tcc-assoc", type='int', default=16,
help="tcc assoc")
parser.add_option("--tcp-size", type='string', default='16kB',
help="tcp size")
parser.add_option("--dir-tag-latency", type="int", default=4)
parser.add_option("--dir-tag-banks", type="int", default=4)
parser.add_option("--blocks-per-region", type="int", default=16)
parser.add_option("--dir-entries", type="int", default=8192)
# The region buffer is a cache of the region directory. Hence the region
# directory is inclusive with respect to the region buffer.
# However, the region directory is non-inclusive with respect to
# the caches in the system.
parser.add_option("--region-dir-entries", type="int", default=1024)
parser.add_option("--region-buffer-entries", type="int", default=512)
parser.add_option("--always-migrate",
action="store_true", default=False)
parser.add_option("--symmetric-migrate",
action="store_true", default=False)
parser.add_option("--asymmetric-migrate",
action="store_true", default=False)
parser.add_option("--use-L3-on-WT", action="store_true", default=False)
def create_system(options, full_system, system, dma_devices, ruby_system):
if buildEnv['PROTOCOL'] != 'GPU_VIPER_Region':
panic("This script requires the GPU_VIPER_Region protocol to be built.")
cpu_sequencers = []
#
# The ruby network creation expects the list of nodes in the system to be
# consistent with the NetDest list. Therefore the l1 controller nodes
# must be listed before the directory nodes and directory nodes before
# dma nodes, etc.
#
dir_cntrl_nodes = []
# For an odd number of CPUs, still create the right number of controllers
TCC_bits = int(math.log(options.num_tccs, 2))
#
# Must create the individual controllers before the network to ensure the
# controller constructors are called before the network constructor
#
# For an odd number of CPUs, still create the right number of controllers
crossbar_bw = 16 * options.num_compute_units #Assuming a 2GHz clock
cpuCluster = Cluster(extBW = (crossbar_bw), intBW=crossbar_bw)
for i in xrange((options.num_cpus + 1) / 2):
cp_cntrl = CPCntrl()
cp_cntrl.create(options, ruby_system, system)
rb_cntrl = RBCntrl()
rb_cntrl.create(options, ruby_system, system)
rb_cntrl.number_of_TBEs = 256
rb_cntrl.isOnCPU = True
cp_cntrl.regionBufferNum = rb_cntrl.version
exec("system.cp_cntrl%d = cp_cntrl" % i)
exec("system.rb_cntrl%d = rb_cntrl" % i)
#
# Add controllers and sequencers to the appropriate lists
#
cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1])
# Connect the CP controllers and the network
cp_cntrl.requestFromCore = MessageBuffer()
cp_cntrl.requestFromCore.master = ruby_system.network.slave
cp_cntrl.responseFromCore = MessageBuffer()
cp_cntrl.responseFromCore.master = ruby_system.network.slave
cp_cntrl.unblockFromCore = MessageBuffer()
cp_cntrl.unblockFromCore.master = ruby_system.network.slave
cp_cntrl.probeToCore = MessageBuffer()
cp_cntrl.probeToCore.slave = ruby_system.network.master
cp_cntrl.responseToCore = MessageBuffer()
cp_cntrl.responseToCore.slave = ruby_system.network.master
cp_cntrl.mandatoryQueue = MessageBuffer()
cp_cntrl.triggerQueue = MessageBuffer(ordered = True)
# Connect the RB controllers to the ruby network
rb_cntrl.requestFromCore = MessageBuffer(ordered = True)
rb_cntrl.requestFromCore.slave = ruby_system.network.master
rb_cntrl.responseFromCore = MessageBuffer()
rb_cntrl.responseFromCore.slave = ruby_system.network.master
rb_cntrl.requestToNetwork = MessageBuffer()
rb_cntrl.requestToNetwork.master = ruby_system.network.slave
rb_cntrl.notifyFromRegionDir = MessageBuffer()
rb_cntrl.notifyFromRegionDir.slave = ruby_system.network.master
rb_cntrl.probeFromRegionDir = MessageBuffer()
rb_cntrl.probeFromRegionDir.slave = ruby_system.network.master
rb_cntrl.unblockFromDir = MessageBuffer()
rb_cntrl.unblockFromDir.slave = ruby_system.network.master
rb_cntrl.responseToRegDir = MessageBuffer()
rb_cntrl.responseToRegDir.master = ruby_system.network.slave
rb_cntrl.triggerQueue = MessageBuffer(ordered = True)
cpuCluster.add(cp_cntrl)
cpuCluster.add(rb_cntrl)
gpuCluster = Cluster(extBW = (crossbar_bw), intBW = crossbar_bw)
for i in xrange(options.num_compute_units):
tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
issue_latency = 1,
number_of_TBEs = 2560)
# TBEs set to max outstanding requests
tcp_cntrl.create(options, ruby_system, system)
tcp_cntrl.WB = options.WB_L1
tcp_cntrl.disableL1 = False
exec("system.tcp_cntrl%d = tcp_cntrl" % i)
#
# Add controllers and sequencers to the appropriate lists
#
cpu_sequencers.append(tcp_cntrl.coalescer)
# Connect the CP (TCP) controllers to the ruby network
tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True)
tcp_cntrl.requestFromTCP.master = ruby_system.network.slave
tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True)
tcp_cntrl.responseFromTCP.master = ruby_system.network.slave
tcp_cntrl.unblockFromCore = MessageBuffer()
tcp_cntrl.unblockFromCore.master = ruby_system.network.slave
tcp_cntrl.probeToTCP = MessageBuffer(ordered = True)
tcp_cntrl.probeToTCP.slave = ruby_system.network.master
tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
tcp_cntrl.responseToTCP.slave = ruby_system.network.master
tcp_cntrl.mandatoryQueue = MessageBuffer()
gpuCluster.add(tcp_cntrl)
for i in xrange(options.num_sqc):
sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
sqc_cntrl.create(options, ruby_system, system)
exec("system.sqc_cntrl%d = sqc_cntrl" % i)
#
# Add controllers and sequencers to the appropriate lists
#
cpu_sequencers.append(sqc_cntrl.sequencer)
# Connect the SQC controller to the ruby network
sqc_cntrl.requestFromSQC = MessageBuffer(ordered = True)
sqc_cntrl.requestFromSQC.master = ruby_system.network.slave
sqc_cntrl.probeToSQC = MessageBuffer(ordered = True)
sqc_cntrl.probeToSQC.slave = ruby_system.network.master
sqc_cntrl.responseToSQC = MessageBuffer(ordered = True)
sqc_cntrl.responseToSQC.slave = ruby_system.network.master
sqc_cntrl.mandatoryQueue = MessageBuffer()
# SQC also in GPU cluster
gpuCluster.add(sqc_cntrl)
numa_bit = 6
for i in xrange(options.num_tccs):
tcc_cntrl = TCCCntrl()
tcc_cntrl.create(options, ruby_system, system)
tcc_cntrl.l2_request_latency = 1
tcc_cntrl.l2_response_latency = options.TCC_latency
tcc_cntrl.WB = options.WB_L2
tcc_cntrl.number_of_TBEs = 2560 * options.num_compute_units
# Connect the TCC controllers to the ruby network
tcc_cntrl.requestFromTCP = MessageBuffer(ordered = True)
tcc_cntrl.requestFromTCP.slave = ruby_system.network.master
tcc_cntrl.responseToCore = MessageBuffer(ordered = True)
tcc_cntrl.responseToCore.master = ruby_system.network.slave
tcc_cntrl.probeFromNB = MessageBuffer()
tcc_cntrl.probeFromNB.slave = ruby_system.network.master
tcc_cntrl.responseFromNB = MessageBuffer()
tcc_cntrl.responseFromNB.slave = ruby_system.network.master
tcc_cntrl.requestToNB = MessageBuffer(ordered = True)
tcc_cntrl.requestToNB.master = ruby_system.network.slave
tcc_cntrl.responseToNB = MessageBuffer()
tcc_cntrl.responseToNB.master = ruby_system.network.slave
tcc_cntrl.unblockToNB = MessageBuffer()
tcc_cntrl.unblockToNB.master = ruby_system.network.slave
tcc_cntrl.triggerQueue = MessageBuffer(ordered = True)
rb_cntrl = RBCntrl()
rb_cntrl.create(options, ruby_system, system)
rb_cntrl.number_of_TBEs = 2560 * options.num_compute_units
rb_cntrl.isOnCPU = False
# Connect the RB controllers to the ruby network
rb_cntrl.requestFromCore = MessageBuffer(ordered = True)
rb_cntrl.requestFromCore.slave = ruby_system.network.master
rb_cntrl.responseFromCore = MessageBuffer()
rb_cntrl.responseFromCore.slave = ruby_system.network.master
rb_cntrl.requestToNetwork = MessageBuffer()
rb_cntrl.requestToNetwork.master = ruby_system.network.slave
rb_cntrl.notifyFromRegionDir = MessageBuffer()
rb_cntrl.notifyFromRegionDir.slave = ruby_system.network.master
rb_cntrl.probeFromRegionDir = MessageBuffer()
rb_cntrl.probeFromRegionDir.slave = ruby_system.network.master
rb_cntrl.unblockFromDir = MessageBuffer()
rb_cntrl.unblockFromDir.slave = ruby_system.network.master
rb_cntrl.responseToRegDir = MessageBuffer()
rb_cntrl.responseToRegDir.master = ruby_system.network.slave
rb_cntrl.triggerQueue = MessageBuffer(ordered = True)
tcc_cntrl.regionBufferNum = rb_cntrl.version
exec("system.tcc_cntrl%d = tcc_cntrl" % i)
exec("system.tcc_rb_cntrl%d = rb_cntrl" % i)
# TCC cntrls added to the GPU cluster
gpuCluster.add(tcc_cntrl)
gpuCluster.add(rb_cntrl)
# Because of wire buffers, num_l3caches must equal num_dirs
# Region coherence only works with 1 dir
assert(options.num_l3caches == options.num_dirs == 1)
# This is the base crossbar that connects the L3s, Dirs, and cpu/gpu
# Clusters
mainCluster = Cluster(intBW = crossbar_bw)
dir_cntrl = DirCntrl()
dir_cntrl.create(options, ruby_system, system)
dir_cntrl.number_of_TBEs = 2560 * options.num_compute_units
dir_cntrl.useL3OnWT = options.use_L3_on_WT
# Connect the Directory controller to the ruby network
dir_cntrl.requestFromCores = MessageBuffer()
dir_cntrl.requestFromCores.slave = ruby_system.network.master
dir_cntrl.responseFromCores = MessageBuffer()
dir_cntrl.responseFromCores.slave = ruby_system.network.master
dir_cntrl.unblockFromCores = MessageBuffer()
dir_cntrl.unblockFromCores.slave = ruby_system.network.master
dir_cntrl.probeToCore = MessageBuffer()
dir_cntrl.probeToCore.master = ruby_system.network.slave
dir_cntrl.responseToCore = MessageBuffer()
dir_cntrl.responseToCore.master = ruby_system.network.slave
dir_cntrl.reqFromRegBuf = MessageBuffer()
dir_cntrl.reqFromRegBuf.slave = ruby_system.network.master
dir_cntrl.reqToRegDir = MessageBuffer(ordered = True)
dir_cntrl.reqToRegDir.master = ruby_system.network.slave
dir_cntrl.reqFromRegDir = MessageBuffer(ordered = True)
dir_cntrl.reqFromRegDir.slave = ruby_system.network.master
dir_cntrl.unblockToRegDir = MessageBuffer()
dir_cntrl.unblockToRegDir.master = ruby_system.network.slave
dir_cntrl.triggerQueue = MessageBuffer(ordered = True)
dir_cntrl.L3triggerQueue = MessageBuffer(ordered = True)
dir_cntrl.responseFromMemory = MessageBuffer()
exec("system.dir_cntrl%d = dir_cntrl" % i)
dir_cntrl_nodes.append(dir_cntrl)
mainCluster.add(dir_cntrl)
reg_cntrl = RegionCntrl(noTCCdir=True,TCC_select_num_bits = TCC_bits)
reg_cntrl.create(options, ruby_system, system)
reg_cntrl.number_of_TBEs = options.num_tbes
reg_cntrl.cpuRegionBufferNum = system.rb_cntrl0.version
reg_cntrl.gpuRegionBufferNum = system.tcc_rb_cntrl0.version
# Connect the Region Dir controllers to the ruby network
reg_cntrl.requestToDir = MessageBuffer(ordered = True)
reg_cntrl.requestToDir.master = ruby_system.network.slave
reg_cntrl.notifyToRBuffer = MessageBuffer()
reg_cntrl.notifyToRBuffer.master = ruby_system.network.slave
reg_cntrl.probeToRBuffer = MessageBuffer()
reg_cntrl.probeToRBuffer.master = ruby_system.network.slave
reg_cntrl.responseFromRBuffer = MessageBuffer()
reg_cntrl.responseFromRBuffer.slave = ruby_system.network.master
reg_cntrl.requestFromRegBuf = MessageBuffer()
reg_cntrl.requestFromRegBuf.slave = ruby_system.network.master
reg_cntrl.triggerQueue = MessageBuffer(ordered = True)
exec("system.reg_cntrl%d = reg_cntrl" % i)
mainCluster.add(reg_cntrl)
# Assuming no DMA devices
assert(len(dma_devices) == 0)
# Add cpu/gpu clusters to main cluster
mainCluster.add(cpuCluster)
mainCluster.add(gpuCluster)
ruby_system.network.number_of_virtual_networks = 10
return (cpu_sequencers, dir_cntrl_nodes, mainCluster)
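For orientation, the TCC sizing arithmetic performed by TCC.create() above can be reproduced with the minimal standalone sketch below (plain Python, illustrative only and not part of the commit). The option values are the defaults declared in define_options(), and the 64-byte cache-line size is an assumption made solely for this example.

import math

# Assumed defaults from define_options(); the 64B line size is an assumption.
tcc_size_bytes = 2 * 1024 * 1024    # --tcc-size=2MB (aggregate TCC size)
num_tccs       = 1                  # --num-tccs
tcc_assoc      = 16                 # --tcc-assoc
cacheline      = 64                 # assumed cache-line size in bytes

# Mirror the non-bw_scalor branch of TCC.create(): split the aggregate size
# and the 256 data/tag banks across the TCC banks, then clamp tiny slices.
data_banks    = 256 // num_tccs
tag_banks     = 256 // num_tccs
per_tcc_bytes = tcc_size_bytes // num_tccs
if per_tcc_bytes // tcc_assoc < 128:
    per_tcc_bytes = 128 * tcc_assoc
start_index_bit = int(math.log(cacheline, 2) + math.log(num_tccs, 2))

print("per-TCC slice: %d kB, %d-way, %d data banks, index starts at bit %d"
      % (per_tcc_bytes // 1024, tcc_assoc, data_banks, start_index_bit))
# -> per-TCC slice: 2048 kB, 16-way, 256 data banks, index starts at bit 6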

View file

@@ -0,0 +1,326 @@
#
# Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Lisa Hsu
#
import math
import m5
from m5.objects import *
from m5.defines import buildEnv
from Ruby import create_topology
from Ruby import send_evicts
from Cluster import Cluster
from Crossbar import Crossbar
class CntrlBase:
_seqs = 0
@classmethod
def seqCount(cls):
# Use SeqCount not class since we need global count
CntrlBase._seqs += 1
return CntrlBase._seqs - 1
_cntrls = 0
@classmethod
def cntrlCount(cls):
# Use CntrlCount not class since we need global count
CntrlBase._cntrls += 1
return CntrlBase._cntrls - 1
_version = 0
@classmethod
def versionCount(cls):
cls._version += 1 # Use count for this particular type
return cls._version - 1
class L1DCache(RubyCache):
resourceStalls = False
def create(self, options):
self.size = MemorySize(options.l1d_size)
self.assoc = options.l1d_assoc
self.replacement_policy = PseudoLRUReplacementPolicy()
class L1ICache(RubyCache):
resourceStalls = False
def create(self, options):
self.size = MemorySize(options.l1i_size)
self.assoc = options.l1i_assoc
self.replacement_policy = PseudoLRUReplacementPolicy()
class L2Cache(RubyCache):
resourceStalls = False
def create(self, options):
self.size = MemorySize(options.l2_size)
self.assoc = options.l2_assoc
self.replacement_policy = PseudoLRUReplacementPolicy()
class CPCntrl(CorePair_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.L1Icache = L1ICache()
self.L1Icache.create(options)
self.L1D0cache = L1DCache()
self.L1D0cache.create(options)
self.L1D1cache = L1DCache()
self.L1D1cache.create(options)
self.L2cache = L2Cache()
self.L2cache.create(options)
self.sequencer = RubySequencer()
self.sequencer.icache_hit_latency = 2
self.sequencer.dcache_hit_latency = 2
self.sequencer.version = self.seqCount()
self.sequencer.icache = self.L1Icache
self.sequencer.dcache = self.L1D0cache
self.sequencer.ruby_system = ruby_system
self.sequencer.coreid = 0
self.sequencer.is_cpu_sequencer = True
self.sequencer1 = RubySequencer()
self.sequencer1.version = self.seqCount()
self.sequencer1.icache = self.L1Icache
self.sequencer1.dcache = self.L1D1cache
self.sequencer1.icache_hit_latency = 2
self.sequencer1.dcache_hit_latency = 2
self.sequencer1.ruby_system = ruby_system
self.sequencer1.coreid = 1
self.sequencer1.is_cpu_sequencer = True
self.issue_latency = options.cpu_to_dir_latency
self.send_evictions = send_evicts(options)
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
class L3Cache(RubyCache):
assoc = 8
dataArrayBanks = 256
tagArrayBanks = 256
def create(self, options, ruby_system, system):
self.size = MemorySize(options.l3_size)
self.size.value /= options.num_dirs
self.dataArrayBanks /= options.num_dirs
self.tagArrayBanks /= options.num_dirs
self.dataArrayBanks /= options.num_dirs
self.tagArrayBanks /= options.num_dirs
self.dataAccessLatency = options.l3_data_latency
self.tagAccessLatency = options.l3_tag_latency
self.resourceStalls = options.no_resource_stalls
self.replacement_policy = PseudoLRUReplacementPolicy()
class L3Cntrl(L3Cache_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.L3cache = L3Cache()
self.L3cache.create(options, ruby_system, system)
self.l3_response_latency = max(self.L3cache.dataAccessLatency,
self.L3cache.tagAccessLatency)
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
req_to_l3, probe_to_l3, resp_to_l3):
self.reqToDir = req_to_dir
self.respToDir = resp_to_dir
self.l3UnblockToDir = l3_unblock_to_dir
self.reqToL3 = req_to_l3
self.probeToL3 = probe_to_l3
self.respToL3 = resp_to_l3
class DirMem(RubyDirectoryMemory, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
phys_mem_size = AddrRange(options.mem_size).size()
mem_module_size = phys_mem_size / options.num_dirs
dir_size = MemorySize('0B')
dir_size.value = mem_module_size
self.size = dir_size
class DirCntrl(Directory_Controller, CntrlBase):
def create(self, options, ruby_system, system):
self.version = self.versionCount()
self.response_latency = 30
self.directory = DirMem()
self.directory.create(options, ruby_system, system)
self.L3CacheMemory = L3Cache()
self.L3CacheMemory.create(options, ruby_system, system)
self.l3_hit_latency = max(self.L3CacheMemory.dataAccessLatency,
self.L3CacheMemory.tagAccessLatency)
self.number_of_TBEs = options.num_tbes
self.ruby_system = ruby_system
if options.recycle_latency:
self.recycle_latency = options.recycle_latency
self.CPUonly = True
def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
req_to_l3, probe_to_l3, resp_to_l3):
self.reqToDir = req_to_dir
self.respToDir = resp_to_dir
self.l3UnblockToDir = l3_unblock_to_dir
self.reqToL3 = req_to_l3
self.probeToL3 = probe_to_l3
self.respToL3 = resp_to_l3
def define_options(parser):
parser.add_option("--num-subcaches", type="int", default=4)
parser.add_option("--l3-data-latency", type="int", default=20)
parser.add_option("--l3-tag-latency", type="int", default=15)
parser.add_option("--cpu-to-dir-latency", type="int", default=15)
parser.add_option("--no-resource-stalls", action="store_false",
default=True)
parser.add_option("--num-tbes", type="int", default=256)
parser.add_option("--l2-latency", type="int", default=50) # load to use
def create_system(options, full_system, system, dma_devices, ruby_system):
if buildEnv['PROTOCOL'] != 'MOESI_AMD_Base':
panic("This script requires the MOESI_AMD_Base protocol.")
cpu_sequencers = []
#
# The ruby network creation expects the list of nodes in the system to
# be consistent with the NetDest list. Therefore the l1 controller
# nodes must be listed before the directory nodes and directory nodes
# before dma nodes, etc.
#
l1_cntrl_nodes = []
l3_cntrl_nodes = []
dir_cntrl_nodes = []
control_count = 0
#
# Must create the individual controllers before the network to ensure
# the controller constructors are called before the network constructor
#
# This is the base crossbar that connects the L3s, Dirs, and cpu
# Cluster
mainCluster = Cluster(extBW = 512, intBW = 512) # 1 TB/s
for i in xrange(options.num_dirs):
dir_cntrl = DirCntrl(TCC_select_num_bits = 0)
dir_cntrl.create(options, ruby_system, system)
# Connect the Directory controller to the ruby network
dir_cntrl.requestFromCores = MessageBuffer(ordered = True)
dir_cntrl.requestFromCores.slave = ruby_system.network.master
dir_cntrl.responseFromCores = MessageBuffer()
dir_cntrl.responseFromCores.slave = ruby_system.network.master
dir_cntrl.unblockFromCores = MessageBuffer()
dir_cntrl.unblockFromCores.slave = ruby_system.network.master
dir_cntrl.probeToCore = MessageBuffer()
dir_cntrl.probeToCore.master = ruby_system.network.slave
dir_cntrl.responseToCore = MessageBuffer()
dir_cntrl.responseToCore.master = ruby_system.network.slave
dir_cntrl.triggerQueue = MessageBuffer(ordered = True)
dir_cntrl.L3triggerQueue = MessageBuffer(ordered = True)
dir_cntrl.responseFromMemory = MessageBuffer()
exec("system.dir_cntrl%d = dir_cntrl" % i)
dir_cntrl_nodes.append(dir_cntrl)
mainCluster.add(dir_cntrl)
# Technically this config can support an odd number of cpus, but the top
# level config files, such as the ruby_random_tester, will get confused if
# the number of cpus does not equal the number of sequencers. Thus make
# sure that an even number of cpus is specified.
assert((options.num_cpus % 2) == 0)
# For an odd number of CPUs, still create the right number of controllers
cpuCluster = Cluster(extBW = 512, intBW = 512) # 1 TB/s
for i in xrange((options.num_cpus + 1) / 2):
cp_cntrl = CPCntrl()
cp_cntrl.create(options, ruby_system, system)
exec("system.cp_cntrl%d = cp_cntrl" % i)
#
# Add controllers and sequencers to the appropriate lists
#
cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1])
# Connect the CP controllers and the network
cp_cntrl.requestFromCore = MessageBuffer()
cp_cntrl.requestFromCore.master = ruby_system.network.slave
cp_cntrl.responseFromCore = MessageBuffer()
cp_cntrl.responseFromCore.master = ruby_system.network.slave
cp_cntrl.unblockFromCore = MessageBuffer()
cp_cntrl.unblockFromCore.master = ruby_system.network.slave
cp_cntrl.probeToCore = MessageBuffer()
cp_cntrl.probeToCore.slave = ruby_system.network.master
cp_cntrl.responseToCore = MessageBuffer()
cp_cntrl.responseToCore.slave = ruby_system.network.master
cp_cntrl.mandatoryQueue = MessageBuffer()
cp_cntrl.triggerQueue = MessageBuffer(ordered = True)
cpuCluster.add(cp_cntrl)
# Assuming no DMA devices
assert(len(dma_devices) == 0)
# Add cpu/gpu clusters to main cluster
mainCluster.add(cpuCluster)
ruby_system.network.number_of_virtual_networks = 10
return (cpu_sequencers, dir_cntrl_nodes, mainCluster)

View file

@@ -78,7 +78,7 @@ class SourceMeta(type):
def __init__(cls, name, bases, dict):
super(SourceMeta, cls).__init__(name, bases, dict)
cls.all = []
def get(cls, **guards):
'''Find all files that match the specified guards. If a source
file does not specify a flag, the default is False'''
@@ -367,9 +367,9 @@ def makeTheISA(source, target, env):
target_isa = env['TARGET_ISA']
def define(isa):
return isa.upper() + '_ISA'
def namespace(isa):
return isa[0].upper() + isa[1:].lower() + 'ISA'
code = code_formatter()
@@ -407,6 +407,51 @@ def makeTheISA(source, target, env):
env.Command('config/the_isa.hh', map(Value, all_isa_list),
MakeAction(makeTheISA, Transform("CFG ISA", 0)))
def makeTheGPUISA(source, target, env):
isas = [ src.get_contents() for src in source ]
target_gpu_isa = env['TARGET_GPU_ISA']
def define(isa):
return isa.upper() + '_ISA'
def namespace(isa):
return isa[0].upper() + isa[1:].lower() + 'ISA'
code = code_formatter()
code('''\
#ifndef __CONFIG_THE_GPU_ISA_HH__
#define __CONFIG_THE_GPU_ISA_HH__
''')
# create defines for the preprocessing and compile-time determination
for i,isa in enumerate(isas):
code('#define $0 $1', define(isa), i + 1)
code()
# create an enum for any run-time determination of the ISA, we
# reuse the same name as the namespaces
code('enum class GPUArch {')
for i,isa in enumerate(isas):
if i + 1 == len(isas):
code(' $0 = $1', namespace(isa), define(isa))
else:
code(' $0 = $1,', namespace(isa), define(isa))
code('};')
code('''
#define THE_GPU_ISA ${{define(target_gpu_isa)}}
#define TheGpuISA ${{namespace(target_gpu_isa)}}
#define THE_GPU_ISA_STR "${{target_gpu_isa}}"
#endif // __CONFIG_THE_GPU_ISA_HH__''')
code.write(str(target[0]))
env.Command('config/the_gpu_isa.hh', map(Value, all_gpu_isa_list),
MakeAction(makeTheGPUISA, Transform("CFG ISA", 0)))
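# Illustrative only (not emitted by this SConscript): with all_gpu_isa_list ==
# ['hsail'] and TARGET_GPU_ISA == 'hsail', the template above generates a
# config/the_gpu_isa.hh along these lines:
#
#   #ifndef __CONFIG_THE_GPU_ISA_HH__
#   #define __CONFIG_THE_GPU_ISA_HH__
#
#   #define HSAIL_ISA 1
#
#   enum class GPUArch {
#     HsailISA = HSAIL_ISA
#   };
#
#   #define THE_GPU_ISA HSAIL_ISA
#   #define TheGpuISA HsailISA
#   #define THE_GPU_ISA_STR "hsail"
#
#   #endif // __CONFIG_THE_GPU_ISA_HH__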
########################################################################
#
# Prevent any SimObjects from being added after this point, they
@@ -784,7 +829,7 @@ extern "C" {
EmbeddedSwig embed_swig_${module}(init_${module});
''')
code.write(str(target[0]))
# Build all swig modules
for swig in SwigSource.all:
env.Command([swig.cc_source.tnode, swig.py_source.tnode], swig.tnode,
@@ -959,7 +1004,7 @@ const uint8_t data_${sym}[] = {
x = array.array('B', data[i:i+step])
code(''.join('%d,' % d for d in x))
code.dedent()
code('''};
EmbeddedPython embedded_${sym}(

View file

@@ -68,6 +68,14 @@ isa_switch_hdrs = Split('''
# Set up this directory to support switching headers
make_switching_dir('arch', isa_switch_hdrs, env)
if env['BUILD_GPU']:
gpu_isa_switch_hdrs = Split('''
gpu_decoder.hh
gpu_types.hh
''')
make_gpu_switching_dir('arch', gpu_isa_switch_hdrs, env)
#################################################################
#
# Include architecture-specific files.

67 src/arch/hsail/Brig.h Normal file
View file

@@ -0,0 +1,67 @@
// University of Illinois/NCSA
// Open Source License
//
// Copyright (c) 2013, Advanced Micro Devices, Inc.
// All rights reserved.
//
// Developed by:
//
// HSA Team
//
// Advanced Micro Devices, Inc
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of
// this software and associated documentation files (the "Software"), to deal with
// the Software without restriction, including without limitation the rights to
// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
// of the Software, and to permit persons to whom the Software is furnished to do
// so, subject to the following conditions:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimers in the
// documentation and/or other materials provided with the distribution.
//
// * Neither the names of the LLVM Team, University of Illinois at
// Urbana-Champaign, nor the names of its contributors may be used to
// endorse or promote products derived from this Software without specific
// prior written permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
// SOFTWARE.
#ifndef INTERNAL_BRIG_H
#define INTERNAL_BRIG_H
#include <stdint.h>
namespace Brig {
#include "Brig_new.hpp"
// These typedefs provide some backward compatibility with earlier versions
// of Brig.h, reducing the number of code changes. The distinct names also
// increase legibility by showing the code's intent.
typedef BrigBase BrigDirective;
typedef BrigBase BrigOperand;
enum BrigMemoryFenceSegments { // for internal use only
//.mnemo={ s/^BRIG_MEMORY_FENCE_SEGMENT_//;lc }
//.mnemo_token=_EMMemoryFenceSegments
//.mnemo_context=EInstModifierInstFenceContext
BRIG_MEMORY_FENCE_SEGMENT_GLOBAL = 0,
BRIG_MEMORY_FENCE_SEGMENT_GROUP = 1,
BRIG_MEMORY_FENCE_SEGMENT_IMAGE = 2,
BRIG_MEMORY_FENCE_SEGMENT_LAST = 3 //.skip
};
}
#endif // defined(INTERNAL_BRIG_H)

1587 src/arch/hsail/Brig_new.hpp Normal file

File diff suppressed because it is too large

54 src/arch/hsail/SConscript Normal file
View file

@@ -0,0 +1,54 @@
# -*- mode:python -*-
# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Anthony Gutierrez
#
Import('*')
if not env['BUILD_GPU']:
Return()
if env['TARGET_GPU_ISA'] == 'hsail':
env.Command(['insts/gen_decl.hh', 'gpu_decoder.cc', 'insts/gen_exec.cc'],
'gen.py', '$SOURCE $TARGETS')
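# Note (illustrative): '$SOURCE $TARGETS' expands so that gen.py itself runs
# with the three generated files as its arguments, roughly
#   gen.py insts/gen_decl.hh gpu_decoder.cc insts/gen_exec.cc
# which matches the len(sys.argv) != 4 check at the top of gen.py.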
Source('generic_types.cc')
Source('gpu_decoder.cc')
Source('insts/branch.cc')
Source('insts/gen_exec.cc')
Source('insts/gpu_static_inst.cc')
Source('insts/main.cc')
Source('insts/pseudo_inst.cc')
Source('insts/mem.cc')
Source('operand.cc')

40 src/arch/hsail/SConsopts Normal file
View file

@@ -0,0 +1,40 @@
# -*- mode:python -*-
#
# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Anthony Gutierrez
#
Import('*')
all_gpu_isa_list.append('hsail')

806 src/arch/hsail/gen.py Executable file
View file

@@ -0,0 +1,806 @@
#! /usr/bin/python
#
# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Steve Reinhardt
#
import sys, re
from m5.util import code_formatter
if len(sys.argv) != 4:
print "Error: need 3 args (file names)"
sys.exit(0)
header_code = code_formatter()
decoder_code = code_formatter()
exec_code = code_formatter()
###############
#
# Generate file prologs (includes etc.)
#
###############
header_code('''
#include "arch/hsail/insts/decl.hh"
#include "base/bitfield.hh"
#include "gpu-compute/hsail_code.hh"
#include "gpu-compute/wavefront.hh"
namespace HsailISA
{
''')
header_code.indent()
decoder_code('''
#include "arch/hsail/gpu_decoder.hh"
#include "arch/hsail/insts/branch.hh"
#include "arch/hsail/insts/decl.hh"
#include "arch/hsail/insts/gen_decl.hh"
#include "arch/hsail/insts/mem.hh"
#include "arch/hsail/insts/mem_impl.hh"
#include "gpu-compute/brig_object.hh"
namespace HsailISA
{
std::vector<GPUStaticInst*> Decoder::decodedInsts;
GPUStaticInst*
Decoder::decode(MachInst machInst)
{
using namespace Brig;
const BrigInstBase *ib = machInst.brigInstBase;
const BrigObject *obj = machInst.brigObj;
switch(ib->opcode) {
''')
decoder_code.indent()
decoder_code.indent()
exec_code('''
#include "arch/hsail/insts/gen_decl.hh"
#include "base/intmath.hh"
namespace HsailISA
{
''')
exec_code.indent()
###############
#
# Define code templates for class declarations (for header file)
#
###############
# Basic header template for an instruction with no template parameters.
header_template_nodt = '''
class $class_name : public $base_class
{
public:
typedef $base_class Base;
$class_name(const Brig::BrigInstBase *ib, const BrigObject *obj)
: Base(ib, obj, "$opcode")
{
}
void execute(GPUDynInstPtr gpuDynInst);
};
'''
# Basic header template for an instruction with a single DataType
# template parameter.
header_template_1dt = '''
template<typename DataType>
class $class_name : public $base_class<DataType>
{
public:
typedef $base_class<DataType> Base;
typedef typename DataType::CType CType;
$class_name(const Brig::BrigInstBase *ib, const BrigObject *obj)
: Base(ib, obj, "$opcode")
{
}
void execute(GPUDynInstPtr gpuDynInst);
};
'''
header_template_1dt_noexec = '''
template<typename DataType>
class $class_name : public $base_class<DataType>
{
public:
typedef $base_class<DataType> Base;
typedef typename DataType::CType CType;
$class_name(const Brig::BrigInstBase *ib, const BrigObject *obj)
: Base(ib, obj, "$opcode")
{
}
};
'''
# Same as header_template_1dt, except the base class has a second
# template parameter NumSrcOperands to allow a variable number of
# source operands. Note that since this is implemented with an array,
# it only works for instructions where all sources are of the same
# type (like most arithmetics).
header_template_1dt_varsrcs = '''
template<typename DataType>
class $class_name : public $base_class<DataType, $num_srcs>
{
public:
typedef $base_class<DataType, $num_srcs> Base;
typedef typename DataType::CType CType;
$class_name(const Brig::BrigInstBase *ib, const BrigObject *obj)
: Base(ib, obj, "$opcode")
{
}
void execute(GPUDynInstPtr gpuDynInst);
};
'''
# Header template for instruction with two DataType template
# parameters, one for the dest and one for the source. This is used
# by compare and convert.
header_template_2dt = '''
template<typename DestDataType, class SrcDataType>
class $class_name : public $base_class<DestDataType, SrcDataType>
{
public:
typedef $base_class<DestDataType, SrcDataType> Base;
typedef typename DestDataType::CType DestCType;
typedef typename SrcDataType::CType SrcCType;
$class_name(const Brig::BrigInstBase *ib, const BrigObject *obj)
: Base(ib, obj, "$opcode")
{
}
void execute(GPUDynInstPtr gpuDynInst);
};
'''
header_templates = {
'ArithInst': header_template_1dt_varsrcs,
'CmovInst': header_template_1dt,
'ClassInst': header_template_1dt,
'ShiftInst': header_template_1dt,
'ExtractInsertInst': header_template_1dt,
'CmpInst': header_template_2dt,
'CvtInst': header_template_2dt,
'LdInst': '',
'StInst': '',
'SpecialInstNoSrc': header_template_nodt,
'SpecialInst1Src': header_template_nodt,
'SpecialInstNoSrcNoDest': '',
}
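# For illustration (assuming a hypothetical call such as
# gen('Add', arith_int_types, 'src0 + src1')): the ArithInst entry above
# selects header_template_1dt_varsrcs, so the generated declaration is
# roughly
#
#   template<typename DataType>
#   class Add : public ArithInst<DataType, 2>
#   {
#     public:
#       typedef ArithInst<DataType, 2> Base;
#       typedef typename DataType::CType CType;
#       Add(const Brig::BrigInstBase *ib, const BrigObject *obj)
#           : Base(ib, obj, "add")
#       {
#       }
#       void execute(GPUDynInstPtr gpuDynInst);
#   };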
###############
#
# Define code templates for exec functions
#
###############
# exec function body
exec_template_nodt_nosrc = '''
void
$class_name::execute(GPUDynInstPtr gpuDynInst)
{
Wavefront *w = gpuDynInst->wavefront();
typedef Base::DestCType DestCType;
const VectorMask &mask = w->get_pred();
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
DestCType dest_val = $expr;
this->dest.set(w, lane, dest_val);
}
}
}
'''
exec_template_nodt_1src = '''
void
$class_name::execute(GPUDynInstPtr gpuDynInst)
{
Wavefront *w = gpuDynInst->wavefront();
typedef Base::DestCType DestCType;
typedef Base::SrcCType SrcCType;
const VectorMask &mask = w->get_pred();
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
SrcCType src_val0 = this->src0.get<SrcCType>(w, lane);
DestCType dest_val = $expr;
this->dest.set(w, lane, dest_val);
}
}
}
'''
exec_template_1dt_varsrcs = '''
template<typename DataType>
void
$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
{
Wavefront *w = gpuDynInst->wavefront();
const VectorMask &mask = w->get_pred();
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
CType dest_val;
if ($dest_is_src_flag) {
dest_val = this->dest.template get<CType>(w, lane);
}
CType src_val[$num_srcs];
for (int i = 0; i < $num_srcs; ++i) {
src_val[i] = this->src[i].template get<CType>(w, lane);
}
dest_val = (CType)($expr);
this->dest.set(w, lane, dest_val);
}
}
}
'''
exec_template_1dt_3srcs = '''
template<typename DataType>
void
$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
{
Wavefront *w = gpuDynInst->wavefront();
typedef typename Base::Src0CType Src0T;
typedef typename Base::Src1CType Src1T;
typedef typename Base::Src2CType Src2T;
const VectorMask &mask = w->get_pred();
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
CType dest_val;
if ($dest_is_src_flag) {
dest_val = this->dest.template get<CType>(w, lane);
}
Src0T src_val0 = this->src0.template get<Src0T>(w, lane);
Src1T src_val1 = this->src1.template get<Src1T>(w, lane);
Src2T src_val2 = this->src2.template get<Src2T>(w, lane);
dest_val = $expr;
this->dest.set(w, lane, dest_val);
}
}
}
'''
exec_template_1dt_2src_1dest = '''
template<typename DataType>
void
$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
{
Wavefront *w = gpuDynInst->wavefront();
typedef typename Base::DestCType DestT;
typedef CType Src0T;
typedef typename Base::Src1CType Src1T;
const VectorMask &mask = w->get_pred();
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
DestT dest_val;
if ($dest_is_src_flag) {
dest_val = this->dest.template get<DestT>(w, lane);
}
Src0T src_val0 = this->src0.template get<Src0T>(w, lane);
Src1T src_val1 = this->src1.template get<Src1T>(w, lane);
dest_val = $expr;
this->dest.set(w, lane, dest_val);
}
}
}
'''
exec_template_shift = '''
template<typename DataType>
void
$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
{
Wavefront *w = gpuDynInst->wavefront();
const VectorMask &mask = w->get_pred();
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
CType dest_val;
if ($dest_is_src_flag) {
dest_val = this->dest.template get<CType>(w, lane);
}
CType src_val0 = this->src0.template get<CType>(w, lane);
uint32_t src_val1 = this->src1.template get<uint32_t>(w, lane);
dest_val = $expr;
this->dest.set(w, lane, dest_val);
}
}
}
'''
exec_template_2dt = '''
template<typename DestDataType, class SrcDataType>
void
$class_name<DestDataType, SrcDataType>::execute(GPUDynInstPtr gpuDynInst)
{
Wavefront *w = gpuDynInst->wavefront();
const VectorMask &mask = w->get_pred();
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
DestCType dest_val;
SrcCType src_val[$num_srcs];
for (int i = 0; i < $num_srcs; ++i) {
src_val[i] = this->src[i].template get<SrcCType>(w, lane);
}
dest_val = $expr;
this->dest.set(w, lane, dest_val);
}
}
}
'''
exec_templates = {
'ArithInst': exec_template_1dt_varsrcs,
'CmovInst': exec_template_1dt_3srcs,
'ExtractInsertInst': exec_template_1dt_3srcs,
'ClassInst': exec_template_1dt_2src_1dest,
'CmpInst': exec_template_2dt,
'CvtInst': exec_template_2dt,
'LdInst': '',
'StInst': '',
'SpecialInstNoSrc': exec_template_nodt_nosrc,
'SpecialInst1Src': exec_template_nodt_1src,
'SpecialInstNoSrcNoDest': '',
}
###############
#
# Define code templates for the decoder cases
#
###############
# decode template for nodt-opcode case
decode_nodt_template = '''
case BRIG_OPCODE_$brig_opcode_upper: return $constructor(ib, obj);'''
decode_case_prolog_class_inst = '''
case BRIG_OPCODE_$brig_opcode_upper:
{
//const BrigOperandBase *baseOp = obj->getOperand(ib->operands[1]);
BrigType16_t type = ((BrigInstSourceType*)ib)->sourceType;
//switch (baseOp->kind) {
// case BRIG_OPERAND_REG:
// type = ((const BrigOperandReg*)baseOp)->type;
// break;
// case BRIG_OPERAND_IMMED:
// type = ((const BrigOperandImmed*)baseOp)->type;
// break;
// default:
// fatal("CLASS unrecognized kind of operand %d\\n",
// baseOp->kind);
//}
switch (type) {'''
# common prolog for 1dt- or 2dt-opcode case: switch on data type
decode_case_prolog = '''
case BRIG_OPCODE_$brig_opcode_upper:
{
switch (ib->type) {'''
# single-level decode case entry (for 1dt opcodes)
decode_case_entry = \
' case BRIG_TYPE_$type_name: return $constructor(ib, obj);'
decode_store_prolog = \
' case BRIG_TYPE_$type_name: {'
decode_store_case_epilog = '''
}'''
decode_store_case_entry = \
' return $constructor(ib, obj);'
# common epilog for type switch
decode_case_epilog = '''
default: fatal("$brig_opcode_upper: unrecognized type %d\\n",
ib->type);
}
}
break;'''
# Additional templates for nested decode on a second type field (for
# compare and convert). These are used in place of the
# decode_case_entry template to create a second-level switch on the
# second type field inside each case of the first-level type switch.
# Because the name and location of the second type can vary, the Brig
# instruction type must be provided in $brig_type, and the name of the
# second type field must be provided in $type2_field.
decode_case2_prolog = '''
case BRIG_TYPE_$type_name:
switch (((Brig$brig_type*)ib)->$type2_field) {'''
decode_case2_entry = \
' case BRIG_TYPE_$type2_name: return $constructor(ib, obj);'
decode_case2_epilog = '''
default: fatal("$brig_opcode_upper: unrecognized $type2_field %d\\n",
((Brig$brig_type*)ib)->$type2_field);
}
break;'''
# Figure out how many source operands an expr needs by looking for the
# highest-numbered srcN value referenced. Since sources are numbered
# starting at 0, the return value is N+1.
def num_src_operands(expr):
if expr.find('src2') != -1:
return 3
elif expr.find('src1') != -1:
return 2
elif expr.find('src0') != -1:
return 1
else:
return 0
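# Illustration only (these calls are not executed by the generator):
#   num_src_operands('src0 + src1')                -> 2
#   num_src_operands('std::min(src0, src1)')       -> 2
#   num_src_operands('src0 >= 0.0 ? src0 : -src0') -> 1
#   num_src_operands('popCount(src2)')             -> 3
#   num_src_operands('0')                          -> 0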
###############
#
# Define final code generation methods
#
# The gen_nodt, gen_1dt, and gen_2dt methods are the interface for
# generating actual instructions.
#
###############
# Generate class declaration, exec function, and decode switch case
# for a brig_opcode with a single-level type switch. The 'types'
# parameter is a list or tuple of types for which the instruction
# should be instantiated.
def gen(brig_opcode, types=None, expr=None, base_class='ArithInst',
type2_info=None, constructor_prefix='new ', is_store=False):
brig_opcode_upper = brig_opcode.upper()
class_name = brig_opcode
opcode = class_name.lower()
if base_class == 'ArithInst':
# note that expr must be provided with ArithInst so we can
# derive num_srcs for the template
assert expr
if expr:
# Derive several bits of info from expr. If expr is not used,
# this info will be irrelevant.
num_srcs = num_src_operands(expr)
# if the RHS expression includes 'dest', then we're doing an RMW
# on the reg and we need to treat it like a source
dest_is_src = expr.find('dest') != -1
dest_is_src_flag = str(dest_is_src).lower() # for C++
if base_class in ['ShiftInst']:
expr = re.sub(r'\bsrc(\d)\b', r'src_val\1', expr)
elif base_class in ['ArithInst', 'CmpInst', 'CvtInst']:
expr = re.sub(r'\bsrc(\d)\b', r'src_val[\1]', expr)
else:
expr = re.sub(r'\bsrc(\d)\b', r'src_val\1', expr)
expr = re.sub(r'\bdest\b', r'dest_val', expr)
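# For example, with base_class 'ArithInst' the expression 'src0 * src1 + src2'
# is rewritten to 'src_val[0] * src_val[1] + src_val[2]', and any 'dest' on the
# right-hand side becomes 'dest_val'.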
# Strip template arguments off of base class before looking up
# appropriate templates
base_class_base = re.sub(r'<.*>$', '', base_class)
header_code(header_templates[base_class_base])
if base_class.startswith('SpecialInst'):
exec_code(exec_templates[base_class_base])
elif base_class.startswith('ShiftInst'):
header_code(exec_template_shift)
else:
header_code(exec_templates[base_class_base])
if not types or isinstance(types, str):
# Just a single type
constructor = constructor_prefix + class_name
decoder_code(decode_nodt_template)
else:
# multiple types, need at least one level of decode
if brig_opcode == 'Class':
decoder_code(decode_case_prolog_class_inst)
else:
decoder_code(decode_case_prolog)
if not type2_info:
if not is_store:
# single list of types: do a basic one-level decode
for type_name in types:
full_class_name = '%s<%s>' % (class_name, type_name.upper())
constructor = constructor_prefix + full_class_name
decoder_code(decode_case_entry)
else:
# stores: same per-type decode, but the class also takes a source-register
# type derived from the memory type's letter and size
for type_name in types:
decoder_code(decode_store_prolog)
type_size = int(re.findall(r'[0-9]+', type_name)[0])
src_size = 32
type_type = type_name[0]
full_class_name = '%s<%s,%s>' % (class_name, \
type_name.upper(), \
'%s%d' % \
(type_type.upper(), \
type_size))
constructor = constructor_prefix + full_class_name
decoder_code(decode_store_case_entry)
decoder_code(decode_store_case_epilog)
else:
# need secondary type switch (convert, compare)
# unpack extra info on second switch
(type2_field, types2) = type2_info
brig_type = 'Inst%s' % brig_opcode
for type_name in types:
decoder_code(decode_case2_prolog)
fmt = '%s<%s,%%s>' % (class_name, type_name.upper())
for type2_name in types2:
full_class_name = fmt % type2_name.upper()
constructor = constructor_prefix + full_class_name
decoder_code(decode_case2_entry)
decoder_code(decode_case2_epilog)
decoder_code(decode_case_epilog)
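# A sketch of the store path above: St is generated below with is_store=True
# and constructor_prefix='decode', so inside the emitted switch (ib->type) the
# U32 case comes out roughly as:
#
#   case BRIG_TYPE_U32: {
#       return decodeSt<U32,U32>(ib, obj);
#   }
#
# where the second template argument is the source-register type rebuilt from
# the memory type's letter and size.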
###############
#
# Generate instructions
#
###############
# handy abbreviations for common sets of types
# arithmetic ops are typically defined only on 32- and 64-bit sizes
arith_int_types = ('S32', 'U32', 'S64', 'U64')
arith_float_types = ('F32', 'F64')
arith_types = arith_int_types + arith_float_types
bit_types = ('B1', 'B32', 'B64')
all_int_types = ('S8', 'U8', 'S16', 'U16') + arith_int_types
# I think you might be able to do 'f16' memory ops too, but we'll
# ignore them for now.
mem_types = all_int_types + arith_float_types
mem_atom_types = all_int_types + ('B32', 'B64')
##### Arithmetic & logical operations
gen('Add', arith_types, 'src0 + src1')
gen('Sub', arith_types, 'src0 - src1')
gen('Mul', arith_types, 'src0 * src1')
gen('Div', arith_types, 'src0 / src1')
gen('Min', arith_types, 'std::min(src0, src1)')
gen('Max', arith_types, 'std::max(src0, src1)')
gen('Gcnmin', arith_types, 'std::min(src0, src1)')
gen('CopySign', arith_float_types,
'src1 < 0 ? -std::abs(src0) : std::abs(src0)')
gen('Sqrt', arith_float_types, 'sqrt(src0)')
gen('Floor', arith_float_types, 'floor(src0)')
# "fast" sqrt... same as slow for us
gen('Nsqrt', arith_float_types, 'sqrt(src0)')
gen('Nrsqrt', arith_float_types, '1.0/sqrt(src0)')
gen('Nrcp', arith_float_types, '1.0/src0')
gen('Fract', arith_float_types,
'(src0 >= 0.0)?(src0-floor(src0)):(floor(src0)-src0)')
gen('Ncos', arith_float_types, 'cos(src0)')
gen('Nsin', arith_float_types, 'sin(src0)')
gen('And', bit_types, 'src0 & src1')
gen('Or', bit_types, 'src0 | src1')
gen('Xor', bit_types, 'src0 ^ src1')
gen('Bitselect', bit_types, '(src1 & src0) | (src2 & ~src0)')
gen('Firstbit', bit_types, 'firstbit(src0)')
gen('Popcount', ('B32', 'B64'), '__builtin_popcount(src0)')
gen('Shl', arith_int_types, 'src0 << (unsigned)src1', 'ShiftInst')
gen('Shr', arith_int_types, 'src0 >> (unsigned)src1', 'ShiftInst')
# gen('Mul_hi', types=('s32','u32', '??'))
# gen('Mul24', types=('s32','u32', '??'))
gen('Rem', arith_int_types, 'src0 - ((src0 / src1) * src1)')
gen('Abs', arith_types, 'std::abs(src0)')
gen('Neg', arith_types, '-src0')
gen('Mov', bit_types, 'src0')
gen('Not', bit_types, 'heynot(src0)')
# mad and fma differ only in rounding behavior, which we don't emulate
# also there's an integer form of mad, but not of fma
gen('Mad', arith_types, 'src0 * src1 + src2')
gen('Fma', arith_float_types, 'src0 * src1 + src2')
# native floating-point operations
gen('Nfma', arith_float_types, 'src0 * src1 + src2')
gen('Cmov', bit_types, 'src0 ? src1 : src2', 'CmovInst')
gen('BitAlign', bit_types, '(src0 << src2)|(src1 >> (32 - src2))')
gen('ByteAlign', bit_types, '(src0 << 8 * src2)|(src1 >> (32 - 8 * src2))')
# see base/bitfield.hh
gen('BitExtract', arith_int_types, 'bits(src0, src1, src1 + src2 - 1)',
'ExtractInsertInst')
gen('BitInsert', arith_int_types, 'insertBits(dest, src1, src2, src0)',
'ExtractInsertInst')
##### Compare
gen('Cmp', ('B1', 'S32', 'U32', 'F32'), 'compare(src0, src1, this->cmpOp)',
'CmpInst', ('sourceType', arith_types + bit_types))
gen('Class', arith_float_types, 'fpclassify(src0,src1)','ClassInst')
##### Conversion
# Conversion operations are only defined on B1, not B32 or B64
cvt_types = ('B1',) + mem_types
gen('Cvt', cvt_types, 'src0', 'CvtInst', ('sourceType', cvt_types))
##### Load & Store
gen('Lda', mem_types, base_class = 'LdInst', constructor_prefix='decode')
gen('Ld', mem_types, base_class = 'LdInst', constructor_prefix='decode')
gen('St', mem_types, base_class = 'StInst', constructor_prefix='decode',
is_store=True)
gen('Atomic', mem_atom_types, base_class='StInst', constructor_prefix='decode')
gen('AtomicNoRet', mem_atom_types, base_class='StInst',
constructor_prefix='decode')
gen('Cbr', base_class = 'LdInst', constructor_prefix='decode')
gen('Br', base_class = 'LdInst', constructor_prefix='decode')
##### Special operations
def gen_special(brig_opcode, expr, dest_type='U32'):
num_srcs = num_src_operands(expr)
if num_srcs == 0:
base_class = 'SpecialInstNoSrc<%s>' % dest_type
elif num_srcs == 1:
base_class = 'SpecialInst1Src<%s>' % dest_type
else:
assert False
gen(brig_opcode, None, expr, base_class)
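# For example, gen_special('WorkItemId', 'w->workitemid[src0][lane]') sees one
# source operand and so calls gen('WorkItemId', None, expr,
# 'SpecialInst1Src<U32>'), while gen_special('LaneId', 'lane') sees none and
# uses 'SpecialInstNoSrc<U32>'.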
gen_special('WorkItemId', 'w->workitemid[src0][lane]')
gen_special('WorkItemAbsId',
'w->workitemid[src0][lane] + (w->workgroupid[src0] * w->workgroupsz[src0])')
gen_special('WorkGroupId', 'w->workgroupid[src0]')
gen_special('WorkGroupSize', 'w->workgroupsz[src0]')
gen_special('CurrentWorkGroupSize', 'w->workgroupsz[src0]')
gen_special('GridSize', 'w->gridsz[src0]')
gen_special('GridGroups',
'divCeil(w->gridsz[src0],w->workgroupsz[src0])')
gen_special('LaneId', 'lane')
gen_special('WaveId', 'w->dynwaveid')
gen_special('Clock', 'w->computeUnit->shader->tick_cnt', 'U64')
# gen_special('CU'', ')
gen('Ret', base_class='SpecialInstNoSrcNoDest')
gen('Barrier', base_class='SpecialInstNoSrcNoDest')
gen('MemFence', base_class='SpecialInstNoSrcNoDest')
# Map magic instructions to the BrigSyscall opcode
# Magic instructions are defined in magic.hh
#
# In the future, real HSA kernel system calls can be implemented and coexist
# with magic instructions.
gen('Call', base_class='SpecialInstNoSrcNoDest')
###############
#
# Generate file epilogs
#
###############
header_code.dedent()
header_code('''
} // namespace HsailISA
''')
# close off main decode switch
decoder_code.dedent()
decoder_code.dedent()
decoder_code('''
default: fatal("unrecognized Brig opcode %d\\n", ib->opcode);
} // end switch(ib->opcode)
} // end decode()
} // namespace HsailISA
''')
exec_code.dedent()
exec_code('''
} // namespace HsailISA
''')
###############
#
# Output accumulated code to files
#
###############
header_code.write(sys.argv[1])
decoder_code.write(sys.argv[2])
exec_code.write(sys.argv[3])

View file

@ -0,0 +1,47 @@
#include "arch/hsail/generic_types.hh"
#include "base/misc.hh"
using namespace Brig;
namespace HsailISA
{
Enums::GenericMemoryOrder
getGenericMemoryOrder(BrigMemoryOrder brig_memory_order)
{
switch(brig_memory_order) {
case BRIG_MEMORY_ORDER_NONE:
return Enums::MEMORY_ORDER_NONE;
case BRIG_MEMORY_ORDER_RELAXED:
return Enums::MEMORY_ORDER_RELAXED;
case BRIG_MEMORY_ORDER_SC_ACQUIRE:
return Enums::MEMORY_ORDER_SC_ACQUIRE;
case BRIG_MEMORY_ORDER_SC_RELEASE:
return Enums::MEMORY_ORDER_SC_RELEASE;
case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
return Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE;
default:
fatal("HsailISA::MemInst::getGenericMemoryOrder -> ",
"bad BrigMemoryOrder\n");
}
}
Enums::GenericMemoryScope
getGenericMemoryScope(BrigMemoryScope brig_memory_scope)
{
switch(brig_memory_scope) {
case BRIG_MEMORY_SCOPE_NONE:
return Enums::MEMORY_SCOPE_NONE;
case BRIG_MEMORY_SCOPE_WORKITEM:
return Enums::MEMORY_SCOPE_WORKITEM;
case BRIG_MEMORY_SCOPE_WORKGROUP:
return Enums::MEMORY_SCOPE_WORKGROUP;
case BRIG_MEMORY_SCOPE_AGENT:
return Enums::MEMORY_SCOPE_DEVICE;
case BRIG_MEMORY_SCOPE_SYSTEM:
return Enums::MEMORY_SCOPE_SYSTEM;
default:
fatal("HsailISA::MemInst::getGenericMemoryScope -> ",
"bad BrigMemoryScope\n");
}
}
} // namespace HsailISA

View file

@ -0,0 +1,16 @@
#ifndef __ARCH_HSAIL_GENERIC_TYPES_HH__
#define __ARCH_HSAIL_GENERIC_TYPES_HH__
#include "arch/hsail/Brig.h"
#include "enums/GenericMemoryOrder.hh"
#include "enums/GenericMemoryScope.hh"
namespace HsailISA
{
Enums::GenericMemoryOrder
getGenericMemoryOrder(Brig::BrigMemoryOrder brig_memory_order);
Enums::GenericMemoryScope
getGenericMemoryScope(Brig::BrigMemoryScope brig_memory_scope);
} // namespace HsailISA
#endif // __ARCH_HSAIL_GENERIC_TYPES_HH__

View file

@ -0,0 +1,77 @@
/*
* Copyright (c) 2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Anthony Gutierrez
*/
#ifndef __ARCH_HSAIL_GPU_DECODER_HH__
#define __ARCH_HSAIL_GPU_DECODER_HH__
#include <vector>
#include "arch/hsail/gpu_types.hh"
class BrigObject;
class GPUStaticInst;
namespace Brig
{
class BrigInstBase;
}
namespace HsailISA
{
class Decoder
{
public:
GPUStaticInst* decode(MachInst machInst);
GPUStaticInst*
decode(RawMachInst inst)
{
return inst < decodedInsts.size() ? decodedInsts.at(inst) : nullptr;
}
RawMachInst
saveInst(GPUStaticInst *decodedInst)
{
decodedInsts.push_back(decodedInst);
return decodedInsts.size() - 1;
}
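// Illustrative round trip (a sketch; 'decoder' and 'si' are hypothetical
// names for a Decoder instance and a previously decoded GPUStaticInst*):
//   RawMachInst raw = decoder.saveInst(si);
//   GPUStaticInst *same = decoder.decode(raw); // yields 'si' again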
private:
static std::vector<GPUStaticInst*> decodedInsts;
};
} // namespace HsailISA
#endif // __ARCH_HSAIL_GPU_DECODER_HH__

View file

@ -0,0 +1,69 @@
/*
* Copyright (c) 2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Anthony Gutierrez
*/
#ifndef __ARCH_HSAIL_GPU_TYPES_HH__
#define __ARCH_HSAIL_GPU_TYPES_HH__
#include <cstdint>
namespace Brig
{
class BrigInstBase;
}
class BrigObject;
namespace HsailISA
{
// A raw machine instruction represents the raw bits that
// our model uses to represent an actual instruction. In
// the case of HSAIL this is just an index into a list of
// instruction objects.
typedef uint64_t RawMachInst;
// The MachInst is a representation of an instruction
// that has more information than just the machine code.
// For HSAIL the actual machine code is a BrigInstBase
// and the BrigObject contains more pertinent
// information related to operands, etc.
struct MachInst
{
const Brig::BrigInstBase *brigInstBase;
const BrigObject *brigObj;
};
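// Illustrative use (a sketch; 'brig_inst', 'brig_obj' and 'decoder' are
// hypothetical names): a MachInst simply pairs the BRIG instruction with
// its enclosing object, and decoding it yields the GPUStaticInst the
// model actually executes:
//   MachInst mi = { brig_inst, brig_obj };
//   GPUStaticInst *si = decoder.decode(mi);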
}
#endif // __ARCH_HSAIL_GPU_TYPES_HH__

View file

@ -0,0 +1,86 @@
/*
* Copyright (c) 2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Anthony Gutierrez
*/
#include "arch/hsail/insts/branch.hh"
#include "gpu-compute/hsail_code.hh"
namespace HsailISA
{
GPUStaticInst*
decodeBrn(const Brig::BrigInstBase *ib, const BrigObject *obj)
{
// Detect direct vs indirect branch by seeing whether we have a
// register operand.
unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
const Brig::BrigOperand *reg = obj->getOperand(op_offs);
if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
return new BrnIndirectInst(ib, obj);
} else {
return new BrnDirectInst(ib, obj);
}
}
GPUStaticInst*
decodeCbr(const Brig::BrigInstBase *ib, const BrigObject *obj)
{
// Detect direct vs indirect branch by seeing whether we have a
// second register operand (after the condition).
unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
const Brig::BrigOperand *reg = obj->getOperand(op_offs);
if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
return new CbrIndirectInst(ib, obj);
} else {
return new CbrDirectInst(ib, obj);
}
}
GPUStaticInst*
decodeBr(const Brig::BrigInstBase *ib, const BrigObject *obj)
{
// Detect direct vs indirect branch by seeing whether we have a
// second register operand (after the condition).
unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
const Brig::BrigOperand *reg = obj->getOperand(op_offs);
if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
return new BrIndirectInst(ib, obj);
} else {
return new BrDirectInst(ib, obj);
}
}
} // namespace HsailISA

View file

@ -0,0 +1,442 @@
/*
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Steve Reinhardt
*/
#ifndef __ARCH_HSAIL_INSTS_BRANCH_HH__
#define __ARCH_HSAIL_INSTS_BRANCH_HH__
#include "arch/hsail/insts/gpu_static_inst.hh"
#include "arch/hsail/operand.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/wavefront.hh"
namespace HsailISA
{
// The main difference between a direct branch and an indirect branch
// is whether the target is a register or a label, so we can share a
// lot of code if we template the base implementation on that type.
template<typename TargetType>
class BrnInstBase : public HsailGPUStaticInst
{
public:
void generateDisassembly();
Brig::BrigWidth8_t width;
TargetType target;
BrnInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj)
: HsailGPUStaticInst(obj, "brn")
{
o_type = Enums::OT_BRANCH;
width = ((Brig::BrigInstBr*)ib)->width;
unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
target.init(op_offs, obj);
}
uint32_t getTargetPc() override { return target.getTarget(0, 0); }
bool unconditionalJumpInstruction() override { return true; }
bool isVectorRegister(int operandIndex) {
assert(operandIndex >= 0 && operandIndex < getNumOperands());
return target.isVectorRegister();
}
bool isCondRegister(int operandIndex) {
assert(operandIndex >= 0 && operandIndex < getNumOperands());
return target.isCondRegister();
}
bool isScalarRegister(int operandIndex) {
assert(operandIndex >= 0 && operandIndex < getNumOperands());
return target.isScalarRegister();
}
bool isSrcOperand(int operandIndex) {
assert(operandIndex >= 0 && operandIndex < getNumOperands());
return true;
}
bool isDstOperand(int operandIndex) {
return false;
}
int getOperandSize(int operandIndex) {
assert(operandIndex >= 0 && operandIndex < getNumOperands());
return target.opSize();
}
int getRegisterIndex(int operandIndex) {
assert(operandIndex >= 0 && operandIndex < getNumOperands());
return target.regIndex();
}
int getNumOperands() {
return 1;
}
void execute(GPUDynInstPtr gpuDynInst);
};
template<typename TargetType>
void
BrnInstBase<TargetType>::generateDisassembly()
{
std::string widthClause;
if (width != 1) {
widthClause = csprintf("_width(%d)", width);
}
disassembly = csprintf("%s%s %s", opcode, widthClause,
target.disassemble());
}
template<typename TargetType>
void
BrnInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst)
{
Wavefront *w = gpuDynInst->wavefront();
if (getTargetPc() == w->rpc()) {
w->popFromReconvergenceStack();
} else {
// Rpc and execution mask remain the same
w->pc(getTargetPc());
}
w->discardFetch();
}
class BrnDirectInst : public BrnInstBase<LabelOperand>
{
public:
BrnDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
: BrnInstBase<LabelOperand>(ib, obj)
{
}
int numSrcRegOperands() { return 0; }
int numDstRegOperands() { return 0; }
};
class BrnIndirectInst : public BrnInstBase<SRegOperand>
{
public:
BrnIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
: BrnInstBase<SRegOperand>(ib, obj)
{
}
int numSrcRegOperands() { return target.isVectorRegister(); }
int numDstRegOperands() { return 0; }
};
GPUStaticInst* decodeBrn(const Brig::BrigInstBase *ib,
const BrigObject *obj);
template<typename TargetType>
class CbrInstBase : public HsailGPUStaticInst
{
public:
void generateDisassembly();
Brig::BrigWidth8_t width;
CRegOperand cond;
TargetType target;
CbrInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj)
: HsailGPUStaticInst(obj, "cbr")
{
o_type = Enums::OT_BRANCH;
width = ((Brig::BrigInstBr *)ib)->width;
unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
cond.init(op_offs, obj);
op_offs = obj->getOperandPtr(ib->operands, 1);
target.init(op_offs, obj);
}
uint32_t getTargetPc() override { return target.getTarget(0, 0); }
void execute(GPUDynInstPtr gpuDynInst);
// Assumption: Target is operand 0, Condition Register is operand 1
bool isVectorRegister(int operandIndex) {
assert(operandIndex >= 0 && operandIndex < getNumOperands());
if (!operandIndex)
return target.isVectorRegister();
else
return false;
}
bool isCondRegister(int operandIndex) {
assert(operandIndex >= 0 && operandIndex < getNumOperands());
if (!operandIndex)
return target.isCondRegister();
else
return true;
}
bool isScalarRegister(int operandIndex) {
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
if (!operandIndex)
return target.isScalarRegister();
else
return false;
}
bool isSrcOperand(int operandIndex) {
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
if (operandIndex == 0)
return true;
return false;
}
// both Condition Register and Target are source operands
bool isDstOperand(int operandIndex) {
return false;
}
int getOperandSize(int operandIndex) {
assert(operandIndex >= 0 && operandIndex < getNumOperands());
if (!operandIndex)
return target.opSize();
else
return 1;
}
int getRegisterIndex(int operandIndex) {
assert(operandIndex >= 0 && operandIndex < getNumOperands());
if (!operandIndex)
return target.regIndex();
else
return -1;
}
// Operands = Target, Condition Register
int getNumOperands() {
return 2;
}
};
template<typename TargetType>
void
CbrInstBase<TargetType>::generateDisassembly()
{
std::string widthClause;
if (width != 1) {
widthClause = csprintf("_width(%d)", width);
}
disassembly = csprintf("%s%s %s,%s", opcode, widthClause,
cond.disassemble(), target.disassemble());
}
template<typename TargetType>
void
CbrInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst)
{
Wavefront *w = gpuDynInst->wavefront();
const uint32_t curr_pc = w->pc();
const uint32_t curr_rpc = w->rpc();
const VectorMask curr_mask = w->execMask();
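// Worked example (illustrative, showing only 4 lanes): if curr_mask is
// 1111 and the condition holds only in lanes 0 and 1, then true_mask
// becomes 0011 and false_mask 1100. The fall-through entry is pushed
// before the taken entry, so the taken side executes first (it sits on
// top of the reconvergence stack), then the fall-through side, and the
// wave finally reconverges at rpc under the original mask.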
/**
* TODO: can we move this pop outside the instruction, and
* into the wavefront?
*/
w->popFromReconvergenceStack();
// immediate post-dominator instruction
const uint32_t rpc = static_cast<uint32_t>(ipdInstNum());
if (curr_rpc != rpc) {
w->pushToReconvergenceStack(rpc, curr_rpc, curr_mask);
}
// taken branch
const uint32_t true_pc = getTargetPc();
VectorMask true_mask;
for (unsigned int lane = 0; lane < VSZ; ++lane) {
true_mask[lane] = cond.get<bool>(w, lane) & curr_mask[lane];
}
// not taken branch
const uint32_t false_pc = curr_pc + 1;
assert(true_pc != false_pc);
if (false_pc != rpc && true_mask.count() < curr_mask.count()) {
VectorMask false_mask = curr_mask & ~true_mask;
w->pushToReconvergenceStack(false_pc, rpc, false_mask);
}
if (true_pc != rpc && true_mask.count()) {
w->pushToReconvergenceStack(true_pc, rpc, true_mask);
}
assert(w->pc() != curr_pc);
w->discardFetch();
}
class CbrDirectInst : public CbrInstBase<LabelOperand>
{
public:
CbrDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
: CbrInstBase<LabelOperand>(ib, obj)
{
}
// the source operand of a conditional branch is a Condition
// Register which is not stored in the VRF
// so we do not count it as a source-register operand
// even though, formally, it is one.
int numSrcRegOperands() { return 0; }
int numDstRegOperands() { return 0; }
};
class CbrIndirectInst : public CbrInstBase<SRegOperand>
{
public:
CbrIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
: CbrInstBase<SRegOperand>(ib, obj)
{
}
// one source operand of the conditional indirect branch is a Condition
// register which is not stored in the VRF so we do not count it
// as a source-register operand even though, formally, it is one.
int numSrcRegOperands() { return target.isVectorRegister(); }
int numDstRegOperands() { return 0; }
};
GPUStaticInst* decodeCbr(const Brig::BrigInstBase *ib,
const BrigObject *obj);
template<typename TargetType>
class BrInstBase : public HsailGPUStaticInst
{
public:
void generateDisassembly();
ImmOperand<uint32_t> width;
TargetType target;
BrInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj)
: HsailGPUStaticInst(obj, "br")
{
o_type = Enums::OT_BRANCH;
width.init(((Brig::BrigInstBr *)ib)->width, obj);
unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
target.init(op_offs, obj);
}
uint32_t getTargetPc() override { return target.getTarget(0, 0); }
bool unconditionalJumpInstruction() override { return true; }
void execute(GPUDynInstPtr gpuDynInst);
bool isVectorRegister(int operandIndex) {
assert(operandIndex >= 0 && operandIndex < getNumOperands());
return target.isVectorRegister();
}
bool isCondRegister(int operandIndex) {
assert(operandIndex >= 0 && operandIndex < getNumOperands());
return target.isCondRegister();
}
bool isScalarRegister(int operandIndex) {
assert(operandIndex >= 0 && operandIndex < getNumOperands());
return target.isScalarRegister();
}
bool isSrcOperand(int operandIndex) {
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
return true;
}
bool isDstOperand(int operandIndex) { return false; }
int getOperandSize(int operandIndex) {
assert(operandIndex >= 0 && operandIndex < getNumOperands());
return target.opSize();
}
int getRegisterIndex(int operandIndex) {
assert(operandIndex >= 0 && operandIndex < getNumOperands());
return target.regIndex();
}
int getNumOperands() { return 1; }
};
template<typename TargetType>
void
BrInstBase<TargetType>::generateDisassembly()
{
std::string widthClause;
if (width.bits != 1) {
widthClause = csprintf("_width(%d)", width.bits);
}
disassembly = csprintf("%s%s %s", opcode, widthClause,
target.disassemble());
}
template<typename TargetType>
void
BrInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst)
{
Wavefront *w = gpuDynInst->wavefront();
if (getTargetPc() == w->rpc()) {
w->popFromReconvergenceStack();
} else {
// Rpc and execution mask remain the same
w->pc(getTargetPc());
}
w->discardFetch();
}
class BrDirectInst : public BrInstBase<LabelOperand>
{
public:
BrDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
: BrInstBase<LabelOperand>(ib, obj)
{
}
int numSrcRegOperands() { return 0; }
int numDstRegOperands() { return 0; }
};
class BrIndirectInst : public BrInstBase<SRegOperand>
{
public:
BrIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
: BrInstBase<SRegOperand>(ib, obj)
{
}
int numSrcRegOperands() { return target.isVectorRegister(); }
int numDstRegOperands() { return 0; }
};
GPUStaticInst* decodeBr(const Brig::BrigInstBase *ib,
const BrigObject *obj);
} // namespace HsailISA
#endif // __ARCH_HSAIL_INSTS_BRANCH_HH__

1106 src/arch/hsail/insts/decl.hh Normal file

File diff suppressed because it is too large

View file

@ -0,0 +1,64 @@
/*
* Copyright (c) 2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Anthony Gutierrez
*/
#include "arch/hsail/insts/gpu_static_inst.hh"
#include "gpu-compute/brig_object.hh"
namespace HsailISA
{
HsailGPUStaticInst::HsailGPUStaticInst(const BrigObject *obj,
const std::string &opcode)
: GPUStaticInst(opcode), hsailCode(obj->currentCode)
{
}
void
HsailGPUStaticInst::generateDisassembly()
{
disassembly = opcode;
}
const std::string&
HsailGPUStaticInst::disassemble()
{
if (disassembly.empty()) {
generateDisassembly();
assert(!disassembly.empty());
}
return disassembly;
}
} // namespace HsailISA

View file

@ -0,0 +1,65 @@
/*
* Copyright (c) 2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Anthony Gutierrez
*/
#ifndef __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__
#define __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__
/*
* @file gpu_static_inst.hh
*
* Defines the base class representing HSAIL GPU static instructions.
*/
#include "gpu-compute/gpu_static_inst.hh"
class BrigObject;
class HsailCode;
namespace HsailISA
{
class HsailGPUStaticInst : public GPUStaticInst
{
public:
HsailGPUStaticInst(const BrigObject *obj, const std::string &opcode);
void generateDisassembly();
const std::string &disassemble();
uint32_t instSize() { return 4; }
protected:
HsailCode *hsailCode;
};
} // namespace HsailISA
#endif // __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__

View file

@ -0,0 +1,208 @@
/*
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Steve Reinhardt
*/
#include "arch/hsail/insts/decl.hh"
#include "debug/GPUExec.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/simple_pool_manager.hh"
namespace HsailISA
{
template<> const char *B1::label = "b1";
template<> const char *B8::label = "b8";
template<> const char *B16::label = "b16";
template<> const char *B32::label = "b32";
template<> const char *B64::label = "b64";
template<> const char *S8::label = "s8";
template<> const char *S16::label = "s16";
template<> const char *S32::label = "s32";
template<> const char *S64::label = "s64";
template<> const char *U8::label = "u8";
template<> const char *U16::label = "u16";
template<> const char *U32::label = "u32";
template<> const char *U64::label = "u64";
template<> const char *F32::label = "f32";
template<> const char *F64::label = "f64";
const char*
cmpOpToString(Brig::BrigCompareOperation cmpOp)
{
using namespace Brig;
switch (cmpOp) {
case BRIG_COMPARE_EQ:
return "eq";
case BRIG_COMPARE_NE:
return "ne";
case BRIG_COMPARE_LT:
return "lt";
case BRIG_COMPARE_LE:
return "le";
case BRIG_COMPARE_GT:
return "gt";
case BRIG_COMPARE_GE:
return "ge";
case BRIG_COMPARE_EQU:
return "equ";
case BRIG_COMPARE_NEU:
return "neu";
case BRIG_COMPARE_LTU:
return "ltu";
case BRIG_COMPARE_LEU:
return "leu";
case BRIG_COMPARE_GTU:
return "gtu";
case BRIG_COMPARE_GEU:
return "geu";
case BRIG_COMPARE_NUM:
return "num";
case BRIG_COMPARE_NAN:
return "nan";
case BRIG_COMPARE_SEQ:
return "seq";
case BRIG_COMPARE_SNE:
return "sne";
case BRIG_COMPARE_SLT:
return "slt";
case BRIG_COMPARE_SLE:
return "sle";
case BRIG_COMPARE_SGT:
return "sgt";
case BRIG_COMPARE_SGE:
return "sge";
case BRIG_COMPARE_SGEU:
return "sgeu";
case BRIG_COMPARE_SEQU:
return "sequ";
case BRIG_COMPARE_SNEU:
return "sneu";
case BRIG_COMPARE_SLTU:
return "sltu";
case BRIG_COMPARE_SLEU:
return "sleu";
case BRIG_COMPARE_SNUM:
return "snum";
case BRIG_COMPARE_SNAN:
return "snan";
case BRIG_COMPARE_SGTU:
return "sgtu";
default:
return "unknown";
}
}
void
Ret::execute(GPUDynInstPtr gpuDynInst)
{
Wavefront *w = gpuDynInst->wavefront();
const VectorMask &mask = w->get_pred();
// mask off completed work-items
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
w->init_mask[lane] = 0;
}
}
// delete extra instructions fetched for completed work-items
w->instructionBuffer.erase(w->instructionBuffer.begin() + 1,
w->instructionBuffer.end());
if (w->pendingFetch) {
w->dropFetch = true;
}
// if all work-items have completed, then wave-front is done
if (w->init_mask.none()) {
w->status = Wavefront::S_STOPPED;
int32_t refCount = w->computeUnit->getLds().
decreaseRefCounter(w->dispatchid, w->wg_id);
DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n",
w->computeUnit->cu_id, w->wg_id, refCount);
// free the vector registers of the completed wavefront
w->computeUnit->vectorRegsReserved[w->simdId] -=
w->reservedVectorRegs;
assert(w->computeUnit->vectorRegsReserved[w->simdId] >= 0);
uint32_t endIndex = (w->startVgprIndex +
w->reservedVectorRegs - 1) %
w->computeUnit->vrf[w->simdId]->numRegs();
w->computeUnit->vrf[w->simdId]->manager->
freeRegion(w->startVgprIndex, endIndex);
w->reservedVectorRegs = 0;
w->startVgprIndex = 0;
w->computeUnit->completedWfs++;
DPRINTF(GPUExec, "Doing return for CU%d: WF[%d][%d][%d]\n",
w->computeUnit->cu_id, w->simdId, w->wfSlotId, w->wfDynId);
if (!refCount) {
// Notify Memory System of Kernel Completion
// Kernel End = isKernel + isRelease
w->status = Wavefront::S_RETURNING;
GPUDynInstPtr local_mempacket = gpuDynInst;
local_mempacket->memoryOrder = Enums::MEMORY_ORDER_SC_RELEASE;
local_mempacket->scope = Enums::MEMORY_SCOPE_SYSTEM;
local_mempacket->useContinuation = false;
local_mempacket->simdId = w->simdId;
local_mempacket->wfSlotId = w->wfSlotId;
local_mempacket->wfDynId = w->wfDynId;
w->computeUnit->injectGlobalMemFence(local_mempacket, true);
} else {
w->computeUnit->shader->dispatcher->scheduleDispatch();
}
}
}
void
Barrier::execute(GPUDynInstPtr gpuDynInst)
{
Wavefront *w = gpuDynInst->wavefront();
assert(w->barrier_cnt == w->old_barrier_cnt);
w->barrier_cnt = w->old_barrier_cnt + 1;
w->stalledAtBarrier = true;
}
} // namespace HsailISA

139 src/arch/hsail/insts/mem.cc Normal file
View file

@ -0,0 +1,139 @@
/*
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Steve Reinhardt
*/
#include "arch/hsail/insts/mem.hh"
#include "arch/hsail/Brig.h"
#include "enums/OpType.hh"
using namespace Brig;
namespace HsailISA
{
const char* atomicOpToString(BrigAtomicOperation brigOp);
Enums::MemOpType
brigAtomicToMemOpType(BrigOpcode brigOpCode, BrigAtomicOperation brigOp)
{
if (brigOpCode == Brig::BRIG_OPCODE_ATOMIC) {
switch (brigOp) {
case BRIG_ATOMIC_AND:
return Enums::MO_AAND;
case BRIG_ATOMIC_OR:
return Enums::MO_AOR;
case BRIG_ATOMIC_XOR:
return Enums::MO_AXOR;
case BRIG_ATOMIC_CAS:
return Enums::MO_ACAS;
case BRIG_ATOMIC_EXCH:
return Enums::MO_AEXCH;
case BRIG_ATOMIC_ADD:
return Enums::MO_AADD;
case BRIG_ATOMIC_WRAPINC:
return Enums::MO_AINC;
case BRIG_ATOMIC_WRAPDEC:
return Enums::MO_ADEC;
case BRIG_ATOMIC_MIN:
return Enums::MO_AMIN;
case BRIG_ATOMIC_MAX:
return Enums::MO_AMAX;
case BRIG_ATOMIC_SUB:
return Enums::MO_ASUB;
default:
fatal("Bad BrigAtomicOperation code %d\n", brigOp);
}
} else if (brigOpCode == Brig::BRIG_OPCODE_ATOMICNORET) {
switch (brigOp) {
case BRIG_ATOMIC_AND:
return Enums::MO_ANRAND;
case BRIG_ATOMIC_OR:
return Enums::MO_ANROR;
case BRIG_ATOMIC_XOR:
return Enums::MO_ANRXOR;
case BRIG_ATOMIC_CAS:
return Enums::MO_ANRCAS;
case BRIG_ATOMIC_EXCH:
return Enums::MO_ANREXCH;
case BRIG_ATOMIC_ADD:
return Enums::MO_ANRADD;
case BRIG_ATOMIC_WRAPINC:
return Enums::MO_ANRINC;
case BRIG_ATOMIC_WRAPDEC:
return Enums::MO_ANRDEC;
case BRIG_ATOMIC_MIN:
return Enums::MO_ANRMIN;
case BRIG_ATOMIC_MAX:
return Enums::MO_ANRMAX;
case BRIG_ATOMIC_SUB:
return Enums::MO_ANRSUB;
default:
fatal("Bad BrigAtomicOperation code %d\n", brigOp);
}
} else {
fatal("Bad BrigAtomicOpcode %d\n", brigOpCode);
}
}
const char*
atomicOpToString(BrigAtomicOperation brigOp)
{
switch (brigOp) {
case BRIG_ATOMIC_AND:
return "and";
case BRIG_ATOMIC_OR:
return "or";
case BRIG_ATOMIC_XOR:
return "xor";
case BRIG_ATOMIC_CAS:
return "cas";
case BRIG_ATOMIC_EXCH:
return "exch";
case BRIG_ATOMIC_ADD:
return "add";
case BRIG_ATOMIC_WRAPINC:
return "inc";
case BRIG_ATOMIC_WRAPDEC:
return "dec";
case BRIG_ATOMIC_MIN:
return "min";
case BRIG_ATOMIC_MAX:
return "max";
case BRIG_ATOMIC_SUB:
return "sub";
default:
return "unknown";
}
}
} // namespace HsailISA

1629 src/arch/hsail/insts/mem.hh Normal file

File diff suppressed because it is too large

View file

@ -0,0 +1,660 @@
/*
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Steve Reinhardt
*/
#include "arch/hsail/generic_types.hh"
#include "gpu-compute/hsail_code.hh"
// defined in code.cc, but not worth sucking in all of code.h for this
// at this point
extern const char *segmentNames[];
namespace HsailISA
{
template<typename DestDataType, typename AddrRegOperandType>
void
LdaInst<DestDataType, AddrRegOperandType>::generateDisassembly()
{
this->disassembly = csprintf("%s_%s %s,%s", this->opcode,
DestDataType::label,
this->dest.disassemble(),
this->addr.disassemble());
}
template<typename DestDataType, typename AddrRegOperandType>
void
LdaInst<DestDataType, AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
{
Wavefront *w = gpuDynInst->wavefront();
typedef typename DestDataType::CType CType M5_VAR_USED;
const VectorMask &mask = w->get_pred();
uint64_t addr_vec[VSZ];
this->addr.calcVector(w, addr_vec);
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
this->dest.set(w, lane, addr_vec[lane]);
}
}
}
template<typename MemDataType, typename DestDataType,
typename AddrRegOperandType>
void
LdInst<MemDataType, DestDataType, AddrRegOperandType>::generateDisassembly()
{
switch (num_dest_operands) {
case 1:
this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode,
segmentNames[this->segment],
MemDataType::label,
this->dest.disassemble(),
this->addr.disassemble());
break;
case 2:
this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode,
segmentNames[this->segment],
MemDataType::label,
this->dest_vect[0].disassemble(),
this->dest_vect[1].disassemble(),
this->addr.disassemble());
break;
case 4:
this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s",
this->opcode,
segmentNames[this->segment],
MemDataType::label,
this->dest_vect[0].disassemble(),
this->dest_vect[1].disassemble(),
this->dest_vect[2].disassemble(),
this->dest_vect[3].disassemble(),
this->addr.disassemble());
break;
default:
fatal("Bad ld register dest operand, num vector operands: %d \n",
num_dest_operands);
break;
}
}
static Addr
calcPrivAddr(Addr addr, Wavefront *w, int lane, GPUStaticInst *i)
{
// what is the size of the object we are accessing??
// NOTE: the compiler doesn't generate enough information
// to do this yet..have to just line up all the private
// work-item spaces back to back for now
/*
StorageElement* se =
i->parent->findSymbol(Brig::BrigPrivateSpace, addr);
assert(se);
return w->wfSlotId * w->privSizePerItem * VSZ +
se->offset * VSZ +
lane * se->size;
*/
// addressing strategy: interleave the private spaces of
// work-items in a wave-front on 8 byte granularity.
// this won't be perfect coalescing like the spill space
// strategy, but it's better than nothing. The spill space
// strategy won't work with private because the same address
// may be accessed by different sized loads/stores.
// Note: I'm assuming that the largest load/store to private
// is 8 bytes. If it is larger, the stride will have to increase
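// Illustrative example (assuming VSZ == 64): for addr == 20 and lane == 3,
// addr_div8 == 2 and addr_mod8 == 4, so the access lands at
// privBase + 2*8*64 + 3*8 + 4 == privBase + 1052.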
Addr addr_div8 = addr / 8;
Addr addr_mod8 = addr % 8;
Addr ret = addr_div8 * 8 * VSZ + lane * 8 + addr_mod8 + w->privBase;
assert(ret < w->privBase + (w->privSizePerItem * VSZ));
return ret;
}
template<typename MemDataType, typename DestDataType,
typename AddrRegOperandType>
void
LdInst<MemDataType, DestDataType,
AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
{
Wavefront *w = gpuDynInst->wavefront();
typedef typename MemDataType::CType MemCType;
const VectorMask &mask = w->get_pred();
// Kernarg references are handled uniquely for now (no Memory Request
// is used), so special-case them up front. Someday we should
// make this more realistic, at which point we should get rid of this
// block and fold this case into the switch below.
if (this->segment == Brig::BRIG_SEGMENT_KERNARG) {
MemCType val;
// I assume no vector ld for kernargs
assert(num_dest_operands == 1);
// assuming for the moment that we'll never do register
// offsets into kernarg space... just to make life simpler
uint64_t address = this->addr.calcUniform();
val = *(MemCType*)&w->kernelArgs[address];
DPRINTF(HSAIL, "ld_kernarg [%d] -> %d\n", address, val);
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
this->dest.set(w, lane, val);
}
}
return;
} else if (this->segment == Brig::BRIG_SEGMENT_ARG) {
uint64_t address = this->addr.calcUniform();
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
MemCType val = w->readCallArgMem<MemCType>(lane, address);
DPRINTF(HSAIL, "ld_arg [%d] -> %llu\n", address,
(unsigned long long)val);
this->dest.set(w, lane, val);
}
}
return;
}
GPUDynInstPtr m = gpuDynInst;
this->addr.calcVector(w, m->addr);
m->m_op = Enums::MO_LD;
m->m_type = MemDataType::memType;
m->v_type = DestDataType::vgprType;
m->exec_mask = w->execMask();
m->statusBitVector = 0;
m->equiv = this->equivClass;
m->memoryOrder = getGenericMemoryOrder(this->memoryOrder);
m->scope = getGenericMemoryScope(this->memoryScope);
if (num_dest_operands == 1) {
m->dst_reg = this->dest.regIndex();
m->n_reg = 1;
} else {
m->n_reg = num_dest_operands;
for (int i = 0; i < num_dest_operands; ++i) {
m->dst_reg_vec[i] = this->dest_vect[i].regIndex();
}
}
m->simdId = w->simdId;
m->wfSlotId = w->wfSlotId;
m->wfDynId = w->wfDynId;
m->kern_id = w->kern_id;
m->cu_id = w->computeUnit->cu_id;
m->latency.init(&w->computeUnit->shader->tick_cnt);
switch (this->segment) {
case Brig::BRIG_SEGMENT_GLOBAL:
m->s_type = SEG_GLOBAL;
m->pipeId = GLBMEM_PIPE;
m->latency.set(w->computeUnit->shader->ticks(1));
// this is a complete hack to get around a compiler bug
// (the compiler currently generates global accesses for private
// addresses starting from 0, so we need to add the private offset)
for (int lane = 0; lane < VSZ; ++lane) {
if (m->addr[lane] < w->privSizePerItem) {
if (mask[lane]) {
// what is the size of the object we are accessing?
// find the base for this wavefront
// calcPrivAddr will fail if accesses are unaligned
assert(!((sizeof(MemCType) - 1) & m->addr[lane]));
Addr privAddr = calcPrivAddr(m->addr[lane], w, lane,
this);
m->addr[lane] = privAddr;
}
}
}
w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
w->outstanding_reqs_rd_gm++;
w->rd_gm_reqs_in_pipe--;
break;
case Brig::BRIG_SEGMENT_SPILL:
assert(num_dest_operands == 1);
m->s_type = SEG_SPILL;
m->pipeId = GLBMEM_PIPE;
m->latency.set(w->computeUnit->shader->ticks(1));
{
for (int lane = 0; lane < VSZ; ++lane) {
// note: this calculation will NOT WORK if the compiler
// ever generates loads/stores to the same address with
// different widths (e.g., a ld_u32 addr and a ld_u16 addr)
if (mask[lane]) {
assert(m->addr[lane] < w->spillSizePerItem);
m->addr[lane] = m->addr[lane] * w->spillWidth +
lane * sizeof(MemCType) + w->spillBase;
w->last_addr[lane] = m->addr[lane];
}
}
}
w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
w->outstanding_reqs_rd_gm++;
w->rd_gm_reqs_in_pipe--;
break;
case Brig::BRIG_SEGMENT_GROUP:
m->s_type = SEG_SHARED;
m->pipeId = LDSMEM_PIPE;
m->latency.set(w->computeUnit->shader->ticks(24));
w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
w->outstanding_reqs_rd_lm++;
w->rd_lm_reqs_in_pipe--;
break;
case Brig::BRIG_SEGMENT_READONLY:
m->s_type = SEG_READONLY;
m->pipeId = GLBMEM_PIPE;
m->latency.set(w->computeUnit->shader->ticks(1));
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
assert(m->addr[lane] + sizeof(MemCType) <= w->roSize);
m->addr[lane] += w->roBase;
}
}
w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
w->outstanding_reqs_rd_gm++;
w->rd_gm_reqs_in_pipe--;
break;
case Brig::BRIG_SEGMENT_PRIVATE:
m->s_type = SEG_PRIVATE;
m->pipeId = GLBMEM_PIPE;
m->latency.set(w->computeUnit->shader->ticks(1));
{
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
assert(m->addr[lane] < w->privSizePerItem);
m->addr[lane] = m->addr[lane] +
lane * sizeof(MemCType) + w->privBase;
}
}
}
w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
w->outstanding_reqs_rd_gm++;
w->rd_gm_reqs_in_pipe--;
break;
default:
fatal("Load to unsupported segment %d %llxe\n", this->segment,
m->addr[0]);
}
w->outstanding_reqs++;
w->mem_reqs_in_pipe--;
}
template<typename OperationType, typename SrcDataType,
typename AddrRegOperandType>
void
StInst<OperationType, SrcDataType,
AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
{
Wavefront *w = gpuDynInst->wavefront();
typedef typename OperationType::CType CType;
const VectorMask &mask = w->get_pred();
// arg references are handled uniquely for now (no Memory Request
// is used), so special-case them up front. Someday we should
// make this more realistic, at which point we should get rid of this
// block and fold this case into the switch below.
if (this->segment == Brig::BRIG_SEGMENT_ARG) {
uint64_t address = this->addr.calcUniform();
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
CType data = this->src.template get<CType>(w, lane);
DPRINTF(HSAIL, "st_arg [%d] <- %d\n", address, data);
w->writeCallArgMem<CType>(lane, address, data);
}
}
return;
}
GPUDynInstPtr m = gpuDynInst;
m->exec_mask = w->execMask();
this->addr.calcVector(w, m->addr);
if (num_src_operands == 1) {
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
((CType*)m->d_data)[lane] =
this->src.template get<CType>(w, lane);
}
}
} else {
for (int k= 0; k < num_src_operands; ++k) {
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
((CType*)m->d_data)[k * VSZ + lane] =
this->src_vect[k].template get<CType>(w, lane);
}
}
}
}
m->m_op = Enums::MO_ST;
m->m_type = OperationType::memType;
m->v_type = OperationType::vgprType;
m->statusBitVector = 0;
m->equiv = this->equivClass;
if (num_src_operands == 1) {
m->n_reg = 1;
} else {
m->n_reg = num_src_operands;
}
m->memoryOrder = getGenericMemoryOrder(this->memoryOrder);
m->scope = getGenericMemoryScope(this->memoryScope);
m->simdId = w->simdId;
m->wfSlotId = w->wfSlotId;
m->wfDynId = w->wfDynId;
m->kern_id = w->kern_id;
m->cu_id = w->computeUnit->cu_id;
m->latency.init(&w->computeUnit->shader->tick_cnt);
switch (this->segment) {
case Brig::BRIG_SEGMENT_GLOBAL:
m->s_type = SEG_GLOBAL;
m->pipeId = GLBMEM_PIPE;
m->latency.set(w->computeUnit->shader->ticks(1));
// this is a complete hack to get around a compiler bug
// (the compiler currently generates global accesses for private
// addresses starting from 0, so we need to add the private offset)
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
if (m->addr[lane] < w->privSizePerItem) {
// calcPrivAddr will fail if accesses are unaligned
assert(!((sizeof(CType)-1) & m->addr[lane]));
Addr privAddr = calcPrivAddr(m->addr[lane], w, lane,
this);
m->addr[lane] = privAddr;
}
}
}
w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
w->outstanding_reqs_wr_gm++;
w->wr_gm_reqs_in_pipe--;
break;
case Brig::BRIG_SEGMENT_SPILL:
assert(num_src_operands == 1);
m->s_type = SEG_SPILL;
m->pipeId = GLBMEM_PIPE;
m->latency.set(w->computeUnit->shader->ticks(1));
{
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
assert(m->addr[lane] < w->spillSizePerItem);
m->addr[lane] = m->addr[lane] * w->spillWidth +
lane * sizeof(CType) + w->spillBase;
}
}
}
w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
w->outstanding_reqs_wr_gm++;
w->wr_gm_reqs_in_pipe--;
break;
case Brig::BRIG_SEGMENT_GROUP:
m->s_type = SEG_SHARED;
m->pipeId = LDSMEM_PIPE;
m->latency.set(w->computeUnit->shader->ticks(24));
w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
w->outstanding_reqs_wr_lm++;
w->wr_lm_reqs_in_pipe--;
break;
case Brig::BRIG_SEGMENT_PRIVATE:
m->s_type = SEG_PRIVATE;
m->pipeId = GLBMEM_PIPE;
m->latency.set(w->computeUnit->shader->ticks(1));
{
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
assert(m->addr[lane] < w->privSizePerItem);
m->addr[lane] = m->addr[lane] + lane *
sizeof(CType)+w->privBase;
}
}
}
w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
w->outstanding_reqs_wr_gm++;
w->wr_gm_reqs_in_pipe--;
break;
default:
fatal("Store to unsupported segment %d\n", this->segment);
}
w->outstanding_reqs++;
w->mem_reqs_in_pipe--;
}
template<typename OperationType, typename SrcDataType,
typename AddrRegOperandType>
void
StInst<OperationType, SrcDataType,
AddrRegOperandType>::generateDisassembly()
{
switch (num_src_operands) {
case 1:
this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode,
segmentNames[this->segment],
OperationType::label,
this->src.disassemble(),
this->addr.disassemble());
break;
case 2:
this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode,
segmentNames[this->segment],
OperationType::label,
this->src_vect[0].disassemble(),
this->src_vect[1].disassemble(),
this->addr.disassemble());
break;
case 4:
this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s",
this->opcode,
segmentNames[this->segment],
OperationType::label,
this->src_vect[0].disassemble(),
this->src_vect[1].disassemble(),
this->src_vect[2].disassemble(),
this->src_vect[3].disassemble(),
this->addr.disassemble());
break;
default: fatal("Bad st register src operand, num vector operands: "
"%d\n", num_src_operands);
break;
}
}
template<typename DataType, typename AddrRegOperandType, int NumSrcOperands,
bool HasDst>
void
AtomicInst<DataType, AddrRegOperandType, NumSrcOperands,
HasDst>::execute(GPUDynInstPtr gpuDynInst)
{
typedef typename DataType::CType CType;
Wavefront *w = gpuDynInst->wavefront();
GPUDynInstPtr m = gpuDynInst;
this->addr.calcVector(w, m->addr);
for (int lane = 0; lane < VSZ; ++lane) {
((CType *)m->a_data)[lane] =
this->src[0].template get<CType>(w, lane);
}
// load second source operand for CAS
if (NumSrcOperands > 1) {
for (int lane = 0; lane < VSZ; ++lane) {
((CType*)m->x_data)[lane] =
this->src[1].template get<CType>(w, lane);
}
}
assert(NumSrcOperands <= 2);
m->m_op = this->opType;
m->m_type = DataType::memType;
m->v_type = DataType::vgprType;
m->exec_mask = w->execMask();
m->statusBitVector = 0;
m->equiv = 0; // atomics don't have an equivalence class operand
m->n_reg = 1;
m->memoryOrder = getGenericMemoryOrder(this->memoryOrder);
m->scope = getGenericMemoryScope(this->memoryScope);
if (HasDst) {
m->dst_reg = this->dest.regIndex();
}
m->simdId = w->simdId;
m->wfSlotId = w->wfSlotId;
m->wfDynId = w->wfDynId;
m->kern_id = w->kern_id;
m->cu_id = w->computeUnit->cu_id;
m->latency.init(&w->computeUnit->shader->tick_cnt);
switch (this->segment) {
case Brig::BRIG_SEGMENT_GLOBAL:
m->s_type = SEG_GLOBAL;
m->latency.set(w->computeUnit->shader->ticks(64));
m->pipeId = GLBMEM_PIPE;
w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
w->outstanding_reqs_wr_gm++;
w->wr_gm_reqs_in_pipe--;
w->outstanding_reqs_rd_gm++;
w->rd_gm_reqs_in_pipe--;
break;
case Brig::BRIG_SEGMENT_GROUP:
m->s_type = SEG_SHARED;
m->pipeId = LDSMEM_PIPE;
m->latency.set(w->computeUnit->shader->ticks(24));
w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
w->outstanding_reqs_wr_lm++;
w->wr_lm_reqs_in_pipe--;
w->outstanding_reqs_rd_lm++;
w->rd_lm_reqs_in_pipe--;
break;
default:
fatal("Atomic op to unsupported segment %d\n",
this->segment);
}
w->outstanding_reqs++;
w->mem_reqs_in_pipe--;
}
const char* atomicOpToString(Brig::BrigAtomicOperation atomicOp);
template<typename DataType, typename AddrRegOperandType, int NumSrcOperands,
bool HasDst>
void
AtomicInst<DataType, AddrRegOperandType, NumSrcOperands,
HasDst>::generateDisassembly()
{
if (HasDst) {
this->disassembly =
csprintf("%s_%s_%s_%s %s,%s", this->opcode,
atomicOpToString(this->atomicOperation),
segmentNames[this->segment],
DataType::label, this->dest.disassemble(),
this->addr.disassemble());
} else {
this->disassembly =
csprintf("%s_%s_%s_%s %s", this->opcode,
atomicOpToString(this->atomicOperation),
segmentNames[this->segment],
DataType::label, this->addr.disassemble());
}
for (int i = 0; i < NumSrcOperands; ++i) {
this->disassembly += ",";
this->disassembly += this->src[i].disassemble();
}
}
} // namespace HsailISA

View file

@ -0,0 +1,787 @@
/*
* Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Marc Orr
*/
#include <csignal>
#include "arch/hsail/insts/decl.hh"
#include "arch/hsail/insts/mem.hh"
namespace HsailISA
{
// Pseudo (or magic) instructions are overloaded on the hsail call
// instruction, because of its flexible parameter signature.
// To add a new magic instruction:
// 1. Add an entry to the enum.
// 2. Implement it in the switch statement below (Call::exec).
// 3. Add a utility function to hsa/hsail-gpu-compute/util/magicinst.h,
// so it's easy to call from an OpenCL kernel.
// This enum should be identical to the enum in
// hsa/hsail-gpu-compute/util/magicinst.h
enum
{
MAGIC_PRINT_WF_32 = 0,
MAGIC_PRINT_WF_64,
MAGIC_PRINT_LANE,
MAGIC_PRINT_LANE_64,
MAGIC_PRINT_WF_FLOAT,
MAGIC_SIM_BREAK,
MAGIC_PREF_SUM,
MAGIC_REDUCTION,
MAGIC_MASKLANE_LOWER,
MAGIC_MASKLANE_UPPER,
MAGIC_JOIN_WF_BAR,
MAGIC_WAIT_WF_BAR,
MAGIC_PANIC,
MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG,
MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG,
MAGIC_LOAD_GLOBAL_U32_REG,
MAGIC_XACT_CAS_LD,
MAGIC_MOST_SIG_THD,
MAGIC_MOST_SIG_BROADCAST,
MAGIC_PRINT_WFID_32,
MAGIC_PRINT_WFID_64
};
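// Illustrative sketch only (not part of this commit): adding a hypothetical
// MAGIC_PRINT_SUM entry would mean appending it to the enum above, giving it
// a wrapper in magicinst.h, and handling it in Call::execPseudoInst below:
//
//     case MAGIC_PRINT_SUM:
//         MagicPrintSum(w);   // hypothetical helper following the pattern
//         break;              // of the existing Magic* methods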
void
Call::execPseudoInst(Wavefront *w, GPUDynInstPtr gpuDynInst)
{
const VectorMask &mask = w->get_pred();
int op = 0;
bool got_op = false;
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
int src_val0 = src1.get<int>(w, lane, 0);
if (got_op) {
if (src_val0 != op) {
fatal("Multiple magic instructions per PC not "
"supported\n");
}
} else {
op = src_val0;
got_op = true;
}
}
}
switch(op) {
case MAGIC_PRINT_WF_32:
MagicPrintWF32(w);
break;
case MAGIC_PRINT_WF_64:
MagicPrintWF64(w);
break;
case MAGIC_PRINT_LANE:
MagicPrintLane(w);
break;
case MAGIC_PRINT_LANE_64:
MagicPrintLane64(w);
break;
case MAGIC_PRINT_WF_FLOAT:
MagicPrintWFFloat(w);
break;
case MAGIC_SIM_BREAK:
MagicSimBreak(w);
break;
case MAGIC_PREF_SUM:
MagicPrefixSum(w);
break;
case MAGIC_REDUCTION:
MagicReduction(w);
break;
case MAGIC_MASKLANE_LOWER:
MagicMaskLower(w);
break;
case MAGIC_MASKLANE_UPPER:
MagicMaskUpper(w);
break;
case MAGIC_JOIN_WF_BAR:
MagicJoinWFBar(w);
break;
case MAGIC_WAIT_WF_BAR:
MagicWaitWFBar(w);
break;
case MAGIC_PANIC:
MagicPanic(w);
break;
// atomic instructions
case MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG:
MagicAtomicNRAddGlobalU32Reg(w, gpuDynInst);
break;
case MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG:
MagicAtomicNRAddGroupU32Reg(w, gpuDynInst);
break;
case MAGIC_LOAD_GLOBAL_U32_REG:
MagicLoadGlobalU32Reg(w, gpuDynInst);
break;
case MAGIC_XACT_CAS_LD:
MagicXactCasLd(w);
break;
case MAGIC_MOST_SIG_THD:
MagicMostSigThread(w);
break;
case MAGIC_MOST_SIG_BROADCAST:
MagicMostSigBroadcast(w);
break;
case MAGIC_PRINT_WFID_32:
MagicPrintWF32ID(w);
break;
case MAGIC_PRINT_WFID_64:
MagicPrintWFID64(w);
break;
default: fatal("unrecognized magic instruction: %d\n", op);
}
}
void
Call::MagicPrintLane(Wavefront *w)
{
#if TRACING_ON
const VectorMask &mask = w->get_pred();
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
int src_val1 = src1.get<int>(w, lane, 1);
int src_val2 = src1.get<int>(w, lane, 2);
if (src_val2) {
DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n",
disassemble(), w->computeUnit->cu_id, w->simdId,
w->wfSlotId, lane, src_val1);
} else {
DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n",
disassemble(), w->computeUnit->cu_id, w->simdId,
w->wfSlotId, lane, src_val1);
}
}
}
#endif
}
void
Call::MagicPrintLane64(Wavefront *w)
{
#if TRACING_ON
const VectorMask &mask = w->get_pred();
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
int src_val2 = src1.get<int>(w, lane, 2);
if (src_val2) {
DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n",
disassemble(), w->computeUnit->cu_id, w->simdId,
w->wfSlotId, lane, src_val1);
} else {
DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n",
disassemble(), w->computeUnit->cu_id, w->simdId,
w->wfSlotId, lane, src_val1);
}
}
}
#endif
}
void
Call::MagicPrintWF32(Wavefront *w)
{
#if TRACING_ON
const VectorMask &mask = w->get_pred();
std::string res_str;
res_str = csprintf("krl_prt (%s)\n", disassemble());
for (int lane = 0; lane < VSZ; ++lane) {
if (!(lane & 7)) {
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
}
if (mask[lane]) {
int src_val1 = src1.get<int>(w, lane, 1);
int src_val2 = src1.get<int>(w, lane, 2);
if (src_val2) {
res_str += csprintf("%08x", src_val1);
} else {
res_str += csprintf("%08d", src_val1);
}
} else {
res_str += csprintf("xxxxxxxx");
}
if ((lane & 7) == 7) {
res_str += csprintf("\n");
} else {
res_str += csprintf(" ");
}
}
res_str += "\n\n";
DPRINTFN(res_str.c_str());
#endif
}
void
Call::MagicPrintWF32ID(Wavefront *w)
{
#if TRACING_ON
const VectorMask &mask = w->get_pred();
std::string res_str;
int src_val3 = -1;
res_str = csprintf("krl_prt (%s)\n", disassemble());
for (int lane = 0; lane < VSZ; ++lane) {
if (!(lane & 7)) {
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
}
if (mask[lane]) {
int src_val1 = src1.get<int>(w, lane, 1);
int src_val2 = src1.get<int>(w, lane, 2);
src_val3 = src1.get<int>(w, lane, 3);
if (src_val2) {
res_str += csprintf("%08x", src_val1);
} else {
res_str += csprintf("%08d", src_val1);
}
} else {
res_str += csprintf("xxxxxxxx");
}
if ((lane & 7) == 7) {
res_str += csprintf("\n");
} else {
res_str += csprintf(" ");
}
}
res_str += "\n\n";
if (w->wfDynId == src_val3) {
DPRINTFN(res_str.c_str());
}
#endif
}
void
Call::MagicPrintWF64(Wavefront *w)
{
#if TRACING_ON
const VectorMask &mask = w->get_pred();
std::string res_str;
res_str = csprintf("krl_prt (%s)\n", disassemble());
for (int lane = 0; lane < VSZ; ++lane) {
if (!(lane & 3)) {
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
}
if (mask[lane]) {
int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
int src_val2 = src1.get<int>(w, lane, 2);
if (src_val2) {
res_str += csprintf("%016x", src_val1);
} else {
res_str += csprintf("%016d", src_val1);
}
} else {
res_str += csprintf("xxxxxxxxxxxxxxxx");
}
if ((lane & 3) == 3) {
res_str += csprintf("\n");
} else {
res_str += csprintf(" ");
}
}
res_str += "\n\n";
DPRINTFN(res_str.c_str());
#endif
}
void
Call::MagicPrintWFID64(Wavefront *w)
{
#if TRACING_ON
const VectorMask &mask = w->get_pred();
std::string res_str;
int src_val3 = -1;
res_str = csprintf("krl_prt (%s)\n", disassemble());
for (int lane = 0; lane < VSZ; ++lane) {
if (!(lane & 3)) {
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
}
if (mask[lane]) {
int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
int src_val2 = src1.get<int>(w, lane, 2);
src_val3 = src1.get<int>(w, lane, 3);
if (src_val2) {
res_str += csprintf("%016x", src_val1);
} else {
res_str += csprintf("%016d", src_val1);
}
} else {
res_str += csprintf("xxxxxxxxxxxxxxxx");
}
if ((lane & 3) == 3) {
res_str += csprintf("\n");
} else {
res_str += csprintf(" ");
}
}
res_str += "\n\n";
if (w->wfDynId == src_val3) {
DPRINTFN(res_str.c_str());
}
#endif
}
void
Call::MagicPrintWFFloat(Wavefront *w)
{
#if TRACING_ON
const VectorMask &mask = w->get_pred();
std::string res_str;
res_str = csprintf("krl_prt (%s)\n", disassemble());
for (int lane = 0; lane < VSZ; ++lane) {
if (!(lane & 7)) {
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
}
if (mask[lane]) {
float src_val1 = src1.get<float>(w, lane, 1);
res_str += csprintf("%08f", src_val1);
} else {
res_str += csprintf("xxxxxxxx");
}
if ((lane & 7) == 7) {
res_str += csprintf("\n");
} else {
res_str += csprintf(" ");
}
}
res_str += "\n\n";
DPRINTFN(res_str.c_str());
#endif
}
// Raises a signal that GDB will catch. When done with the break,
// type "signal 0" in GDB to continue.
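// Typical flow (a sketch): run gem5 under gdb, let this handler raise
// SIGTRAP, inspect wavefront state (e.g. w->s_reg / w->d_reg), then
// enter "signal 0" to resume the simulation.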
void
Call::MagicSimBreak(Wavefront *w)
{
std::string res_str;
// print out state for this wavefront and then break
res_str = csprintf("Breakpoint encountered for wavefront %i\n",
w->wfSlotId);
res_str += csprintf(" Kern ID: %i\n", w->kern_id);
res_str += csprintf(" Phase ID: %i\n", w->simdId);
res_str += csprintf(" Executing on CU #%i\n", w->computeUnit->cu_id);
res_str += csprintf(" Exec mask: ");
for (int i = VSZ - 1; i >= 0; --i) {
if (w->execMask(i))
res_str += "1";
else
res_str += "0";
if ((i & 7) == 7)
res_str += " ";
}
res_str += csprintf("(0x%016llx)\n", w->execMask().to_ullong());
res_str += "\nHelpful debugging hints:\n";
res_str += " Check out w->s_reg / w->d_reg for register state\n";
res_str += "\n\n";
DPRINTFN(res_str.c_str());
fflush(stdout);
raise(SIGTRAP);
}
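// Computes an exclusive prefix sum across the active lanes: each active
// lane receives the sum of the src values of all lower-numbered active
// lanes (the lowest active lane receives 0).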
void
Call::MagicPrefixSum(Wavefront *w)
{
const VectorMask &mask = w->get_pred();
int res = 0;
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
int src_val1 = src1.get<int>(w, lane, 1);
dest.set<int>(w, lane, res);
res += src_val1;
}
}
}
void
Call::MagicReduction(Wavefront *w)
{
// reduction magic instruction
// The reduction instruction takes up to 64 inputs (one from
// each thread in a WF) and sums them. It returns the sum to
// each thread in the WF.
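// For example, if the only active lanes hold 1, 2, and 3, every active
// lane's dest receives 6.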
const VectorMask &mask = w->get_pred();
int res = 0;
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
int src_val1 = src1.get<int>(w, lane, 1);
res += src_val1;
}
}
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
dest.set<int>(w, lane, res);
}
}
}
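// Builds a bitmask over the lower half of the wavefront: bit 'lane' is
// set when that lane is active and its src value is non-zero. Every
// active lane then receives the resulting mask.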
void
Call::MagicMaskLower(Wavefront *w)
{
const VectorMask &mask = w->get_pred();
int res = 0;
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
int src_val1 = src1.get<int>(w, lane, 1);
if (src_val1) {
if (lane < (VSZ/2)) {
res = res | ((uint32_t)(1) << lane);
}
}
}
}
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
dest.set<int>(w, lane, res);
}
}
}
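// Builds a bitmask over the upper half of the wavefront: bit
// (lane - VSZ/2) is set when that lane is active and its src value is
// non-zero. Every active lane then receives the resulting mask.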
void
Call::MagicMaskUpper(Wavefront *w)
{
const VectorMask &mask = w->get_pred();
int res = 0;
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
int src_val1 = src1.get<int>(w, lane, 1);
if (src_val1) {
if (lane >= (VSZ/2)) {
res = res | ((uint32_t)(1) << (lane - (VSZ/2)));
}
}
}
}
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
dest.set<int>(w, lane, res);
}
}
}
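// Each active lane joins the wavefront barrier by incrementing its
// bar_cnt; the wavefront-wide max_bar_cnt is raised to the largest
// per-lane count.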
void
Call::MagicJoinWFBar(Wavefront *w)
{
const VectorMask &mask = w->get_pred();
int max_cnt = 0;
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
w->bar_cnt[lane]++;
if (w->bar_cnt[lane] > max_cnt) {
max_cnt = w->bar_cnt[lane];
}
}
}
if (max_cnt > w->max_bar_cnt) {
w->max_bar_cnt = max_cnt;
}
}
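// Each active lane leaves the wavefront barrier by decrementing its
// bar_cnt; max_bar_cnt is lowered to the new per-lane maximum, the
// instruction buffer beyond this instruction is flushed, and any
// pending fetch is dropped.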
void
Call::MagicWaitWFBar(Wavefront *w)
{
const VectorMask &mask = w->get_pred();
int max_cnt = 0;
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
w->bar_cnt[lane]--;
}
if (w->bar_cnt[lane] > max_cnt) {
max_cnt = w->bar_cnt[lane];
}
}
if (max_cnt < w->max_bar_cnt) {
w->max_bar_cnt = max_cnt;
}
w->instructionBuffer.erase(w->instructionBuffer.begin() + 1,
w->instructionBuffer.end());
if (w->pendingFetch)
w->dropFetch = true;
}
void
Call::MagicPanic(Wavefront *w)
{
const VectorMask &mask = w->get_pred();
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
int src_val1 = src1.get<int>(w, lane, 1);
panic("OpenCL Code failed assertion #%d. Triggered by lane %s",
src_val1, lane);
}
}
}
void
Call::calcAddr(Wavefront *w, GPUDynInstPtr m)
{
// the 64-bit address is split across call args 1 and 2:
// arg 1 holds the upper 32 bits and arg 2 the lower 32 bits
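// e.g. arg 1 = 0x1 and arg 2 = 0x80 yield the address 0x100000080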
for (int lane = 0; lane < VSZ; ++lane) {
int src_val1 = src1.get<int>(w, lane, 1);
int src_val2 = src1.get<int>(w, lane, 2);
Addr addr = (((Addr) src_val1) << 32) | ((Addr) src_val2);
m->addr[lane] = addr;
}
}
void
Call::MagicAtomicNRAddGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
{
GPUDynInstPtr m = gpuDynInst;
calcAddr(w, m);
for (int lane = 0; lane < VSZ; ++lane) {
((int*)m->a_data)[lane] = src1.get<int>(w, lane, 3);
}
m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET,
Brig::BRIG_ATOMIC_ADD);
m->m_type = U32::memType;
m->v_type = U32::vgprType;
m->exec_mask = w->execMask();
m->statusBitVector = 0;
m->equiv = 0; // atomics don't have an equivalence class operand
m->n_reg = 1;
m->memoryOrder = Enums::MEMORY_ORDER_NONE;
m->scope = Enums::MEMORY_SCOPE_NONE;
m->simdId = w->simdId;
m->wfSlotId = w->wfSlotId;
m->wfDynId = w->wfDynId;
m->latency.init(&w->computeUnit->shader->tick_cnt);
m->s_type = SEG_GLOBAL;
m->pipeId = GLBMEM_PIPE;
m->latency.set(w->computeUnit->shader->ticks(64));
w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
w->outstanding_reqs_wr_gm++;
w->wr_gm_reqs_in_pipe--;
w->outstanding_reqs_rd_gm++;
w->rd_gm_reqs_in_pipe--;
w->outstanding_reqs++;
w->mem_reqs_in_pipe--;
}
void
Call::MagicAtomicNRAddGroupU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
{
GPUDynInstPtr m = gpuDynInst;
calcAddr(w, m);
for (int lane = 0; lane < VSZ; ++lane) {
((int*)m->a_data)[lane] = src1.get<int>(w, lane, 1);
}
m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET,
Brig::BRIG_ATOMIC_ADD);
m->m_type = U32::memType;
m->v_type = U32::vgprType;
m->exec_mask = w->execMask();
m->statusBitVector = 0;
m->equiv = 0; // atomics don't have an equivalence class operand
m->n_reg = 1;
m->memoryOrder = Enums::MEMORY_ORDER_NONE;
m->scope = Enums::MEMORY_SCOPE_NONE;
m->simdId = w->simdId;
m->wfSlotId = w->wfSlotId;
m->wfDynId = w->wfDynId;
m->latency.init(&w->computeUnit->shader->tick_cnt);
m->s_type = SEG_GLOBAL;
m->pipeId = GLBMEM_PIPE;
m->latency.set(w->computeUnit->shader->ticks(64));
w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
w->outstanding_reqs_wr_gm++;
w->wr_gm_reqs_in_pipe--;
w->outstanding_reqs_rd_gm++;
w->rd_gm_reqs_in_pipe--;
w->outstanding_reqs++;
w->mem_reqs_in_pipe--;
}
void
Call::MagicLoadGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
{
GPUDynInstPtr m = gpuDynInst;
// calculate the address
calcAddr(w, m);
m->m_op = Enums::MO_LD;
m->m_type = U32::memType; //MemDataType::memType;
m->v_type = U32::vgprType; //DestDataType::vgprType;
m->exec_mask = w->execMask();
m->statusBitVector = 0;
m->equiv = 0;
m->n_reg = 1;
m->memoryOrder = Enums::MEMORY_ORDER_NONE;
m->scope = Enums::MEMORY_SCOPE_NONE;
// FIXME
//m->dst_reg = this->dest.regIndex();
m->simdId = w->simdId;
m->wfSlotId = w->wfSlotId;
m->wfDynId = w->wfDynId;
m->latency.init(&w->computeUnit->shader->tick_cnt);
m->s_type = SEG_GLOBAL;
m->pipeId = GLBMEM_PIPE;
m->latency.set(w->computeUnit->shader->ticks(1));
w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
w->outstanding_reqs_rd_gm++;
w->rd_gm_reqs_in_pipe--;
w->outstanding_reqs++;
w->mem_reqs_in_pipe--;
}
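// Registers this wavefront in the compute unit's xactCasLoadMap, keyed
// by the value of call arg 1 from the first active lane; the map entry's
// queue records this wave's (simdId, wfSlotId) identifier.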
void
Call::MagicXactCasLd(Wavefront *w)
{
const VectorMask &mask = w->get_pred();
int src_val1 = 0;
for (int lane = 0; lane < VSZ; ++lane) {
if (mask[lane]) {
src_val1 = src1.get<int>(w, lane, 1);
break;
}
}
if (!w->computeUnit->xactCasLoadMap.count(src_val1)) {
w->computeUnit->xactCasLoadMap[src_val1] = ComputeUnit::waveQueue();
w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue.clear();
}
w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue
.push_back(ComputeUnit::waveIdentifier(w->simdId, w->wfSlotId));
}
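// The most significant (highest-numbered) active lane receives 1 in its
// dest; every other active lane receives 0.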
void
Call::MagicMostSigThread(Wavefront *w)
{
const VectorMask &mask = w->get_pred();
unsigned mst = true;
for (int lane = VSZ - 1; lane >= 0; --lane) {
if (mask[lane]) {
dest.set<int>(w, lane, mst);
mst = false;
}
}
}
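// Broadcasts the src value of the most significant (highest-numbered)
// active lane to the dest of every active lane.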
void
Call::MagicMostSigBroadcast(Wavefront *w)
{
const VectorMask &mask = w->get_pred();
int res = 0;
bool got_res = false;
for (int lane = VSZ - 1; lane >= 0; --lane) {
if (mask[lane]) {
if (!got_res) {
res = src1.get<int>(w, lane, 1);
got_res = true;
}
dest.set<int>(w, lane, res);
}
}
}
} // namespace HsailISA

449 src/arch/hsail/operand.cc Normal file
View file

@ -0,0 +1,449 @@
/*
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Steve Reinhardt
*/
#include "arch/hsail/operand.hh"
using namespace Brig;
bool
BaseRegOperand::init(unsigned opOffset, const BrigObject *obj,
unsigned &maxRegIdx, char _regFileChar)
{
regFileChar = _regFileChar;
const BrigOperand *brigOp = obj->getOperand(opOffset);
if (brigOp->kind != BRIG_KIND_OPERAND_REGISTER)
return false;
const BrigOperandRegister *brigRegOp = (const BrigOperandRegister*)brigOp;
regIdx = brigRegOp->regNum;
DPRINTF(GPUReg, "Operand: regNum: %d, kind: %d\n", regIdx,
brigRegOp->regKind);
maxRegIdx = std::max(maxRegIdx, regIdx);
return true;
}
void
ListOperand::init(unsigned opOffset, const BrigObject *obj)
{
const BrigOperand *brigOp = (const BrigOperand*)obj->getOperand(opOffset);
switch (brigOp->kind) {
case BRIG_KIND_OPERAND_CODE_LIST:
{
const BrigOperandCodeList *opList =
(const BrigOperandCodeList*)brigOp;
const Brig::BrigData *oprnd_data =
obj->getBrigBaseData(opList->elements);
// Note: for calls Dest list of operands could be size of 0.
elementCount = oprnd_data->byteCount / 4;
DPRINTF(GPUReg, "Operand Code List: # elements: %d\n",
elementCount);
for (int i = 0; i < elementCount; ++i) {
unsigned *data_offset =
(unsigned*)obj->getData(opList->elements + 4 * (i + 1));
const BrigDirectiveVariable *p =
(const BrigDirectiveVariable*)obj->
getCodeSectionEntry(*data_offset);
StorageElement *se = obj->currentCode->storageMap->
findSymbol(BRIG_SEGMENT_ARG, p);
assert(se);
callArgs.push_back(se);
}
}
break;
default:
fatal("ListOperand: bad operand kind %d\n", brigOp->kind);
}
}
std::string
ListOperand::disassemble()
{
std::string res_str("");
for (auto it : callArgs) {
res_str += csprintf("%s ", it->name.c_str());
}
return res_str;
}
void
FunctionRefOperand::init(unsigned opOffset, const BrigObject *obj)
{
const BrigOperand *baseOp = obj->getOperand(opOffset);
if (baseOp->kind != BRIG_KIND_OPERAND_CODE_REF) {
fatal("FunctionRefOperand: bad operand kind %d\n", baseOp->kind);
}
const BrigOperandCodeRef *brigOp = (const BrigOperandCodeRef*)baseOp;
const BrigDirectiveExecutable *p =
(const BrigDirectiveExecutable*)obj->getCodeSectionEntry(brigOp->ref);
func_name = obj->getString(p->name);
}
std::string
FunctionRefOperand::disassemble()
{
DPRINTF(GPUReg, "Operand Func-ref name: %s\n", func_name);
return csprintf("%s", func_name);
}
bool
BaseRegOperand::init_from_vect(unsigned opOffset, const BrigObject *obj,
int at, unsigned &maxRegIdx, char _regFileChar)
{
regFileChar = _regFileChar;
const BrigOperand *brigOp = obj->getOperand(opOffset);
if (brigOp->kind != BRIG_KIND_OPERAND_OPERAND_LIST)
return false;
const Brig::BrigOperandOperandList *brigRegVecOp =
(const Brig::BrigOperandOperandList*)brigOp;
unsigned *data_offset =
(unsigned*)obj->getData(brigRegVecOp->elements + 4 * (at + 1));
const BrigOperand *p =
(const BrigOperand*)obj->getOperand(*data_offset);
if (p->kind != BRIG_KIND_OPERAND_REGISTER) {
return false;
}
const BrigOperandRegister *brigRegOp =(const BrigOperandRegister*)p;
regIdx = brigRegOp->regNum;
DPRINTF(GPUReg, "Operand: regNum: %d, kind: %d \n", regIdx,
brigRegOp->regKind);
maxRegIdx = std::max(maxRegIdx, regIdx);
return true;
}
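// Parses a register operand from its string name in the BRIG string
// section, e.g. "$s3" with _regFileChar 's' yields regIdx 3; a name that
// does not start with '$' followed by the expected register-file letter
// is a fatal error.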
void
BaseRegOperand::initWithStrOffset(unsigned strOffset, const BrigObject *obj,
unsigned &maxRegIdx, char _regFileChar)
{
const char *name = obj->getString(strOffset);
char *endptr;
regIdx = strtoul(name + 2, &endptr, 10);
if (name[0] != '$' || name[1] != _regFileChar) {
fatal("register operand parse error on \"%s\"\n", name);
}
maxRegIdx = std::max(maxRegIdx, regIdx);
}
unsigned SRegOperand::maxRegIdx;
unsigned DRegOperand::maxRegIdx;
unsigned CRegOperand::maxRegIdx;
std::string
SRegOperand::disassemble()
{
return csprintf("$s%d", regIdx);
}
std::string
DRegOperand::disassemble()
{
return csprintf("$d%d", regIdx);
}
std::string
CRegOperand::disassemble()
{
return csprintf("$c%d", regIdx);
}
BrigRegOperandInfo
findRegDataType(unsigned opOffset, const BrigObject *obj)
{
const BrigOperand *baseOp = obj->getOperand(opOffset);
switch (baseOp->kind) {
case BRIG_KIND_OPERAND_REGISTER:
{
const BrigOperandRegister *op = (BrigOperandRegister*)baseOp;
return BrigRegOperandInfo((BrigKind16_t)baseOp->kind,
(BrigRegisterKind)op->regKind);
}
break;
case BRIG_KIND_OPERAND_OPERAND_LIST:
{
const BrigOperandOperandList *op =
(BrigOperandOperandList*)baseOp;
const BrigData *data_p = (BrigData*)obj->getData(op->elements);
int num_operands = 0;
BrigRegisterKind reg_kind = (BrigRegisterKind)0;
for (int offset = 0; offset < data_p->byteCount; offset += 4) {
const BrigOperand *op_p = (const BrigOperand *)
obj->getOperand(((int *)data_p->bytes)[offset/4]);
if (op_p->kind == BRIG_KIND_OPERAND_REGISTER) {
const BrigOperandRegister *brigRegOp =
(const BrigOperandRegister*)op_p;
reg_kind = (BrigRegisterKind)brigRegOp->regKind;
} else if (op_p->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) {
uint16_t num_bytes =
((Brig::BrigOperandConstantBytes*)op_p)->base.byteCount
- sizeof(BrigBase);
if (num_bytes == sizeof(uint32_t)) {
reg_kind = BRIG_REGISTER_KIND_SINGLE;
} else if (num_bytes == sizeof(uint64_t)) {
reg_kind = BRIG_REGISTER_KIND_DOUBLE;
} else {
fatal("OperandList: bad operand size %d\n", num_bytes);
}
} else {
fatal("OperandList: bad operand kind %d\n", op_p->kind);
}
num_operands++;
}
assert(baseOp->kind == BRIG_KIND_OPERAND_OPERAND_LIST);
return BrigRegOperandInfo((BrigKind16_t)baseOp->kind, reg_kind);
}
break;
case BRIG_KIND_OPERAND_ADDRESS:
{
const BrigOperandAddress *op = (BrigOperandAddress*)baseOp;
if (!op->reg) {
BrigType type = BRIG_TYPE_NONE;
if (op->symbol) {
const BrigDirective *dir = (BrigDirective*)
obj->getCodeSectionEntry(op->symbol);
assert(dir->kind == BRIG_KIND_DIRECTIVE_VARIABLE);
const BrigDirectiveVariable *sym =
(const BrigDirectiveVariable*)dir;
type = (BrigType)sym->type;
}
return BrigRegOperandInfo(BRIG_KIND_OPERAND_ADDRESS,
(BrigType)type);
} else {
const BrigOperandAddress *b = (const BrigOperandAddress*)baseOp;
const BrigOperand *reg = obj->getOperand(b->reg);
const BrigOperandRegister *rop = (BrigOperandRegister*)reg;
return BrigRegOperandInfo(BRIG_KIND_OPERAND_REGISTER,
(BrigRegisterKind)rop->regKind);
}
}
break;
default:
fatal("AddrOperand: bad operand kind %d\n", baseOp->kind);
break;
}
}
void
AddrOperandBase::parseAddr(const BrigOperandAddress *op, const BrigObject *obj)
{
assert(op->base.kind == BRIG_KIND_OPERAND_ADDRESS);
const BrigDirective *d =
(BrigDirective*)obj->getCodeSectionEntry(op->symbol);
assert(d->kind == BRIG_KIND_DIRECTIVE_VARIABLE);
const BrigDirectiveVariable *sym = (BrigDirectiveVariable*)d;
name = obj->getString(sym->name);
if (sym->segment != BRIG_SEGMENT_ARG) {
storageElement =
obj->currentCode->storageMap->findSymbol(sym->segment, name);
assert(storageElement);
offset = 0;
} else {
// sym->name does not work for BRIG_SEGMENT_ARG for the following case:
//
// void foo(int a);
// void bar(double a);
//
// foo(...) --> arg_u32 %param_p0;
// st_arg_u32 $s0, [%param_p0];
// call &foo (%param_p0);
// bar(...) --> arg_f64 %param_p0;
// st_arg_u64 $d0, [%param_p0];
// call &bar (%param_p0);
//
// Both functions use the same variable name (param_p0)!!!
//
// Maybe this is a bug in the compiler (I don't know).
//
// Solution:
// Use directive pointer (BrigDirectiveVariable) to differentiate 2
// versions of param_p0.
//
// Note this solution is kind of stupid, because we are pulling stuff
// out of the brig binary via the directive pointer and putting it into
// the symbol table, but now we are indexing the symbol table by the
// brig directive pointer! It makes the symbol table sort of pointless.
// But I don't want to mess with the rest of the infrastructure, so
// let's go with this for now.
//
// When we update the compiler again, we should see if this problem goes
// away. If so, we can fold some of this functionality into the code for
// kernel arguments. If not, maybe we can index the symbol table on a
// hash of the variable AND function names.
storageElement = obj->currentCode->
storageMap->findSymbol((Brig::BrigSegment)sym->segment, sym);
assert(storageElement);
}
}
uint64_t
AddrOperandBase::calcUniformBase()
{
// start with offset, will be 0 if not specified
uint64_t address = offset;
// add in symbol value if specified
if (storageElement) {
address += storageElement->offset;
}
return address;
}
std::string
AddrOperandBase::disassemble(std::string reg_disassembly)
{
std::string disasm;
if (offset || reg_disassembly != "") {
disasm += "[";
if (reg_disassembly != "") {
disasm += reg_disassembly;
if (offset > 0) {
disasm += "+";
}
}
if (offset) {
disasm += csprintf("%d", offset);
}
disasm += "]";
} else if (name) {
disasm += csprintf("[%s]", name);
}
return disasm;
}
void
NoRegAddrOperand::init(unsigned opOffset, const BrigObject *obj)
{
const BrigOperand *baseOp = obj->getOperand(opOffset);
if (baseOp->kind == BRIG_KIND_OPERAND_ADDRESS) {
BrigOperandAddress *addrOp = (BrigOperandAddress*)baseOp;
parseAddr(addrOp, obj);
offset = (uint64_t(addrOp->offset.hi) << 32) |
uint64_t(addrOp->offset.lo);
} else {
fatal("NoRegAddrOperand: bad operand kind %d\n", baseOp->kind);
}
}
std::string
NoRegAddrOperand::disassemble()
{
return AddrOperandBase::disassemble(std::string(""));
}
void
LabelOperand::init(unsigned opOffset, const BrigObject *obj)
{
const BrigOperandCodeRef *op =
(const BrigOperandCodeRef*)obj->getOperand(opOffset);
assert(op->base.kind == BRIG_KIND_OPERAND_CODE_REF);
const BrigDirective *dir =
(const BrigDirective*)obj->getCodeSectionEntry(op->ref);
assert(dir->kind == BRIG_KIND_DIRECTIVE_LABEL);
label = obj->currentCode->refLabel((BrigDirectiveLabel*)dir, obj);
}
uint32_t
LabelOperand::getTarget(Wavefront *w, int lane)
{
return label->get();
}
std::string
LabelOperand::disassemble()
{
return label->name;
}

768 src/arch/hsail/operand.hh Normal file
View file

@ -0,0 +1,768 @@
/*
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Steve Reinhardt
*/
#ifndef __ARCH_HSAIL_OPERAND_HH__
#define __ARCH_HSAIL_OPERAND_HH__
/**
* @file operand.hh
*
* Defines classes encapsulating HSAIL instruction operands.
*/
#include <string>
#include "arch/hsail/Brig.h"
#include "base/trace.hh"
#include "base/types.hh"
#include "debug/GPUReg.hh"
#include "enums/RegisterType.hh"
#include "gpu-compute/brig_object.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/hsail_code.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
class Label;
class StorageElement;
class BaseOperand
{
public:
Enums::RegisterType registerType;
uint32_t regOperandSize;
BaseOperand() { registerType = Enums::RT_NONE; regOperandSize = 0; }
bool isVectorRegister() { return registerType == Enums::RT_VECTOR; }
bool isScalarRegister() { return registerType == Enums::RT_SCALAR; }
bool isCondRegister() { return registerType == Enums::RT_CONDITION; }
unsigned int regIndex() { return 0; }
uint32_t opSize() { return regOperandSize; }
virtual ~BaseOperand() { }
};
class BrigRegOperandInfo
{
public:
Brig::BrigKind16_t kind;
Brig::BrigType type;
Brig::BrigRegisterKind regKind;
BrigRegOperandInfo(Brig::BrigKind16_t _kind,
Brig::BrigRegisterKind _regKind)
: kind(_kind), regKind(_regKind)
{
}
BrigRegOperandInfo(Brig::BrigKind16_t _kind, Brig::BrigType _type)
: kind(_kind), type(_type)
{
}
BrigRegOperandInfo() : kind(Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES),
type(Brig::BRIG_TYPE_NONE)
{
}
};
BrigRegOperandInfo findRegDataType(unsigned opOffset, const BrigObject *obj);
class BaseRegOperand : public BaseOperand
{
public:
unsigned regIdx;
char regFileChar;
bool init(unsigned opOffset, const BrigObject *obj,
unsigned &maxRegIdx, char _regFileChar);
bool init_from_vect(unsigned opOffset, const BrigObject *obj, int at,
unsigned &maxRegIdx, char _regFileChar);
void initWithStrOffset(unsigned strOffset, const BrigObject *obj,
unsigned &maxRegIdx, char _regFileChar);
unsigned int regIndex() { return regIdx; }
};
class SRegOperand : public BaseRegOperand
{
public:
static unsigned maxRegIdx;
bool
init(unsigned opOffset, const BrigObject *obj)
{
regOperandSize = sizeof(uint32_t);
registerType = Enums::RT_VECTOR;
return BaseRegOperand::init(opOffset, obj, maxRegIdx, 's');
}
bool
init_from_vect(unsigned opOffset, const BrigObject *obj, int at)
{
regOperandSize = sizeof(uint32_t);
registerType = Enums::RT_VECTOR;
return BaseRegOperand::init_from_vect(opOffset, obj, at, maxRegIdx,
's');
}
void
initWithStrOffset(unsigned strOffset, const BrigObject *obj)
{
regOperandSize = sizeof(uint32_t);
registerType = Enums::RT_VECTOR;
return BaseRegOperand::initWithStrOffset(strOffset, obj, maxRegIdx,
's');
}
template<typename OperandType>
OperandType
get(Wavefront *w, int lane)
{
assert(sizeof(OperandType) <= sizeof(uint32_t));
assert(regIdx < w->maxSpVgprs);
// if OperandType is smaller than 32-bit, we truncate the value
OperandType ret;
uint32_t vgprIdx;
switch (sizeof(OperandType)) {
case 1: // 1 byte operand
vgprIdx = w->remap(regIdx, 1, 1);
ret = (w->computeUnit->vrf[w->simdId]->
read<uint32_t>(vgprIdx, lane)) & 0xff;
break;
case 2: // 2 byte operand
vgprIdx = w->remap(regIdx, 2, 1);
ret = (w->computeUnit->vrf[w->simdId]->
read<uint32_t>(vgprIdx, lane)) & 0xffff;
break;
case 4: // 4 byte operand
vgprIdx = w->remap(regIdx,sizeof(OperandType), 1);
ret = w->computeUnit->vrf[w->simdId]->
read<OperandType>(vgprIdx, lane);
break;
default:
panic("Bad OperandType\n");
break;
}
return (OperandType)ret;
}
// special get method for compatibility with LabelOperand
uint32_t
getTarget(Wavefront *w, int lane)
{
return get<uint32_t>(w, lane);
}
template<typename OperandType>
void set(Wavefront *w, int lane, OperandType &val);
std::string disassemble();
};
template<typename OperandType>
void
SRegOperand::set(Wavefront *w, int lane, OperandType &val)
{
DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $s%d <- %d\n",
w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx, val);
assert(sizeof(OperandType) == sizeof(uint32_t));
assert(regIdx < w->maxSpVgprs);
uint32_t vgprIdx = w->remap(regIdx, sizeof(OperandType), 1);
w->computeUnit->vrf[w->simdId]->write<OperandType>(vgprIdx,val,lane);
}
template<>
inline void
SRegOperand::set(Wavefront *w, int lane, uint64_t &val)
{
DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $s%d <- %d\n",
w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx, val);
assert(regIdx < w->maxSpVgprs);
uint32_t vgprIdx = w->remap(regIdx, sizeof(uint32_t), 1);
w->computeUnit->vrf[w->simdId]->write<uint32_t>(vgprIdx, val, lane);
}
class DRegOperand : public BaseRegOperand
{
public:
static unsigned maxRegIdx;
bool
init(unsigned opOffset, const BrigObject *obj)
{
regOperandSize = sizeof(uint64_t);
registerType = Enums::RT_VECTOR;
return BaseRegOperand::init(opOffset, obj, maxRegIdx, 'd');
}
bool
init_from_vect(unsigned opOffset, const BrigObject *obj, int at)
{
regOperandSize = sizeof(uint64_t);
registerType = Enums::RT_VECTOR;
return BaseRegOperand::init_from_vect(opOffset, obj, at, maxRegIdx,
'd');
}
void
initWithStrOffset(unsigned strOffset, const BrigObject *obj)
{
regOperandSize = sizeof(uint64_t);
registerType = Enums::RT_VECTOR;
return BaseRegOperand::initWithStrOffset(strOffset, obj, maxRegIdx,
'd');
}
template<typename OperandType>
OperandType
get(Wavefront *w, int lane)
{
assert(sizeof(OperandType) <= sizeof(uint64_t));
// TODO: this check is valid only for HSAIL
assert(regIdx < w->maxDpVgprs);
uint32_t vgprIdx = w->remap(regIdx, sizeof(OperandType), 1);
return w->computeUnit->vrf[w->simdId]->read<OperandType>(vgprIdx,lane);
}
template<typename OperandType>
void
set(Wavefront *w, int lane, OperandType &val)
{
DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $d%d <- %d\n",
w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx,
val);
assert(sizeof(OperandType) <= sizeof(uint64_t));
// TODO: this check is valid only for HSAIL
assert(regIdx < w->maxDpVgprs);
uint32_t vgprIdx = w->remap(regIdx, sizeof(OperandType), 1);
w->computeUnit->vrf[w->simdId]->write<OperandType>(vgprIdx,val,lane);
}
std::string disassemble();
};
class CRegOperand : public BaseRegOperand
{
public:
static unsigned maxRegIdx;
bool
init(unsigned opOffset, const BrigObject *obj)
{
regOperandSize = sizeof(uint8_t);
registerType = Enums::RT_CONDITION;
return BaseRegOperand::init(opOffset, obj, maxRegIdx, 'c');
}
bool
init_from_vect(unsigned opOffset, const BrigObject *obj, int at)
{
regOperandSize = sizeof(uint8_t);
registerType = Enums::RT_CONDITION;
return BaseRegOperand::init_from_vect(opOffset, obj, at, maxRegIdx,
'c');
}
void
initWithStrOffset(unsigned strOffset, const BrigObject *obj)
{
regOperandSize = sizeof(uint8_t);
registerType = Enums::RT_CONDITION;
return BaseRegOperand::initWithStrOffset(strOffset, obj, maxRegIdx,
'c');
}
template<typename OperandType>
OperandType
get(Wavefront *w, int lane)
{
assert(regIdx < w->condRegState->numRegs());
return w->condRegState->read<OperandType>((int)regIdx, lane);
}
template<typename OperandType>
void
set(Wavefront *w, int lane, OperandType &val)
{
DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $c%d <- %d\n",
w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx,
val);
assert(regIdx < w->condRegState->numRegs());
w->condRegState->write<OperandType>(regIdx,lane,val);
}
std::string disassemble();
};
template<typename T>
class ImmOperand : public BaseOperand
{
public:
T bits;
bool init(unsigned opOffset, const BrigObject *obj);
bool init_from_vect(unsigned opOffset, const BrigObject *obj, int at);
std::string disassemble();
template<typename OperandType>
OperandType
get()
{
assert(sizeof(OperandType) <= sizeof(T));
return *(OperandType*)&bits;
}
// This version of get() takes a WF* and a lane id for
// compatibility with the register-based get() methods.
template<typename OperandType>
OperandType
get(Wavefront *w, int lane)
{
return get<OperandType>();
}
};
template<typename T>
bool
ImmOperand<T>::init(unsigned opOffset, const BrigObject *obj)
{
const Brig::BrigOperand *brigOp = obj->getOperand(opOffset);
switch (brigOp->kind) {
// this is an immediate operand
case Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES:
{
DPRINTF(GPUReg, "sizeof(T): %lu, byteCount: %d\n", sizeof(T),
brigOp->byteCount);
auto cbptr = (Brig::BrigOperandConstantBytes*)brigOp;
bits = *((T*)(obj->getData(cbptr->bytes + 4)));
return true;
}
break;
case Brig::BRIG_KIND_OPERAND_WAVESIZE:
bits = VSZ;
return true;
default:
return false;
}
}
template <typename T>
bool
ImmOperand<T>::init_from_vect(unsigned opOffset, const BrigObject *obj, int at)
{
const Brig::BrigOperand *brigOp = obj->getOperand(opOffset);
if (brigOp->kind != Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
return false;
}
const Brig::BrigOperandOperandList *brigVecOp =
(const Brig::BrigOperandOperandList *)brigOp;
unsigned *data_offset =
(unsigned *)obj->getData(brigVecOp->elements + 4 * (at + 1));
const Brig::BrigOperand *p =
(const Brig::BrigOperand *)obj->getOperand(*data_offset);
if (p->kind != Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) {
return false;
}
return init(*data_offset, obj);
}
template<typename T>
std::string
ImmOperand<T>::disassemble()
{
return csprintf("0x%08x", bits);
}
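// Operand that may be encoded either as a register or as an immediate;
// init() tries the register form first and falls back to the immediate
// form, and get() reads from whichever one matched.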
template<typename RegOperand, typename T>
class RegOrImmOperand : public BaseOperand
{
private:
bool is_imm;
public:
void setImm(const bool value) { is_imm = value; }
ImmOperand<T> imm_op;
RegOperand reg_op;
RegOrImmOperand() { is_imm = false; }
void init(unsigned opOffset, const BrigObject *obj);
void init_from_vect(unsigned opOffset, const BrigObject *obj, int at);
std::string disassemble();
template<typename OperandType>
OperandType
get(Wavefront *w, int lane)
{
return is_imm ? imm_op.template get<OperandType>() :
reg_op.template get<OperandType>(w, lane);
}
uint32_t
opSize()
{
if (!is_imm) {
return reg_op.opSize();
}
return 0;
}
bool
isVectorRegister()
{
if (!is_imm) {
return reg_op.registerType == Enums::RT_VECTOR;
}
return false;
}
bool
isCondRegister()
{
if (!is_imm) {
return reg_op.registerType == Enums::RT_CONDITION;
}
return false;
}
bool
isScalarRegister()
{
if (!is_imm) {
return reg_op.registerType == Enums::RT_SCALAR;
}
return false;
}
unsigned int
regIndex()
{
if (!is_imm) {
return reg_op.regIndex();
}
return 0;
}
};
template<typename RegOperand, typename T>
void
RegOrImmOperand<RegOperand, T>::init(unsigned opOffset, const BrigObject *obj)
{
is_imm = false;
if (reg_op.init(opOffset, obj)) {
return;
}
if (imm_op.init(opOffset, obj)) {
is_imm = true;
return;
}
fatal("RegOrImmOperand::init(): bad operand kind %d\n",
obj->getOperand(opOffset)->kind);
}
template<typename RegOperand, typename T>
void
RegOrImmOperand<RegOperand, T>::init_from_vect(unsigned opOffset,
const BrigObject *obj, int at)
{
if (reg_op.init_from_vect(opOffset, obj, at)) {
is_imm = false;
return;
}
if (imm_op.init_from_vect(opOffset, obj, at)) {
is_imm = true;
return;
}
fatal("RegOrImmOperand::init(): bad operand kind %d\n",
obj->getOperand(opOffset)->kind);
}
template<typename RegOperand, typename T>
std::string
RegOrImmOperand<RegOperand, T>::disassemble()
{
return is_imm ? imm_op.disassemble() : reg_op.disassemble();
}
typedef RegOrImmOperand<SRegOperand, uint32_t> SRegOrImmOperand;
typedef RegOrImmOperand<DRegOperand, uint64_t> DRegOrImmOperand;
typedef RegOrImmOperand<CRegOperand, bool> CRegOrImmOperand;
class AddrOperandBase : public BaseOperand
{
protected:
// helper function for init()
void parseAddr(const Brig::BrigOperandAddress *op, const BrigObject *obj);
// helper function for disassemble()
std::string disassemble(std::string reg_disassembly);
uint64_t calcUniformBase();
public:
virtual void calcVector(Wavefront *w, uint64_t *addrVec) = 0;
virtual uint64_t calcLane(Wavefront *w, int lane=0) = 0;
uint64_t offset;
const char *name = nullptr;
StorageElement *storageElement;
};
template<typename RegOperandType>
class RegAddrOperand : public AddrOperandBase
{
public:
RegOperandType reg;
void init(unsigned opOffset, const BrigObject *obj);
uint64_t calcUniform();
void calcVector(Wavefront *w, uint64_t *addrVec);
uint64_t calcLane(Wavefront *w, int lane=0);
uint32_t opSize() { return reg.opSize(); }
bool isVectorRegister() { return reg.registerType == Enums::RT_VECTOR; }
bool isCondRegister() { return reg.registerType == Enums::RT_CONDITION; }
bool isScalarRegister() { return reg.registerType == Enums::RT_SCALAR; }
unsigned int regIndex() { return reg.regIndex(); }
std::string disassemble();
};
template<typename RegOperandType>
void
RegAddrOperand<RegOperandType>::init(unsigned opOffset, const BrigObject *obj)
{
using namespace Brig;
const BrigOperand *baseOp = obj->getOperand(opOffset);
switch (baseOp->kind) {
case BRIG_KIND_OPERAND_ADDRESS:
{
const BrigOperandAddress *op = (BrigOperandAddress*)baseOp;
storageElement = nullptr;
offset = (uint64_t(op->offset.hi) << 32) | uint64_t(op->offset.lo);
reg.init(op->reg, obj);
if (reg.regFileChar == 's') {
reg.regOperandSize = sizeof(uint32_t);
registerType = Enums::RT_VECTOR;
}
else if (reg.regFileChar == 'd') {
reg.regOperandSize = sizeof(uint64_t);
registerType = Enums::RT_VECTOR;
}
}
break;
default:
fatal("RegAddrOperand: bad operand kind %d\n", baseOp->kind);
break;
}
}
template<typename RegOperandType>
uint64_t
RegAddrOperand<RegOperandType>::calcUniform()
{
fatal("can't do calcUniform() on register-based address\n");
return 0;
}
template<typename RegOperandType>
void
RegAddrOperand<RegOperandType>::calcVector(Wavefront *w, uint64_t *addrVec)
{
Addr address = calcUniformBase();
for (int lane = 0; lane < VSZ; ++lane) {
if (w->execMask(lane)) {
if (reg.regFileChar == 's') {
addrVec[lane] = address + reg.template get<uint32_t>(w, lane);
} else {
addrVec[lane] = address + reg.template get<Addr>(w, lane);
}
}
}
}
template<typename RegOperandType>
uint64_t
RegAddrOperand<RegOperandType>::calcLane(Wavefront *w, int lane)
{
Addr address = calcUniformBase();
return address + reg.template get<Addr>(w, lane);
}
template<typename RegOperandType>
std::string
RegAddrOperand<RegOperandType>::disassemble()
{
return AddrOperandBase::disassemble(reg.disassemble());
}
typedef RegAddrOperand<SRegOperand> SRegAddrOperand;
typedef RegAddrOperand<DRegOperand> DRegAddrOperand;
class NoRegAddrOperand : public AddrOperandBase
{
public:
void init(unsigned opOffset, const BrigObject *obj);
uint64_t calcUniform();
void calcVector(Wavefront *w, uint64_t *addrVec);
uint64_t calcLane(Wavefront *w, int lane=0);
std::string disassemble();
};
inline uint64_t
NoRegAddrOperand::calcUniform()
{
return AddrOperandBase::calcUniformBase();
}
inline uint64_t
NoRegAddrOperand::calcLane(Wavefront *w, int lane)
{
return calcUniform();
}
inline void
NoRegAddrOperand::calcVector(Wavefront *w, uint64_t *addrVec)
{
uint64_t address = calcUniformBase();
for (int lane = 0; lane < VSZ; ++lane)
addrVec[lane] = address;
}
class LabelOperand : public BaseOperand
{
public:
Label *label;
void init(unsigned opOffset, const BrigObject *obj);
std::string disassemble();
// special get method for compatibility with SRegOperand
uint32_t getTarget(Wavefront *w, int lane);
};
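// Operand holding a list of call arguments (storage elements); values are
// read from and written to the wavefront's call-argument memory via
// readCallArgMem()/writeCallArgMem().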
class ListOperand : public BaseOperand
{
public:
int elementCount;
std::vector<StorageElement*> callArgs;
int
getSrcOperand(int idx)
{
DPRINTF(GPUReg, "getSrcOperand, idx: %d, sz_args: %d\n", idx,
callArgs.size());
return callArgs.at(idx)->offset;
}
void init(unsigned opOffset, const BrigObject *obj);
std::string disassemble();
template<typename OperandType>
OperandType
get(Wavefront *w, int lane, int arg_idx)
{
return w->readCallArgMem<OperandType>(lane, getSrcOperand(arg_idx));
}
template<typename OperandType>
void
set(Wavefront *w, int lane, OperandType val)
{
w->writeCallArgMem<OperandType>(lane, getSrcOperand(0), val);
DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: arg[%d] <- %d\n",
w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane,
getSrcOperand(0), val);
}
};
class FunctionRefOperand : public BaseOperand
{
public:
const char *func_name;
void init(unsigned opOffset, const BrigObject *obj);
std::string disassemble();
};
#endif // __ARCH_HSAIL_OPERAND_HH__

310 src/gpu-compute/GPU.py Normal file
View file

@ -0,0 +1,310 @@
#
# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Steve Reinhardt
#
from ClockedObject import ClockedObject
from Device import DmaDevice
from m5.defines import buildEnv
from m5.params import *
from m5.proxy import *
from m5.SimObject import SimObject
from MemObject import MemObject
from Process import EmulatedDriver
from Bridge import Bridge
from LdsState import LdsState
class PrefetchType(Enum): vals = [
'PF_CU',
'PF_PHASE',
'PF_WF',
'PF_STRIDE',
'PF_END',
]
class VectorRegisterFile(SimObject):
type = 'VectorRegisterFile'
cxx_class = 'VectorRegisterFile'
cxx_header = 'gpu-compute/vector_register_file.hh'
simd_id = Param.Int(0, 'SIMD ID associated with this VRF')
num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD')
min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF')
class Wavefront(SimObject):
type = 'Wavefront'
cxx_class = 'Wavefront'
cxx_header = 'gpu-compute/wavefront.hh'
simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)')
wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)')
class ComputeUnit(MemObject):
type = 'ComputeUnit'
cxx_class = 'ComputeUnit'
cxx_header = 'gpu-compute/compute_unit.hh'
wavefronts = VectorParam.Wavefront('Number of wavefronts')
wfSize = Param.Int(64, 'Wavefront size (in work items)')
num_SIMDs = Param.Int(4, 'number of SIMD units per CU')
spbypass_pipe_length = Param.Int(4, 'vector ALU Single Precision bypass '\
'latency')
dpbypass_pipe_length = Param.Int(8, 'vector ALU Double Precision bypass '\
'latency')
issue_period = Param.Int(4, 'number of cycles per issue period')
num_global_mem_pipes = Param.Int(1,'number of global memory pipes per CU')
num_shared_mem_pipes = Param.Int(1,'number of shared memory pipes per CU')
n_wf = Param.Int(1, 'Number of wavefront slots per SIMD')
mem_req_latency = Param.Int(9, "Latency for request from the cu to ruby. "\
"Represents the pipeline to reach the TCP and "\
"specified in GPU clock cycles")
mem_resp_latency = Param.Int(9, "Latency for responses from ruby to the "\
"cu. Represents the pipeline between the TCP "\
"and cu as well as TCP data array access. "\
"Specified in GPU clock cycles")
system = Param.System(Parent.any, "system object")
cu_id = Param.Int('CU id')
vrf_to_coalescer_bus_width = Param.Int(32, "VRF->Coalescer data bus width "\
"in bytes")
coalescer_to_vrf_bus_width = Param.Int(32, "Coalescer->VRF data bus width "\
"in bytes")
memory_port = VectorMasterPort("Port to the memory system")
translation_port = VectorMasterPort('Port to the TLB hierarchy')
sqc_port = MasterPort("Port to the SQC (I-cache)")
sqc_tlb_port = MasterPort("Port to the TLB for the SQC (I-cache)")
perLaneTLB = Param.Bool(False, "enable per-lane TLB")
prefetch_depth = Param.Int(0, "Number of prefetches triggered at a time "\
"(0 turns off prefetching)")
prefetch_stride = Param.Int(1, "Fixed Prefetch Stride (1 means next-page)")
prefetch_prev_type = Param.PrefetchType('PF_PHASE', "Prefetch the stride "\
"from last mem req in lane of "\
"CU|Phase|Wavefront")
execPolicy = Param.String("OLDEST-FIRST", "WF execution selection policy")
xactCasMode = Param.Bool(False, "Behavior of xact_cas_load magic instr.")
debugSegFault = Param.Bool(False, "enable debugging GPU seg faults")
functionalTLB = Param.Bool(False, "Assume TLB causes no delay")
localMemBarrier = Param.Bool(False, "Assume Barriers do not wait on "\
"kernel end")
countPages = Param.Bool(False, "Generate per-CU file of all pages touched "\
"and how many times")
global_mem_queue_size = Param.Int(256, "Number of entries in the global "
"memory pipeline's queues")
local_mem_queue_size = Param.Int(256, "Number of entries in the local "
"memory pipeline's queues")
ldsBus = Bridge() # the bridge between the CU and its LDS
ldsPort = MasterPort("The port that goes to the LDS")
localDataStore = Param.LdsState("the LDS for this CU")
vector_register_file = VectorParam.VectorRegisterFile("Vector register "\
"file")
class Shader(ClockedObject):
type = 'Shader'
cxx_class = 'Shader'
cxx_header = 'gpu-compute/shader.hh'
CUs = VectorParam.ComputeUnit('Number of compute units')
n_wf = Param.Int(1, 'Number of wavefront slots per SIMD')
impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets into
ruby at kernel boundaries""")
separate_acquire_release = Param.Bool(False,
"""Do ld_acquire/st_release generate separate requests for the
acquire and release?""")
globalmem = Param.MemorySize('64kB', 'Memory size')
timing = Param.Bool(False, 'timing memory accesses')
cpu_pointer = Param.BaseCPU(NULL, "pointer to base CPU")
translation = Param.Bool(False, "address translation")
class ClDriver(EmulatedDriver):
type = 'ClDriver'
cxx_header = 'gpu-compute/cl_driver.hh'
codefile = VectorParam.String('code file name(s)')
class GpuDispatcher(DmaDevice):
type = 'GpuDispatcher'
cxx_header = 'gpu-compute/dispatcher.hh'
# put at 8GB line for now
pio_addr = Param.Addr(0x200000000, "Device Address")
pio_latency = Param.Latency('1ns', "Programmed IO latency")
shader_pointer = Param.Shader('pointer to shader')
translation_port = MasterPort('Port to the dispatcher TLB')
cpu = Param.BaseCPU("CPU to wake up on kernel completion")
cl_driver = Param.ClDriver('pointer to driver')
class OpType(Enum): vals = [
'OT_NULL',
'OT_ALU',
'OT_SPECIAL',
'OT_GLOBAL_READ',
'OT_GLOBAL_WRITE',
'OT_GLOBAL_ATOMIC',
'OT_GLOBAL_HIST',
'OT_GLOBAL_LDAS',
'OT_SHARED_READ',
'OT_SHARED_WRITE',
'OT_SHARED_ATOMIC',
'OT_SHARED_HIST',
'OT_SHARED_LDAS',
'OT_PRIVATE_READ',
'OT_PRIVATE_WRITE',
'OT_PRIVATE_ATOMIC',
'OT_PRIVATE_HIST',
'OT_PRIVATE_LDAS',
'OT_SPILL_READ',
'OT_SPILL_WRITE',
'OT_SPILL_ATOMIC',
'OT_SPILL_HIST',
'OT_SPILL_LDAS',
'OT_READONLY_READ',
'OT_READONLY_WRITE',
'OT_READONLY_ATOMIC',
'OT_READONLY_HIST',
'OT_READONLY_LDAS',
'OT_FLAT_READ',
'OT_FLAT_WRITE',
'OT_FLAT_ATOMIC',
'OT_FLAT_HIST',
'OT_FLAT_LDAS',
'OT_KERN_READ',
'OT_BRANCH',
# note: only OT_BOTH_MEMFENCE appears to be supported in the 1.0F version
# of the compiler.
'OT_SHARED_MEMFENCE',
'OT_GLOBAL_MEMFENCE',
'OT_BOTH_MEMFENCE',
'OT_BARRIER',
'OT_PRINT',
'OT_RET',
'OT_NOP',
'OT_ARG'
]
class MemType(Enum): vals = [
'M_U8',
'M_U16',
'M_U32',
'M_U64',
'M_S8',
'M_S16',
'M_S32',
'M_S64',
'M_F16',
'M_F32',
'M_F64',
]
class MemOpType(Enum): vals = [
'MO_LD',
'MO_ST',
'MO_LDAS',
'MO_LDA',
'MO_AAND',
'MO_AOR',
'MO_AXOR',
'MO_ACAS',
'MO_AEXCH',
'MO_AADD',
'MO_ASUB',
'MO_AINC',
'MO_ADEC',
'MO_AMAX',
'MO_AMIN',
'MO_ANRAND',
'MO_ANROR',
'MO_ANRXOR',
'MO_ANRCAS',
'MO_ANREXCH',
'MO_ANRADD',
'MO_ANRSUB',
'MO_ANRINC',
'MO_ANRDEC',
'MO_ANRMAX',
'MO_ANRMIN',
'MO_HAND',
'MO_HOR',
'MO_HXOR',
'MO_HCAS',
'MO_HEXCH',
'MO_HADD',
'MO_HSUB',
'MO_HINC',
'MO_HDEC',
'MO_HMAX',
'MO_HMIN',
'MO_UNDEF'
]
class StorageClassType(Enum): vals = [
'SC_SPILL',
'SC_GLOBAL',
'SC_SHARED',
'SC_PRIVATE',
'SC_READONLY',
'SC_KERNARG',
'SC_NONE',
]
class RegisterType(Enum): vals = [
'RT_VECTOR',
'RT_SCALAR',
'RT_CONDITION',
'RT_HARDWARE',
'RT_NONE',
]
class GenericMemoryOrder(Enum): vals = [
'MEMORY_ORDER_NONE',
'MEMORY_ORDER_RELAXED',
'MEMORY_ORDER_SC_ACQUIRE',
'MEMORY_ORDER_SC_RELEASE',
'MEMORY_ORDER_SC_ACQUIRE_RELEASE',
]
class GenericMemoryScope(Enum): vals = [
'MEMORY_SCOPE_NONE',
'MEMORY_SCOPE_WORKITEM',
'MEMORY_SCOPE_WAVEFRONT',
'MEMORY_SCOPE_WORKGROUP',
'MEMORY_SCOPE_DEVICE',
'MEMORY_SCOPE_SYSTEM',
]

View file

@@ -0,0 +1,51 @@
#
# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Joe Gross
#
from m5.defines import buildEnv
from m5.params import *
from m5.proxy import *
from MemObject import MemObject
class LdsState(MemObject):
type = 'LdsState'
cxx_class = 'LdsState'
cxx_header = 'gpu-compute/lds_state.hh'
size = Param.Int(65536, 'the size of the LDS')
range = Param.AddrRange('64kB', "address space of the LDS")
bankConflictPenalty = Param.Int(1, 'penalty per LDS bank conflict when '\
'accessing data')
banks = Param.Int(32, 'Number of LDS banks')
cuPort = SlavePort("port that goes to the compute unit")
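A short sketch of how an LdsState might be attached to a ComputeUnit through the ldsBus Bridge declared in the ComputeUnit parameters; the slave/master Bridge port names follow gem5's classic convention and, along with the instance names, should be treated as assumptions:
# hypothetical LDS hookup for one compute unit
lds = LdsState(size=65536, banks=32)
cu.localDataStore = lds
cu.ldsPort = cu.ldsBus.slave    # CU -> bridge
cu.ldsBus.master = lds.cuPort   # bridge -> LDS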

View file

@@ -0,0 +1,99 @@
# -*- mode:python -*-
#
# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Anthony Gutierrez
#
Import('*')
if not env['BUILD_GPU']:
Return()
SimObject('GPU.py')
SimObject('LdsState.py')
SimObject('X86GPUTLB.py')
if env['TARGET_GPU_ISA'] == 'hsail':
Source('brig_object.cc')
Source('hsail_code.cc')
Source('cl_driver.cc')
Source('compute_unit.cc')
Source('condition_register_state.cc')
Source('dispatcher.cc')
Source('exec_stage.cc')
Source('fetch_stage.cc')
Source('fetch_unit.cc')
Source('global_memory_pipeline.cc')
Source('gpu_dyn_inst.cc')
Source('gpu_exec_context.cc')
Source('gpu_static_inst.cc')
Source('gpu_tlb.cc')
Source('hsa_object.cc')
Source('kernel_cfg.cc')
Source('lds_state.cc')
Source('local_memory_pipeline.cc')
Source('of_scheduling_policy.cc')
Source('pool_manager.cc')
Source('rr_scheduling_policy.cc')
Source('schedule_stage.cc')
Source('scheduler.cc')
Source('scoreboard_check_stage.cc')
Source('shader.cc')
Source('simple_pool_manager.cc')
Source('tlb_coalescer.cc')
Source('vector_register_file.cc')
Source('vector_register_state.cc')
Source('wavefront.cc')
DebugFlag('BRIG')
DebugFlag('GPUCoalescer')
DebugFlag('GPUDisp')
DebugFlag('GPUExec')
DebugFlag('GPUFetch')
DebugFlag('GPUHsailCFInfo')
DebugFlag('GPUMem')
DebugFlag('GPUPort')
DebugFlag('GPUPrefetch')
DebugFlag('GPUReg')
DebugFlag('GPUSync')
DebugFlag('GPUTLB')
DebugFlag('HSALoader')
DebugFlag('HSAIL')
DebugFlag('HSAILObject')
DebugFlag('Predictor')
DebugFlag('WavefrontStack')
CompoundFlag('GPUALL', ['GPUCoalescer', 'GPUDisp', 'GPUExec', 'GPUFetch',
'GPUMem', 'GPUPort', 'GPUSync', 'GPUTLB', 'HSAIL'])

View file

@@ -0,0 +1,77 @@
#
# Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Lisa Hsu
#
from m5.defines import buildEnv
from m5.params import *
from m5.proxy import *
from m5.objects.MemObject import MemObject
if buildEnv['FULL_SYSTEM']:
class X86PagetableWalker(MemObject):
type = 'X86PagetableWalker'
cxx_class = 'X86ISA::Walker'
port = SlavePort("Port for the hardware table walker")
system = Param.System(Parent.any, "system object")
class X86GPUTLB(MemObject):
type = 'X86GPUTLB'
cxx_class = 'X86ISA::GpuTLB'
cxx_header = 'gpu-compute/gpu_tlb.hh'
size = Param.Int(64, "TLB size (number of entries)")
assoc = Param.Int(64, "TLB associativity")
if buildEnv['FULL_SYSTEM']:
walker = Param.X86PagetableWalker(X86PagetableWalker(),
"page table walker")
hitLatency = Param.Int(2, "Latency of a TLB hit")
missLatency1 = Param.Int(5, "Latency #1 of a TLB miss")
missLatency2 = Param.Int(100, "Latency #2 of a TLB miss")
maxOutstandingReqs = Param.Int(64, "# of maximum outstanding requests")
slave = VectorSlavePort("Port on side closer to CPU/CU")
master = VectorMasterPort("Port on side closer to memory")
allocationPolicy = Param.Bool(True, "Allocate on an access")
accessDistance = Param.Bool(False, "print accessDistance stats")
class TLBCoalescer(MemObject):
type = 'TLBCoalescer'
cxx_class = 'TLBCoalescer'
cxx_header = 'gpu-compute/tlb_coalescer.hh'
probesPerCycle = Param.Int(2, "Number of TLB probes per cycle")
coalescingWindow = Param.Int(1, "Permit coalescing across that many ticks")
slave = VectorSlavePort("Port on side closer to CPU/CU")
master = VectorMasterPort("Port on side closer to memory")
disableCoalescing = Param.Bool(False, "Disable coalescing")
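The slave/master vector ports above imply a translation chain from the compute unit through the coalescer into the GPU TLB. A hypothetical hookup for a single level, with instance names and parameter values chosen purely for illustration:
# hypothetical per-CU translation path: CU -> coalescer -> TLB
dtlb = X86GPUTLB(size=64, assoc=64, hitLatency=2)
coalescer = TLBCoalescer(probesPerCycle=2)
cu.translation_port = coalescer.slave
coalescer.master = dtlb.slave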

View file

@@ -0,0 +1,474 @@
/*
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Steve Reinhardt, Anthony Gutierrez
*/
#include "gpu-compute/brig_object.hh"
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <unistd.h>
#include <cassert>
#include <cstddef>
#include <cstdlib>
#include "arch/hsail/Brig.h"
#include "base/misc.hh"
#include "base/trace.hh"
#include "debug/BRIG.hh"
#include "debug/HSAILObject.hh"
#include "debug/HSALoader.hh"
using namespace Brig;
std::vector<std::function<HsaObject*(const std::string&, int, uint8_t*)>>
HsaObject::tryFileFuncs = { BrigObject::tryFile };
extern int getBrigDataTypeBytes(BrigType16_t t);
const char *BrigObject::sectionNames[] =
{
"hsa_data",
"hsa_code",
"hsa_operand",
".shstrtab"
};
const char *segmentNames[] =
{
"none",
"flat",
"global",
"readonly",
"kernarg",
"group",
"private",
"spill",
"args"
};
const uint8_t*
BrigObject::getSectionOffset(enum SectionIndex sec, int offs) const
{
// allow offs == size for dummy end pointers
assert(offs <= sectionInfo[sec].size);
return sectionInfo[sec].ptr + offs;
}
const char*
BrigObject::getString(int offs) const
{
return (const char*)(getSectionOffset(DataSectionIndex, offs) + 4);
}
const BrigBase*
BrigObject::getCodeSectionEntry(int offs) const
{
return (const BrigBase*)getSectionOffset(CodeSectionIndex, offs);
}
const BrigData*
BrigObject::getBrigBaseData(int offs) const
{
return (Brig::BrigData*)(getSectionOffset(DataSectionIndex, offs));
}
const uint8_t*
BrigObject::getData(int offs) const
{
return getSectionOffset(DataSectionIndex, offs);
}
const BrigOperand*
BrigObject::getOperand(int offs) const
{
return (const BrigOperand*)getSectionOffset(OperandsSectionIndex, offs);
}
unsigned
BrigObject::getOperandPtr(int offs, int index) const
{
unsigned *op_offs = (unsigned*)(getData(offs + 4 * (index + 1)));
return *op_offs;
}
const BrigInstBase*
BrigObject::getInst(int offs) const
{
return (const BrigInstBase*)getSectionOffset(CodeSectionIndex, offs);
}
HsaCode*
BrigObject::getKernel(const std::string &name) const
{
return nullptr;
}
HsaCode*
BrigObject::getFunction(const std::string &name) const
{
for (int i = 0; i < functions.size(); ++i) {
if (functions[i]->name() == name) {
return functions[i];
}
}
return nullptr;
}
void
BrigObject::processDirectives(const BrigBase *dirPtr, const BrigBase *endPtr,
StorageMap *storageMap)
{
while (dirPtr < endPtr) {
if (!dirPtr->byteCount) {
fatal("Bad directive size 0\n");
}
// calculate next pointer now so we can override it if needed
const BrigBase *nextDirPtr = brigNext(dirPtr);
DPRINTF(HSAILObject, "Code section entry kind: #%x, byte count: %d\n",
dirPtr->kind, dirPtr->byteCount);
switch (dirPtr->kind) {
case BRIG_KIND_DIRECTIVE_FUNCTION:
{
const BrigDirectiveExecutable *p M5_VAR_USED =
reinterpret_cast<const BrigDirectiveExecutable*>(dirPtr);
DPRINTF(HSAILObject,"DIRECTIVE_FUNCTION: %s offset: "
"%d next: %d\n", getString(p->name),
p->firstCodeBlockEntry, p->nextModuleEntry);
if (p->firstCodeBlockEntry != p->nextModuleEntry) {
panic("Function calls are not fully supported yet!!: %s\n",
getString(p->name));
const char *name = getString(p->name);
HsailCode *code_obj = nullptr;
for (int i = 0; i < functions.size(); ++i) {
if (functions[i]->name() == name) {
code_obj = functions[i];
break;
}
}
if (!code_obj) {
// create new local storage map for kernel-local symbols
code_obj = new HsailCode(name, p, this,
new StorageMap(storageMap));
functions.push_back(code_obj);
} else {
panic("Multiple definition of Function!!: %s\n",
getString(p->name));
}
}
nextDirPtr = getCodeSectionEntry(p->nextModuleEntry);
}
break;
case BRIG_KIND_DIRECTIVE_KERNEL:
{
const BrigDirectiveExecutable *p =
reinterpret_cast<const BrigDirectiveExecutable*>(dirPtr);
DPRINTF(HSAILObject,"DIRECTIVE_KERNEL: %s offset: %d count: "
"next: %d\n", getString(p->name),
p->firstCodeBlockEntry, p->nextModuleEntry);
const char *name = getString(p->name);
if (name[0] == '&')
name++;
std::string str = name;
char *temp;
int len = str.length();
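// keep the kernel name as-is if it ends in a lowercase letter; otherwise
// drop the trailing character before using it as the kernel name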
if (str[len - 1] >= 'a' && str[len - 1] <= 'z') {
temp = new char[str.size() + 1];
std::copy(str.begin(), str.end() , temp);
temp[str.size()] = '\0';
} else {
temp = new char[str.size()];
std::copy(str.begin(), str.end() - 1 , temp);
temp[str.size() - 1 ] = '\0';
}
std::string kernel_name = temp;
delete[] temp;
HsailCode *code_obj = nullptr;
for (const auto &kernel : kernels) {
if (kernel->name() == kernel_name) {
code_obj = kernel;
break;
}
}
if (!code_obj) {
// create new local storage map for kernel-local symbols
code_obj = new HsailCode(kernel_name, p, this,
new StorageMap(storageMap));
kernels.push_back(code_obj);
}
nextDirPtr = getCodeSectionEntry(p->nextModuleEntry);
}
break;
case BRIG_KIND_DIRECTIVE_VARIABLE:
{
const BrigDirectiveVariable *p =
reinterpret_cast<const BrigDirectiveVariable*>(dirPtr);
uint64_t readonlySize_old =
storageMap->getSize(BRIG_SEGMENT_READONLY);
StorageElement* se = storageMap->addSymbol(p, this);
DPRINTF(HSAILObject, "DIRECTIVE_VARIABLE, symbol %s\n",
getString(p->name));
if (p->segment == BRIG_SEGMENT_READONLY) {
// readonly memory has initialization data
uint8_t* readonlyData_old = readonlyData;
readonlyData =
new uint8_t[storageMap->getSize(BRIG_SEGMENT_READONLY)];
if (p->init) {
if ((p->type == BRIG_TYPE_ROIMG) ||
(p->type == BRIG_TYPE_WOIMG) ||
(p->type == BRIG_TYPE_SAMP) ||
(p->type == BRIG_TYPE_SIG32) ||
(p->type == BRIG_TYPE_SIG64)) {
panic("Read only data type not supported: %s\n",
getString(p->name));
}
const BrigOperand *brigOp = getOperand(p->init);
assert(brigOp->kind ==
BRIG_KIND_OPERAND_CONSTANT_BYTES);
const Brig::BrigData *operand_data M5_VAR_USED =
getBrigBaseData(((BrigOperandConstantBytes*)
brigOp)->bytes);
assert((operand_data->byteCount / 4) > 0);
uint8_t *symbol_data =
(uint8_t*)getData(((BrigOperandConstantBytes*)
brigOp)->bytes + 4);
// copy the old data and add the new data
if (readonlySize_old > 0) {
memcpy(readonlyData, readonlyData_old,
readonlySize_old);
}
memcpy(readonlyData + se->offset, symbol_data,
se->size);
delete[] readonlyData_old;
}
}
}
break;
case BRIG_KIND_DIRECTIVE_LABEL:
{
const BrigDirectiveLabel M5_VAR_USED *p =
reinterpret_cast<const BrigDirectiveLabel*>(dirPtr);
panic("Label directives cannot be at the module level: %s\n",
getString(p->name));
}
break;
case BRIG_KIND_DIRECTIVE_COMMENT:
{
const BrigDirectiveComment M5_VAR_USED *p =
reinterpret_cast<const BrigDirectiveComment*>(dirPtr);
DPRINTF(HSAILObject, "DIRECTIVE_COMMENT: %s\n",
getString(p->name));
}
break;
case BRIG_KIND_DIRECTIVE_LOC:
{
DPRINTF(HSAILObject, "BRIG_DIRECTIVE_LOC\n");
}
break;
case BRIG_KIND_DIRECTIVE_MODULE:
{
const BrigDirectiveModule M5_VAR_USED *p =
reinterpret_cast<const BrigDirectiveModule*>(dirPtr);
DPRINTF(HSAILObject, "BRIG_DIRECTIVE_MODULE: %s\n",
getString(p->name));
}
break;
case BRIG_KIND_DIRECTIVE_CONTROL:
{
DPRINTF(HSAILObject, "DIRECTIVE_CONTROL\n");
}
break;
case BRIG_KIND_DIRECTIVE_PRAGMA:
{
DPRINTF(HSAILObject, "DIRECTIVE_PRAGMA\n");
}
break;
case BRIG_KIND_DIRECTIVE_EXTENSION:
{
DPRINTF(HSAILObject, "DIRECTIVE_EXTENSION\n");
}
break;
case BRIG_KIND_DIRECTIVE_ARG_BLOCK_START:
{
DPRINTF(HSAILObject, "DIRECTIVE_ARG_BLOCK_START\n");
}
break;
case BRIG_KIND_DIRECTIVE_ARG_BLOCK_END:
{
DPRINTF(HSAILObject, "DIRECTIVE_ARG_BLOCK_END\n");
}
break;
default:
if (dirPtr->kind >= BRIG_KIND_INST_BEGIN &&
dirPtr->kind <= BRIG_KIND_INST_END)
break;
if (dirPtr->kind >= BRIG_KIND_OPERAND_BEGIN &&
dirPtr->kind <= BRIG_KIND_OPERAND_END)
break;
warn("Unknown Brig directive kind: %d\n", dirPtr->kind);
break;
}
dirPtr = nextDirPtr;
}
}
HsaObject*
BrigObject::tryFile(const std::string &fname, int len, uint8_t *fileData)
{
const char *brig_ident = "HSA BRIG";
if (memcmp(brig_ident, fileData, MODULE_IDENTIFICATION_LENGTH))
return nullptr;
return new BrigObject(fname, len, fileData);
}
BrigObject::BrigObject(const std::string &fname, int len, uint8_t *fileData)
: HsaObject(fname), storageMap(new StorageMap())
{
const char *brig_ident = "HSA BRIG";
BrigModuleHeader *mod_hdr = (BrigModuleHeader*)fileData;
fatal_if(memcmp(brig_ident, mod_hdr, MODULE_IDENTIFICATION_LENGTH),
"%s is not a BRIG file\n", fname);
if (mod_hdr->brigMajor != BRIG_VERSION_BRIG_MAJOR ||
mod_hdr->brigMinor != BRIG_VERSION_BRIG_MINOR) {
fatal("%s: BRIG version mismatch, %d.%d != %d.%d\n",
fname, mod_hdr->brigMajor, mod_hdr->brigMinor,
BRIG_VERSION_BRIG_MAJOR, BRIG_VERSION_BRIG_MINOR);
}
fatal_if(mod_hdr->sectionCount != NumSectionIndices, "%s: BRIG section "
"count (%d) != expected value (%d)\n", fname,
mod_hdr->sectionCount, NumSectionIndices);
for (int i = 0; i < NumSectionIndices; ++i) {
sectionInfo[i].ptr = nullptr;
}
uint64_t *sec_idx_table = (uint64_t*)(fileData + mod_hdr->sectionIndex);
for (int sec_idx = 0; sec_idx < mod_hdr->sectionCount; ++sec_idx) {
uint8_t *sec_hdr_byte_ptr = fileData + sec_idx_table[sec_idx];
BrigSectionHeader *sec_hdr = (BrigSectionHeader*)sec_hdr_byte_ptr;
// It doesn't look like cprintf supports string precision values,
// but if this breaks, the right answer is to fix that
DPRINTF(HSAILObject, "found section %.*s\n", sec_hdr->nameLength,
sec_hdr->name);
sectionInfo[sec_idx].ptr = new uint8_t[sec_hdr->byteCount];
memcpy(sectionInfo[sec_idx].ptr, sec_hdr_byte_ptr, sec_hdr->byteCount);
sectionInfo[sec_idx].size = sec_hdr->byteCount;
}
BrigSectionHeader *code_hdr =
(BrigSectionHeader*)sectionInfo[CodeSectionIndex].ptr;
DPRINTF(HSAILObject, "Code section hdr, count: %d, hdr count: %d, "
"name len: %d\n", code_hdr->byteCount, code_hdr->headerByteCount,
code_hdr->nameLength);
// start at offset 4 to skip initial null entry (see Brig spec)
processDirectives(getCodeSectionEntry(code_hdr->headerByteCount),
getCodeSectionEntry(sectionInfo[CodeSectionIndex].size),
storageMap);
delete[] fileData;
DPRINTF(HSALoader, "BRIG object %s loaded.\n", fname);
}
BrigObject::~BrigObject()
{
for (int i = 0; i < NumSectionIndices; ++i)
if (sectionInfo[i].ptr)
delete[] sectionInfo[i].ptr;
}

View file

@@ -0,0 +1,134 @@
/*
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Steve Reinhardt, Anthony Gutierrez
*/
#ifndef __BRIG_OBJECT_HH__
#define __BRIG_OBJECT_HH__
#include <cassert>
#include <cstdint>
#include <string>
#include <vector>
#include "arch/hsail/Brig.h"
#include "gpu-compute/hsa_object.hh"
#include "gpu-compute/hsail_code.hh"
class LabelMap;
class StorageMap;
/* @class BrigObject
* this class implements the BRIG loader object, and
* is used when the simulator directly executes HSAIL.
* this class is responsible for extracting all
* information about kernels contained in BRIG format
* and converts them to HsailCode objects that are
* usable by the simulator and emulated runtime.
*/
class BrigObject final : public HsaObject
{
public:
enum SectionIndex
{
DataSectionIndex,
CodeSectionIndex,
OperandsSectionIndex,
NumSectionIndices
};
static const char *sectionNames[];
struct SectionInfo
{
uint8_t *ptr;
int size;
};
static HsaObject* tryFile(const std::string &fname, int len,
uint8_t *fileData);
SectionInfo sectionInfo[NumSectionIndices];
const uint8_t *getSectionOffset(enum SectionIndex sec, int offs) const;
std::vector<HsailCode*> kernels;
std::vector<HsailCode*> functions;
std::string kern_block_name;
void processDirectives(const Brig::BrigBase *dirPtr,
const Brig::BrigBase *endPtr,
StorageMap *storageMap);
BrigObject(const std::string &fname, int len, uint8_t *fileData);
~BrigObject();
// eventually these will need to be per-kernel not per-object-file
StorageMap *storageMap;
LabelMap *labelMap;
const char* getString(int offs) const;
const Brig::BrigData* getBrigBaseData(int offs) const;
const uint8_t* getData(int offs) const;
const Brig::BrigBase* getCodeSectionEntry(int offs) const;
const Brig::BrigOperand* getOperand(int offs) const;
unsigned getOperandPtr(int offs, int index) const;
const Brig::BrigInstBase* getInst(int offs) const;
HsaCode* getKernel(const std::string &name) const override;
HsaCode* getFunction(const std::string &name) const override;
int numKernels() const override { return kernels.size(); }
HsaCode* getKernel(int i) const override { return kernels[i]; }
// pointer to the current kernel/function we're processing, so elements
// under construction can reference it. kinda ugly, but easier
// than passing it all over for the few places it's needed.
mutable HsailCode *currentCode;
};
// Utility function to bump Brig item pointer to next element given
// item size in bytes. Really just an add but with lots of casting.
template<typename T>
T*
brigNext(T *ptr)
{
Brig::BrigBase *base_ptr = (Brig::BrigBase*)ptr;
int size = base_ptr->byteCount;
assert(size);
return (T*)((uint8_t*)ptr + size);
}
#endif // __BRIG_OBJECT_HH__

View file

@@ -0,0 +1,272 @@
/*
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Anthony Gutierrez
*/
#include "gpu-compute/cl_driver.hh"
#include "base/intmath.hh"
#include "cpu/thread_context.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/hsa_code.hh"
#include "gpu-compute/hsa_kernel_info.hh"
#include "gpu-compute/hsa_object.hh"
#include "params/ClDriver.hh"
#include "sim/process.hh"
#include "sim/syscall_emul_buf.hh"
ClDriver::ClDriver(ClDriverParams *p)
: EmulatedDriver(p), hsaCode(0)
{
for (const auto &codeFile : p->codefile)
codeFiles.push_back(&codeFile);
maxFuncArgsSize = 0;
for (int i = 0; i < codeFiles.size(); ++i) {
HsaObject *obj = HsaObject::createHsaObject(*codeFiles[i]);
for (int k = 0; k < obj->numKernels(); ++k) {
assert(obj->getKernel(k));
kernels.push_back(obj->getKernel(k));
kernels.back()->setReadonlyData((uint8_t*)obj->readonlyData);
int kern_funcargs_size = kernels.back()->funcarg_size;
maxFuncArgsSize = maxFuncArgsSize < kern_funcargs_size ?
kern_funcargs_size : maxFuncArgsSize;
}
}
int name_offs = 0;
int code_offs = 0;
for (int i = 0; i < kernels.size(); ++i) {
kernelInfo.push_back(HsaKernelInfo());
HsaCode *k = kernels[i];
k->generateHsaKernelInfo(&kernelInfo[i]);
kernelInfo[i].name_offs = name_offs;
kernelInfo[i].code_offs = code_offs;
name_offs += k->name().size() + 1;
code_offs += k->numInsts() * sizeof(GPUStaticInst*);
}
}
void
ClDriver::handshake(GpuDispatcher *_dispatcher)
{
dispatcher = _dispatcher;
dispatcher->setFuncargsSize(maxFuncArgsSize);
}
int
ClDriver::open(LiveProcess *p, ThreadContext *tc, int mode, int flags)
{
int fd = p->allocFD(-1, filename, 0, 0, false);
FDEntry *fde = p->getFDEntry(fd);
fde->driver = this;
return fd;
}
int
ClDriver::ioctl(LiveProcess *process, ThreadContext *tc, unsigned req)
{
int index = 2;
Addr buf_addr = process->getSyscallArg(tc, index);
switch (req) {
case HSA_GET_SIZES:
{
TypedBufferArg<HsaDriverSizes> sizes(buf_addr);
sizes->num_kernels = kernels.size();
sizes->string_table_size = 0;
sizes->code_size = 0;
sizes->readonly_size = 0;
if (kernels.size() > 0) {
// all kernels will share the same read-only memory
sizes->readonly_size =
kernels[0]->getSize(HsaCode::MemorySegment::READONLY);
// check our assumption
for (int i = 1; i<kernels.size(); ++i) {
assert(sizes->readonly_size ==
kernels[i]->getSize(HsaCode::MemorySegment::READONLY));
}
}
for (int i = 0; i < kernels.size(); ++i) {
HsaCode *k = kernels[i];
// add one for terminating '\0'
sizes->string_table_size += k->name().size() + 1;
sizes->code_size += k->numInsts() * sizeof(GPUStaticInst*);
}
sizes.copyOut(tc->getMemProxy());
}
break;
case HSA_GET_KINFO:
{
TypedBufferArg<HsaKernelInfo>
kinfo(buf_addr, sizeof(HsaKernelInfo) * kernels.size());
for (int i = 0; i < kernels.size(); ++i) {
HsaKernelInfo *ki = &kinfo[i];
ki->name_offs = kernelInfo[i].name_offs;
ki->code_offs = kernelInfo[i].code_offs;
ki->sRegCount = kernelInfo[i].sRegCount;
ki->dRegCount = kernelInfo[i].dRegCount;
ki->cRegCount = kernelInfo[i].cRegCount;
ki->static_lds_size = kernelInfo[i].static_lds_size;
ki->private_mem_size = kernelInfo[i].private_mem_size;
ki->spill_mem_size = kernelInfo[i].spill_mem_size;
}
kinfo.copyOut(tc->getMemProxy());
}
break;
case HSA_GET_STRINGS:
{
int string_table_size = 0;
for (int i = 0; i < kernels.size(); ++i) {
HsaCode *k = kernels[i];
string_table_size += k->name().size() + 1;
}
BufferArg buf(buf_addr, string_table_size);
char *bufp = (char*)buf.bufferPtr();
for (int i = 0; i < kernels.size(); ++i) {
HsaCode *k = kernels[i];
const char *n = k->name().c_str();
// idiomatic string copy
while ((*bufp++ = *n++));
}
assert(bufp - (char *)buf.bufferPtr() == string_table_size);
buf.copyOut(tc->getMemProxy());
}
break;
case HSA_GET_READONLY_DATA:
{
// we can pick any kernel --- they share the same
// readonly segment (this assumption is checked in GET_SIZES)
uint64_t size =
kernels.back()->getSize(HsaCode::MemorySegment::READONLY);
BufferArg data(buf_addr, size);
char *datap = (char *)data.bufferPtr();
memcpy(datap,
kernels.back()->readonly_data,
size);
data.copyOut(tc->getMemProxy());
}
break;
case HSA_GET_CODE:
{
// set hsaCode pointer
hsaCode = buf_addr;
int code_size = 0;
for (int i = 0; i < kernels.size(); ++i) {
HsaCode *k = kernels[i];
code_size += k->numInsts() * sizeof(TheGpuISA::RawMachInst);
}
TypedBufferArg<TheGpuISA::RawMachInst> buf(buf_addr, code_size);
TheGpuISA::RawMachInst *bufp = buf;
int buf_idx = 0;
for (int i = 0; i < kernels.size(); ++i) {
HsaCode *k = kernels[i];
for (int j = 0; j < k->numInsts(); ++j) {
bufp[buf_idx] = k->insts()->at(j);
++buf_idx;
}
}
buf.copyOut(tc->getMemProxy());
}
break;
case HSA_GET_CU_CNT:
{
BufferArg buf(buf_addr, sizeof(uint32_t));
*((uint32_t*)buf.bufferPtr()) = dispatcher->getNumCUs();
buf.copyOut(tc->getMemProxy());
}
break;
case HSA_GET_VSZ:
{
BufferArg buf(buf_addr, sizeof(uint32_t));
*((uint32_t*)buf.bufferPtr()) = VSZ;
buf.copyOut(tc->getMemProxy());
}
break;
default:
fatal("ClDriver: bad ioctl %d\n", req);
}
return 0;
}
const char*
ClDriver::codeOffToKernelName(uint64_t code_ptr)
{
assert(hsaCode);
uint32_t code_offs = code_ptr - hsaCode;
for (int i = 0; i < kernels.size(); ++i) {
if (code_offs == kernelInfo[i].code_offs) {
return kernels[i]->name().c_str();
}
}
return nullptr;
}
ClDriver*
ClDriverParams::create()
{
return new ClDriver(this);
}

View file

@@ -0,0 +1,77 @@
/*
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Anthony Gutierrez
*/
#ifndef __CL_DRIVER_HH__
#define __CL_DRIVER_HH__
#include <vector>
#include "gpu-compute/hsa_kernel_info.hh"
#include "sim/emul_driver.hh"
class GpuDispatcher;
class HsaCode;
class LiveProcess;
class ThreadContext;
struct ClDriverParams;
class ClDriver final : public EmulatedDriver
{
public:
ClDriver(ClDriverParams *p);
void handshake(GpuDispatcher *_dispatcher);
int open(LiveProcess *p, ThreadContext *tc, int mode, int flags);
int ioctl(LiveProcess *p, ThreadContext *tc, unsigned req);
const char* codeOffToKernelName(uint64_t code_ptr);
private:
GpuDispatcher *dispatcher;
std::vector<const std::string*> codeFiles;
// All the kernels we know about
std::vector<HsaCode*> kernels;
std::vector<HsaCode*> functions;
std::vector<HsaKernelInfo> kernelInfo;
// maximum size necessary for function arguments
int maxFuncArgsSize;
// The host virtual address for the kernel code
uint64_t hsaCode;
};
#endif // __CL_DRIVER_HH__

View file

@@ -0,0 +1,51 @@
/*
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Marc Orr
*/
#ifndef __GPU_CL_EVENT_HH__
#define __GPU_CL_EVENT_HH__
struct HsaQueueEntry;
class _cl_event {
public:
_cl_event() : done(false), hsaTaskPtr(nullptr), start(0), end(0) { }
volatile bool done;
HsaQueueEntry *hsaTaskPtr;
uint64_t start;
uint64_t end;
};
#endif // __GPU_CL_EVENT_HH__

View file

@@ -0,0 +1,116 @@
/*
* Copyright (c) 2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Anthony Gutierrez
*/
#ifndef __CODE_ENUMS_HH__
#define __CODE_ENUMS_HH__
#define IS_OT_GLOBAL(a) ((a)>=Enums::OT_GLOBAL_READ \
&& (a)<=Enums::OT_GLOBAL_LDAS)
#define IS_OT_SHARED(a) ((a)>=Enums::OT_SHARED_READ \
&& (a)<=Enums::OT_SHARED_LDAS)
#define IS_OT_PRIVATE(a) ((a)>=Enums::OT_PRIVATE_READ \
&& (a)<=Enums::OT_PRIVATE_LDAS)
#define IS_OT_SPILL(a) ((a)>=Enums::OT_SPILL_READ \
&& (a)<=Enums::OT_SPILL_LDAS)
#define IS_OT_READONLY(a) ((a)>=Enums::OT_READONLY_READ \
&& (a)<=Enums::OT_READONLY_LDAS)
#define IS_OT_FLAT(a) ((a)>=Enums::OT_FLAT_READ && (a)<=Enums::OT_FLAT_LDAS)
#define IS_OT_LDAS(a) ((a)==Enums::OT_GLOBAL_LDAS||(a)==Enums::OT_SHARED_LDAS \
||(a)==Enums::OT_PRIVATE_LDAS||(a)==Enums::OT_SPILL_LDAS \
||(a)==Enums::OT_READONLY_LDAS||(a)==Enums::OT_FLAT_LDAS)
#define IS_OT_READ(a) ((a)==Enums::OT_GLOBAL_READ||(a)==Enums::OT_SHARED_READ \
||(a)==Enums::OT_PRIVATE_READ||(a)==Enums::OT_SPILL_READ \
||(a)==Enums::OT_READONLY_READ||(a)==Enums::OT_FLAT_READ)
#define IS_OT_READ_GM(a) \
((a)==Enums::OT_GLOBAL_READ||(a)==Enums::OT_SPILL_READ \
||(a)==Enums::OT_READONLY_READ)
#define IS_OT_READ_LM(a) ((a)==Enums::OT_SHARED_READ)
#define IS_OT_READ_RM(a) ((a)==Enums::OT_READONLY_READ)
#define IS_OT_READ_PM(a) ((a)==Enums::OT_PRIVATE_READ)
#define IS_OT_WRITE(a) \
((a)==Enums::OT_GLOBAL_WRITE||(a)==Enums::OT_SHARED_WRITE \
||(a)==Enums::OT_PRIVATE_WRITE||(a)==Enums::OT_SPILL_WRITE \
||(a)==Enums::OT_READONLY_WRITE||(a)==Enums::OT_FLAT_WRITE)
#define IS_OT_WRITE_GM(a) \
((a)==Enums::OT_GLOBAL_WRITE||(a)==Enums::OT_SPILL_WRITE \
||(a)==Enums::OT_READONLY_WRITE)
#define IS_OT_WRITE_LM(a) ((a)==Enums::OT_SHARED_WRITE)
#define IS_OT_WRITE_PM(a) ((a)==Enums::OT_PRIVATE_WRITE)
#define IS_OT_ATOMIC(a) ((a)==Enums::OT_GLOBAL_ATOMIC \
||(a)==Enums::OT_SHARED_ATOMIC \
||(a)==Enums::OT_PRIVATE_ATOMIC \
||(a)==Enums::OT_SPILL_ATOMIC \
||(a)==Enums::OT_READONLY_ATOMIC \
||(a)==Enums::OT_FLAT_ATOMIC)
#define IS_OT_ATOMIC_GM(a) ((a)==Enums::OT_GLOBAL_ATOMIC \
||(a)==Enums::OT_SPILL_ATOMIC \
||(a)==Enums::OT_READONLY_ATOMIC \
||(a)==Enums::OT_GLOBAL_MEMFENCE \
||(a)==Enums::OT_BOTH_MEMFENCE)
#define IS_OT_ATOMIC_LM(a) ((a)==Enums::OT_SHARED_ATOMIC \
||(a)==Enums::OT_SHARED_MEMFENCE \
||(a)==Enums::OT_BOTH_MEMFENCE)
#define IS_OT_ATOMIC_PM(a) ((a)==Enums::OT_PRIVATE_ATOMIC)
#define IS_OT_HIST(a) ((a)==Enums::OT_GLOBAL_HIST \
||(a)==Enums::OT_SHARED_HIST \
||(a)==Enums::OT_PRIVATE_HIST \
||(a)==Enums::OT_SPILL_HIST \
||(a)==Enums::OT_READONLY_HIST \
||(a)==Enums::OT_FLAT_HIST)
#define IS_OT_HIST_GM(a) ((a)==Enums::OT_GLOBAL_HIST \
||(a)==Enums::OT_SPILL_HIST \
||(a)==Enums::OT_READONLY_HIST)
#define IS_OT_HIST_LM(a) ((a)==Enums::OT_SHARED_HIST)
#define IS_OT_HIST_PM(a) ((a)==Enums::OT_PRIVATE_HIST)
#endif // __CODE_ENUMS_HH__

File diff suppressed because it is too large

View file

@@ -0,0 +1,767 @@
/*
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: John Kalamatianos, Anthony Gutierrez
*/
#ifndef __COMPUTE_UNIT_HH__
#define __COMPUTE_UNIT_HH__
#include <deque>
#include <map>
#include <unordered_map>
#include <vector>
#include "base/callback.hh"
#include "base/statistics.hh"
#include "base/types.hh"
#include "enums/PrefetchType.hh"
#include "gpu-compute/exec_stage.hh"
#include "gpu-compute/fetch_stage.hh"
#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/local_memory_pipeline.hh"
#include "gpu-compute/qstruct.hh"
#include "gpu-compute/schedule_stage.hh"
#include "gpu-compute/scoreboard_check_stage.hh"
#include "mem/mem_object.hh"
#include "mem/port.hh"
static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1;
static const int MAX_WIDTH_FOR_MEM_INST = 32;
class NDRange;
class Shader;
class VectorRegisterFile;
struct ComputeUnitParams;
enum EXEC_POLICY
{
OLDEST = 0,
RR
};
// List of execution units
enum EXEC_UNIT
{
SIMD0 = 0,
SIMD1,
SIMD2,
SIMD3,
GLBMEM_PIPE,
LDSMEM_PIPE,
NUM_UNITS
};
enum TLB_CACHE
{
TLB_MISS_CACHE_MISS = 0,
TLB_MISS_CACHE_HIT,
TLB_HIT_CACHE_MISS,
TLB_HIT_CACHE_HIT
};
class ComputeUnit : public MemObject
{
public:
FetchStage fetchStage;
ScoreboardCheckStage scoreboardCheckStage;
ScheduleStage scheduleStage;
ExecStage execStage;
GlobalMemPipeline globalMemoryPipe;
LocalMemPipeline localMemoryPipe;
// Buffers used to communicate between various pipeline stages
// List of waves which are ready to be scheduled.
// Each execution resource has a ready list. readyList is
// used to communicate between scoreboardCheck stage and
// schedule stage
// TODO: make enum to index readyList
std::vector<std::vector<Wavefront*>> readyList;
// Stores the status of waves. A READY implies the
// wave is ready to be scheduled this cycle and
// is already present in the readyList. waveStatusList is
// used to communicate between scoreboardCheck stage and
// schedule stage
// TODO: convert std::pair to a class to increase readability
std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList;
// List of waves which will be dispatched to
// each execution resource. A FILLED implies
// dispatch list is non-empty and
// execution unit has something to execute
// this cycle. Currently, the dispatch list of
// an execution resource can hold only one wave because
// an execution resource can execute only one wave in a cycle.
// dispatchList is used to communicate between schedule
// and exec stage
// TODO: convert std::pair to a class to increase readability
std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList;
int rrNextMemID; // used by RR WF exec policy to cycle through WF's
int rrNextALUWp;
typedef ComputeUnitParams Params;
std::vector<std::vector<Wavefront*>> wfList;
int cu_id;
// array of vector register files, one per SIMD
std::vector<VectorRegisterFile*> vrf;
// Number of vector ALU units (SIMDs) in CU
int numSIMDs;
// number of pipe stages for bypassing data to next dependent single
// precision vector instruction inside the vector ALU pipeline
int spBypassPipeLength;
// number of pipe stages for bypassing data to next dependent double
// precision vector instruction inside the vector ALU pipeline
int dpBypassPipeLength;
// number of cycles per issue period
int issuePeriod;
// Number of global and local memory execution resources in CU
int numGlbMemUnits;
int numLocMemUnits;
// tracks the last cycle a vector instruction was executed on a SIMD
std::vector<uint64_t> lastExecCycle;
// true if we allow a separate TLB per lane
bool perLaneTLB;
// if 0, TLB prefetching is off.
int prefetchDepth;
// if fixed-stride prefetching, this is the stride.
int prefetchStride;
class LastVaddrWave
{
public:
Addr vaddrs[VSZ];
Addr& operator[](int idx) {
return vaddrs[idx];
}
LastVaddrWave() {
for (int i = 0; i < VSZ; ++i)
vaddrs[i] = 0;
}
};
LastVaddrWave lastVaddrCU;
std::vector<LastVaddrWave> lastVaddrPhase;
std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
Enums::PrefetchType prefetchType;
EXEC_POLICY exec_policy;
bool xact_cas_mode;
bool debugSegFault;
bool functionalTLB;
bool localMemBarrier;
/*
* for Counting page accesses
*
* cuExitCallback inherits from Callback. When you register a callback
* function as an exit callback, it will get added to an exit callback
* queue, such that on simulation exit, all callbacks in the callback
* queue will have their process() function called.
*/
bool countPages;
Shader *shader;
uint32_t barrier_id;
// vector of Vector ALU (MACC) pipelines
std::vector<WaitClass> aluPipe;
// minimum issue period per SIMD unit (in cycles)
std::vector<WaitClass> wfWait;
// Resource control for Vector Register File->Global Memory pipe buses
std::vector<WaitClass> vrfToGlobalMemPipeBus;
// Resource control for Vector Register File->Local Memory pipe buses
std::vector<WaitClass> vrfToLocalMemPipeBus;
int nextGlbMemBus;
int nextLocMemBus;
// Resource control for global memory to VRF data/address bus
WaitClass glbMemToVrfBus;
// Resource control for local memory to VRF data/address bus
WaitClass locMemToVrfBus;
uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
uint32_t numCyclesPerStoreTransfer; // number of cycles per vector store
uint32_t numCyclesPerLoadTransfer; // number of cycles per vector load
Tick req_tick_latency;
Tick resp_tick_latency;
// number of vector registers being reserved for each SIMD unit
std::vector<int> vectorRegsReserved;
// number of vector registers per SIMD unit
uint32_t numVecRegsPerSimd;
// Support for scheduling VGPR status update events
std::vector<std::pair<uint32_t, uint32_t> > regIdxVec;
std::vector<uint64_t> timestampVec;
std::vector<uint8_t> statusVec;
void
registerEvent(uint32_t simdId,
uint32_t regIdx,
uint32_t operandSize,
uint64_t when,
uint8_t newStatus) {
regIdxVec.push_back(std::make_pair(simdId, regIdx));
timestampVec.push_back(when);
statusVec.push_back(newStatus);
if (operandSize > 4) {
regIdxVec.push_back(std::make_pair(simdId,
((regIdx + 1) %
numVecRegsPerSimd)));
timestampVec.push_back(when);
statusVec.push_back(newStatus);
}
}
void updateEvents();
// this hash map will keep track of page divergence
// per memory instruction per wavefront. The hash map
// is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
std::map<Addr, int> pagesTouched;
ComputeUnit(const Params *p);
~ComputeUnit();
int spBypassLength() { return spBypassPipeLength; };
int dpBypassLength() { return dpBypassPipeLength; };
int storeBusLength() { return numCyclesPerStoreTransfer; };
int loadBusLength() { return numCyclesPerLoadTransfer; };
int wfSize() const { return wavefrontSize; };
void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
void exec();
void initiateFetch(Wavefront *wavefront);
void fetch(PacketPtr pkt, Wavefront *wavefront);
void FillKernelState(Wavefront *w, NDRange *ndr);
void StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
int trueWgSizeTotal);
void InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
int trueWgSize[], int trueWgSizeTotal,
LdsChunk *ldsChunk, uint64_t origSpillMemStart);
void StartWorkgroup(NDRange *ndr);
int ReadyWorkgroup(NDRange *ndr);
bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; }
bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; }
bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; }
int GlbMemUnitId() { return GLBMEM_PIPE; }
int ShrMemUnitId() { return LDSMEM_PIPE; }
int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; }
int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; }
/* This function cycles through all the wavefronts in all the phases to see
* if all of the wavefronts which should be associated with one barrier
* (denoted with _barrier_id), are all at the same barrier in the program
* (denoted by bcnt). When the number at the barrier matches bslots, then
* return true.
*/
int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots);
bool cedeSIMD(int simdId, int wfSlotId);
template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr gpuDynInst);
virtual void init();
void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
bool kernelLaunch=true,
RequestPtr req=nullptr);
void handleMemPacket(PacketPtr pkt, int memport_index);
bool processTimingPacket(PacketPtr pkt);
void processFetchReturn(PacketPtr pkt);
void updatePageDivergenceDist(Addr addr);
MasterID masterId() { return _masterId; }
bool isDone() const;
bool isSimdDone(uint32_t) const;
protected:
MasterID _masterId;
LdsState &lds;
public:
// the following stats compute the avg. TLB access latency per
// uncoalesced request (only for data)
Stats::Scalar tlbRequests;
Stats::Scalar tlbCycles;
Stats::Formula tlbLatency;
// hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table.
Stats::Vector hitsPerTLBLevel;
Stats::Scalar ldsBankAccesses;
Stats::Distribution ldsBankConflictDist;
// over all memory instructions executed over all wavefronts
// how many touched 0-4 pages, 4-8, ..., 60-64 pages
Stats::Distribution pageDivergenceDist;
Stats::Scalar dynamicGMemInstrCnt;
Stats::Scalar dynamicLMemInstrCnt;
Stats::Scalar wgBlockedDueLdsAllocation;
// Number of instructions executed, i.e., incremented by 1 per committed
// instruction regardless of how many lanes (64, 32, or even 7) are active
Stats::Scalar numInstrExecuted;
// Number of cycles among successive instruction executions across all
// wavefronts of the same CU
Stats::Distribution execRateDist;
// number of individual vector operations executed
Stats::Scalar numVecOpsExecuted;
// Total cycles that something is running on the GPU
Stats::Scalar totalCycles;
Stats::Formula vpc; // vector ops per cycle
Stats::Formula ipc; // vector instructions per cycle
Stats::Distribution controlFlowDivergenceDist;
Stats::Distribution activeLanesPerGMemInstrDist;
Stats::Distribution activeLanesPerLMemInstrDist;
// number of vector ALU instructions received
Stats::Formula numALUInstsExecuted;
// number of times a WG can not start due to lack of free VGPRs in SIMDs
Stats::Scalar numTimesWgBlockedDueVgprAlloc;
Stats::Scalar numCASOps;
Stats::Scalar numFailedCASOps;
Stats::Scalar completedWfs;
// flag per vector SIMD unit that is set when there is at least one
// WV that has a vector ALU instruction as the oldest in its
// Instruction Buffer: Defined in the Scoreboard stage, consumed
// by the Execute stage.
std::vector<bool> vectorAluInstAvail;
// number of available (oldest) LDS instructions that could have
// been issued to the LDS at a specific issue slot
int shrMemInstAvail;
// number of available Global memory instructions that could have
// been issued to TCP at a specific issue slot
int glbMemInstAvail;
void
regStats();
LdsState &
getLds() const
{
return lds;
}
int32_t
getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;
bool
sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result));
typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
pageDataStruct pageAccesses;
class CUExitCallback : public Callback
{
private:
ComputeUnit *computeUnit;
public:
virtual ~CUExitCallback() { }
CUExitCallback(ComputeUnit *_cu)
{
computeUnit = _cu;
}
virtual void
process();
};
CUExitCallback *cuExitCallback;
/** Data access Port **/
class DataPort : public MasterPort
{
public:
DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
: MasterPort(_name, _cu), computeUnit(_cu),
index(_index) { }
bool snoopRangeSent;
struct SenderState : public Packet::SenderState
{
GPUDynInstPtr _gpuDynInst;
int port_index;
Packet::SenderState *saved;
SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
Packet::SenderState *sender_state=nullptr)
: _gpuDynInst(gpuDynInst),
port_index(_port_index),
saved(sender_state) { }
};
class MemReqEvent : public Event
{
private:
DataPort *dataPort;
PacketPtr pkt;
public:
MemReqEvent(DataPort *_data_port, PacketPtr _pkt)
: Event(), dataPort(_data_port), pkt(_pkt)
{
setFlags(Event::AutoDelete);
}
void process();
const char *description() const;
};
class MemRespEvent : public Event
{
private:
DataPort *dataPort;
PacketPtr pkt;
public:
MemRespEvent(DataPort *_data_port, PacketPtr _pkt)
: Event(), dataPort(_data_port), pkt(_pkt)
{
setFlags(Event::AutoDelete);
}
void process();
const char *description() const;
};
std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;
protected:
ComputeUnit *computeUnit;
int index;
virtual bool recvTimingResp(PacketPtr pkt);
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
virtual void recvFunctional(PacketPtr pkt) { }
virtual void recvRangeChange() { }
virtual void recvReqRetry();
virtual void
getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
{
resp.clear();
snoop = true;
}
};
// Instruction cache access port
class SQCPort : public MasterPort
{
public:
SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
: MasterPort(_name, _cu), computeUnit(_cu),
index(_index) { }
bool snoopRangeSent;
struct SenderState : public Packet::SenderState
{
Wavefront *wavefront;
Packet::SenderState *saved;
SenderState(Wavefront *_wavefront, Packet::SenderState
*sender_state=nullptr)
: wavefront(_wavefront), saved(sender_state) { }
};
std::deque<std::pair<PacketPtr, Wavefront*>> retries;
protected:
ComputeUnit *computeUnit;
int index;
virtual bool recvTimingResp(PacketPtr pkt);
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
virtual void recvFunctional(PacketPtr pkt) { }
virtual void recvRangeChange() { }
virtual void recvReqRetry();
virtual void
getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
{
resp.clear();
snoop = true;
}
};
/** Data TLB port **/
class DTLBPort : public MasterPort
{
public:
DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
: MasterPort(_name, _cu), computeUnit(_cu),
index(_index), stalled(false)
{ }
bool isStalled() { return stalled; }
void stallPort() { stalled = true; }
void unstallPort() { stalled = false; }
/**
* here we queue all the translation requests that were
* not successfully sent.
*/
std::deque<PacketPtr> retries;
/** SenderState is information carried along with the packet
* throughout the TLB hierarchy
*/
struct SenderState: public Packet::SenderState
{
// the memInst that this is associated with
GPUDynInstPtr _gpuDynInst;
// the lane in the memInst this is associated with, so we send
// the memory request down the right port
int portIndex;
// constructor used for packets involved in timing accesses
SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
: _gpuDynInst(gpuDynInst), portIndex(port_index) { }
};
protected:
ComputeUnit *computeUnit;
int index;
bool stalled;
virtual bool recvTimingResp(PacketPtr pkt);
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
virtual void recvFunctional(PacketPtr pkt) { }
virtual void recvRangeChange() { }
virtual void recvReqRetry();
};
class ITLBPort : public MasterPort
{
public:
ITLBPort(const std::string &_name, ComputeUnit *_cu)
: MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { }
bool isStalled() { return stalled; }
void stallPort() { stalled = true; }
void unstallPort() { stalled = false; }
/**
* here we queue all the translation requests that were
* not successfully sent.
*/
std::deque<PacketPtr> retries;
/** SenderState is information carried along with the packet
* throughout the TLB hierarchy
*/
struct SenderState: public Packet::SenderState
{
// The wavefront associated with this request
Wavefront *wavefront;
SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
};
protected:
ComputeUnit *computeUnit;
bool stalled;
virtual bool recvTimingResp(PacketPtr pkt);
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
virtual void recvFunctional(PacketPtr pkt) { }
virtual void recvRangeChange() { }
virtual void recvReqRetry();
};
/**
* the port intended to communicate between the CU and its LDS
*/
class LDSPort : public MasterPort
{
public:
LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id)
: MasterPort(_name, _cu, _id), computeUnit(_cu)
{
}
bool isStalled() const { return stalled; }
void stallPort() { stalled = true; }
void unstallPort() { stalled = false; }
/**
* here we queue all the requests that were
* not successfully sent.
*/
std::queue<PacketPtr> retries;
/**
* SenderState is information carried along with the packet, esp. the
* GPUDynInstPtr
*/
class SenderState: public Packet::SenderState
{
protected:
// The actual read/write/atomic request that goes with this command
GPUDynInstPtr _gpuDynInst = nullptr;
public:
SenderState(GPUDynInstPtr gpuDynInst):
_gpuDynInst(gpuDynInst)
{
}
GPUDynInstPtr
getMemInst() const
{
return _gpuDynInst;
}
};
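        // overridden so the CU can stall the port and queue packets on the
        // retries list when the LDS cannot accept them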
virtual bool
sendTimingReq(PacketPtr pkt);
protected:
bool stalled = false; ///< whether or not it is stalled
ComputeUnit *computeUnit;
virtual bool
recvTimingResp(PacketPtr pkt);
virtual Tick
recvAtomic(PacketPtr pkt) { return 0; }
virtual void
recvFunctional(PacketPtr pkt)
{
}
virtual void
recvRangeChange()
{
}
virtual void
recvReqRetry();
};
/** The port to access the Local Data Store
     * Can be connected to an LDS object
*/
LDSPort *ldsPort = nullptr;
LDSPort *
getLdsPort() const
{
return ldsPort;
}
/** The memory port for SIMD data accesses.
     * Can be connected to PhysMem or Ruby for timing simulations
*/
std::vector<DataPort*> memPort;
// port to the TLB hierarchy (i.e., the L1 TLB)
std::vector<DTLBPort*> tlbPort;
// port to the SQC (i.e. the I-cache)
SQCPort *sqcPort;
// port to the SQC TLB (there's a separate TLB for each I-cache)
ITLBPort *sqcTLBPort;
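    // Return (lazily allocating) the master port that corresponds to
    // if_name. The data, translation and SQC ports are indexed by idx;
    // only one LDS port may ever be created.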
virtual BaseMasterPort&
getMasterPort(const std::string &if_name, PortID idx)
{
if (if_name == "memory_port") {
memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx),
this, idx);
return *memPort[idx];
} else if (if_name == "translation_port") {
tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx),
this, idx);
return *tlbPort[idx];
} else if (if_name == "sqc_port") {
sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx),
this, idx);
return *sqcPort;
} else if (if_name == "sqc_tlb_port") {
sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this);
return *sqcTLBPort;
} else if (if_name == "ldsPort") {
if (ldsPort) {
fatal("an LDS port was already allocated");
}
ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx);
return *ldsPort;
} else {
panic("incorrect port name");
}
}
    // bookkeeping used by xact_cas_load(): per-entry queues of wave
    // identifiers
class waveIdentifier
{
public:
waveIdentifier() { }
waveIdentifier(int _simdId, int _wfSlotId)
: simdId(_simdId), wfSlotId(_wfSlotId) { }
int simdId;
int wfSlotId;
};
class waveQueue
{
public:
std::list<waveIdentifier> waveIDQueue;
};
std::map<unsigned, waveQueue> xactCasLoadMap;
uint64_t getAndIncSeqNum() { return globalSeqNum++; }
private:
uint64_t globalSeqNum;
int wavefrontSize;
};
#endif // __COMPUTE_UNIT_HH__

View file

@@ -0,0 +1,83 @@
/*
* Copyright (c) 2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: John Kalamatianos
*/
#include "gpu-compute/condition_register_state.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
ConditionRegisterState::ConditionRegisterState()
{
computeUnit = nullptr;
c_reg.clear();
busy.clear();
}
void
ConditionRegisterState::setParent(ComputeUnit *_computeUnit)
{
computeUnit = _computeUnit;
_name = computeUnit->name() + ".CondRegState";
}
void
ConditionRegisterState::init(uint32_t _size)
{
c_reg.resize(_size);
busy.resize(_size, 0);
}
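// Mark any condition-register destination operands of instruction ii as
// busy, and schedule the events that will mark them ready again after the
// single-precision bypass pipeline delay.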
void
ConditionRegisterState::exec(GPUStaticInst *ii, Wavefront *w)
{
// iterate over all operands
for (auto i = 0; i < ii->getNumOperands(); ++i) {
// is this a condition register destination operand?
if (ii->isCondRegister(i) && ii->isDstOperand(i)) {
// mark the register as busy
markReg(ii->getRegisterIndex(i), 1);
uint32_t pipeLen = w->computeUnit->spBypassLength();
// schedule an event for marking the register as ready
w->computeUnit->
registerEvent(w->simdId, ii->getRegisterIndex(i),
ii->getOperandSize(i),
w->computeUnit->shader->tick_cnt +
w->computeUnit->shader->ticks(pipeLen), 0);
}
}
}

View file

@@ -0,0 +1,101 @@
/*
* Copyright (c) 2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: John Kalamatianos
*/
#ifndef __CONDITION_REGISTER_STATE_HH__
#define __CONDITION_REGISTER_STATE_HH__
#include <string>
#include <vector>
#include "gpu-compute/misc.hh"
class ComputeUnit;
class GPUStaticInst;
class Shader;
class Wavefront;
// Condition Register State (used only when executing HSAIL)
class ConditionRegisterState
{
public:
ConditionRegisterState();
void init(uint32_t _size);
const std::string name() const { return _name; }
void setParent(ComputeUnit *_computeUnit);
void regStats() { }
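    // per-lane accessors: read() returns the stored condition bit
    // reinterpreted as type T; write() stores the least significant bit
    // of value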
template<typename T>
T
read(int regIdx, int threadId)
{
bool tmp = c_reg[regIdx][threadId];
T *p0 = (T*)(&tmp);
return *p0;
}
template<typename T>
void
write(int regIdx, int threadId, T value)
{
c_reg[regIdx][threadId] = (bool)(value & 0x01);
}
void
markReg(int regIdx, uint8_t value)
{
busy.at(regIdx) = value;
}
uint8_t
regBusy(int idx)
{
uint8_t status = busy.at(idx);
return status;
}
int numRegs() { return c_reg.size(); }
void exec(GPUStaticInst *ii, Wavefront *w);
private:
ComputeUnit* computeUnit;
std::string _name;
// Condition Register state
std::vector<VectorMask> c_reg;
// flag indicating if a register is busy
std::vector<uint8_t> busy;
};
#endif

View file

@@ -0,0 +1,394 @@
/*
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Brad Beckmann, Marc Orr
*/
#include "gpu-compute/dispatcher.hh"
#include "cpu/base.hh"
#include "debug/GPUDisp.hh"
#include "gpu-compute/cl_driver.hh"
#include "gpu-compute/cl_event.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/packet_access.hh"
GpuDispatcher *GpuDispatcher::instance = nullptr;
GpuDispatcher::GpuDispatcher(const Params *p)
: DmaDevice(p), _masterId(p->system->getMasterId(name() + ".disp")),
pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency),
dispatchCount(0), dispatchActive(false), cpu(p->cpu),
shader(p->shader_pointer), driver(p->cl_driver), tickEvent(this)
{
shader->handshake(this);
driver->handshake(this);
ndRange.wg_disp_rem = false;
ndRange.globalWgId = 0;
schedule(&tickEvent, 0);
// translation port for the dispatcher
    tlbPort = new TLBPort(csprintf("%s-port", name()), this);
num_kernelLaunched
.name(name() + ".num_kernel_launched")
.desc("number of kernel launched")
;
}
GpuDispatcher *GpuDispatcherParams::create()
{
GpuDispatcher *dispatcher = new GpuDispatcher(this);
GpuDispatcher::setInstance(dispatcher);
return GpuDispatcher::getInstance();
}
void
GpuDispatcher::serialize(CheckpointOut &cp) const
{
Tick event_tick = 0;
if (ndRange.wg_disp_rem)
fatal("Checkpointing not supported during active workgroup execution");
if (tickEvent.scheduled())
event_tick = tickEvent.when();
SERIALIZE_SCALAR(event_tick);
}
void
GpuDispatcher::unserialize(CheckpointIn &cp)
{
Tick event_tick;
if (tickEvent.scheduled())
deschedule(&tickEvent);
UNSERIALIZE_SCALAR(event_tick);
if (event_tick)
schedule(&tickEvent, event_tick);
}
AddrRangeList
GpuDispatcher::getAddrRanges() const
{
AddrRangeList ranges;
DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n",
pioAddr, pioSize);
ranges.push_back(RangeSize(pioAddr, pioSize));
return ranges;
}
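// PIO read handler: offset 0 returns the 64-bit dispatch-active flag;
// offsets of 8 and above read back a slice of the current HsaQueueEntry
// task descriptor.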
Tick
GpuDispatcher::read(PacketPtr pkt)
{
assert(pkt->getAddr() >= pioAddr);
assert(pkt->getAddr() < pioAddr + pioSize);
int offset = pkt->getAddr() - pioAddr;
pkt->allocate();
DPRINTF(GPUDisp, " read register %#x size=%d\n", offset, pkt->getSize());
if (offset < 8) {
assert(!offset);
assert(pkt->getSize() == 8);
uint64_t retval = dispatchActive;
pkt->set(retval);
} else {
offset -= 8;
assert(offset + pkt->getSize() < sizeof(HsaQueueEntry));
char *curTaskPtr = (char*)&curTask;
memcpy(pkt->getPtr<const void*>(), curTaskPtr + offset, pkt->getSize());
}
pkt->makeAtomicResponse();
return pioDelay;
}
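// PIO write handler: offsets of 8 and above populate the HsaQueueEntry
// task descriptor, while a write to offset 0 acts as the launch register
// and dispatches the kernel described by that descriptor.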
Tick
GpuDispatcher::write(PacketPtr pkt)
{
assert(pkt->getAddr() >= pioAddr);
assert(pkt->getAddr() < pioAddr + pioSize);
int offset = pkt->getAddr() - pioAddr;
#if TRACING_ON
uint64_t data_val = 0;
switch (pkt->getSize()) {
case 1:
data_val = pkt->get<uint8_t>();
break;
case 2:
data_val = pkt->get<uint16_t>();
break;
case 4:
data_val = pkt->get<uint32_t>();
break;
case 8:
data_val = pkt->get<uint64_t>();
break;
default:
DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize());
}
DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset, data_val,
pkt->getSize());
#endif
if (!offset) {
static int nextId = 0;
        // The depends field of the qstruct, which was previously unused, is
        // used to communicate with the simulated application.
if (curTask.depends) {
HostState hs;
shader->ReadMem((uint64_t)(curTask.depends), &hs,
sizeof(HostState), 0);
// update event start time (in nano-seconds)
uint64_t start = curTick() / 1000;
shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start),
&start, sizeof(uint64_t), 0);
}
// launch kernel
++num_kernelLaunched;
NDRange *ndr = &(ndRangeMap[nextId]);
// copy dispatch info
ndr->q = curTask;
// update the numDispTask polled by the runtime
accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1);
ndr->numWgTotal = 1;
for (int i = 0; i < 3; ++i) {
ndr->wgId[i] = 0;
ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]);
ndr->numWgTotal *= ndr->numWg[i];
}
ndr->numWgCompleted = 0;
ndr->globalWgId = 0;
ndr->wg_disp_rem = true;
ndr->execDone = false;
ndr->addrToNotify = (volatile bool*)curTask.addrToNotify;
ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft;
ndr->dispatchId = nextId;
ndr->curTid = pkt->req->threadId();
DPRINTF(GPUDisp, "launching kernel %d\n",nextId);
execIds.push(nextId);
++nextId;
dispatchActive = true;
if (!tickEvent.scheduled()) {
schedule(&tickEvent, curTick() + shader->ticks(1));
}
} else {
// populate current task struct
// first 64 bits are launch reg
offset -= 8;
assert(offset < sizeof(HsaQueueEntry));
char *curTaskPtr = (char*)&curTask;
memcpy(curTaskPtr + offset, pkt->getPtr<const void*>(), pkt->getSize());
}
pkt->makeAtomicResponse();
return pioDelay;
}
BaseMasterPort&
GpuDispatcher::getMasterPort(const std::string &if_name, PortID idx)
{
if (if_name == "translation_port") {
return *tlbPort;
}
return DmaDevice::getMasterPort(if_name, idx);
}
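// Main dispatch loop: try to dispatch the remaining workgroups of every
// outstanding kernel, re-queueing kernels whose workgroups do not
// currently fit, and wake the host for any kernels that completed.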
void
GpuDispatcher::exec()
{
int fail_count = 0;
// There are potentially multiple outstanding kernel launches.
// It is possible that the workgroups in a different kernel
// can fit on the GPU even if another kernel's workgroups cannot
DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size());
while (execIds.size() > fail_count) {
int execId = execIds.front();
while (ndRangeMap[execId].wg_disp_rem) {
            // update the thread context
shader->updateThreadContext(ndRangeMap[execId].curTid);
// attempt to dispatch_workgroup
if (!shader->dispatch_workgroups(&ndRangeMap[execId])) {
                // if we failed, try the next kernel;
                // it may have smaller workgroups.
                // put this one back on the queue to retry later
DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId);
execIds.push(execId);
++fail_count;
break;
}
}
// let's try the next kernel_id
execIds.pop();
}
DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size());
    // wake up the CPU if any kernels completed this cycle
    if (doneIds.size() && cpu) {
        shader->hostWakeUp(cpu);
    }
    while (doneIds.size()) {
DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front());
doneIds.pop();
}
}
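// Called when a wavefront's workgroup completes. Once the last workgroup
// of a kernel finishes, the NDRange is marked done, the host-visible
// notification flag and dispatch counter are updated, and the event end
// time is recorded.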
void
GpuDispatcher::notifyWgCompl(Wavefront *w)
{
int kern_id = w->kern_id;
DPRINTF(GPUDisp, "notify WgCompl %d\n",kern_id);
assert(ndRangeMap[kern_id].dispatchId == kern_id);
ndRangeMap[kern_id].numWgCompleted++;
if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) {
ndRangeMap[kern_id].execDone = true;
doneIds.push(kern_id);
if (ndRangeMap[kern_id].addrToNotify) {
accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify), 1,
0);
}
accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1);
// update event end time (in nano-seconds)
if (ndRangeMap[kern_id].q.depends) {
HostState *host_state = (HostState*)ndRangeMap[kern_id].q.depends;
uint64_t event;
shader->ReadMem((uint64_t)(&host_state->event), &event,
sizeof(uint64_t), 0);
uint64_t end = curTick() / 1000;
shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end,
sizeof(uint64_t), 0);
}
}
if (!tickEvent.scheduled()) {
schedule(&tickEvent, curTick() + shader->ticks(1));
}
}
void
GpuDispatcher::scheduleDispatch()
{
if (!tickEvent.scheduled())
schedule(&tickEvent, curTick() + shader->ticks(1));
}
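// Functionally update an integer that the simulated runtime polls in host
// memory: when off is non-zero the current value is read, adjusted by
// off, and written back; otherwise val is written directly.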
void
GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off)
{
if (cpu) {
if (off) {
shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq,
true);
val += off;
}
shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true);
} else {
panic("Cannot find host");
}
}
GpuDispatcher::TickEvent::TickEvent(GpuDispatcher *_dispatcher)
: Event(CPU_Tick_Pri), dispatcher(_dispatcher)
{
}
void
GpuDispatcher::TickEvent::process()
{
dispatcher->exec();
}
const char*
GpuDispatcher::TickEvent::description() const
{
return "GPU Dispatcher tick";
}
// helper functions for driver to retrieve GPU attributes
int
GpuDispatcher::getNumCUs()
{
return shader->cuList.size();
}
void
GpuDispatcher::setFuncargsSize(int funcargs_size)
{
shader->funcargs_size = funcargs_size;
}

View file

@@ -0,0 +1,163 @@
/*
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Brad Beckmann, Marc Orr
*/
#ifndef __GPU_DISPATCHER_HH__
#define __GPU_DISPATCHER_HH__
#include <queue>
#include <vector>
#include "base/statistics.hh"
#include "dev/dma_device.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/ndrange.hh"
#include "gpu-compute/qstruct.hh"
#include "mem/port.hh"
#include "params/GpuDispatcher.hh"
class BaseCPU;
class Shader;
class GpuDispatcher : public DmaDevice
{
public:
typedef GpuDispatcherParams Params;
class TickEvent : public Event
{
private:
GpuDispatcher *dispatcher;
public:
TickEvent(GpuDispatcher *);
void process();
const char *description() const;
};
MasterID masterId() { return _masterId; }
protected:
MasterID _masterId;
// Base and length of PIO register space
Addr pioAddr;
Addr pioSize;
Tick pioDelay;
HsaQueueEntry curTask;
std::unordered_map<int, NDRange> ndRangeMap;
NDRange ndRange;
// list of kernel_ids to launch
std::queue<int> execIds;
// list of kernel_ids that have finished
std::queue<int> doneIds;
uint64_t dispatchCount;
// is there a kernel in execution?
bool dispatchActive;
BaseCPU *cpu;
Shader *shader;
ClDriver *driver;
TickEvent tickEvent;
static GpuDispatcher *instance;
    // syscall emulation mode can have only 1 application running(?)
    // else we have to do some pid-based tagging
// unused
typedef std::unordered_map<uint64_t, uint64_t> TranslationBuffer;
TranslationBuffer tlb;
public:
/*statistics*/
Stats::Scalar num_kernelLaunched;
GpuDispatcher(const Params *p);
~GpuDispatcher() { }
void exec();
virtual void serialize(CheckpointOut &cp) const;
virtual void unserialize(CheckpointIn &cp);
void notifyWgCompl(Wavefront *w);
void scheduleDispatch();
void accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off);
// using singleton so that glue code can pass pointer locations
// to the dispatcher. when there are multiple dispatchers, we can
// call something like getInstance(index)
static void
setInstance(GpuDispatcher *_instance)
{
instance = _instance;
}
static GpuDispatcher* getInstance() { return instance; }
class TLBPort : public MasterPort
{
public:
TLBPort(const std::string &_name, GpuDispatcher *_dispatcher)
: MasterPort(_name, _dispatcher), dispatcher(_dispatcher) { }
protected:
GpuDispatcher *dispatcher;
virtual bool recvTimingResp(PacketPtr pkt) { return true; }
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
virtual void recvFunctional(PacketPtr pkt) { }
virtual void recvRangeChange() { }
virtual void recvReqRetry() { }
};
TLBPort *tlbPort;
virtual BaseMasterPort& getMasterPort(const std::string &if_name,
PortID idx);
AddrRangeList getAddrRanges() const;
Tick read(PacketPtr pkt);
Tick write(PacketPtr pkt);
// helper functions to retrieve/set GPU attributes
int getNumCUs();
void setFuncargsSize(int funcargs_size);
};
#endif // __GPU_DISPATCHER_HH__

View file

@@ -0,0 +1,203 @@
/*
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: John Kalamatianos, Sooraj Puthoor
*/
#include "gpu-compute/exec_stage.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/wavefront.hh"
ExecStage::ExecStage(const ComputeUnitParams *p) : numSIMDs(p->num_SIMDs),
numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes),
vectorAluInstAvail(nullptr), glbMemInstAvail(nullptr),
shrMemInstAvail(nullptr), lastTimeInstExecuted(false),
thisTimeInstExecuted(false), instrExecuted (false),
executionResourcesUsed(0)
{
numTransActiveIdle = 0;
idle_dur = 0;
}
void
ExecStage::init(ComputeUnit *cu)
{
computeUnit = cu;
_name = computeUnit->name() + ".ExecStage";
dispatchList = &computeUnit->dispatchList;
vectorAluInstAvail = &(computeUnit->vectorAluInstAvail);
    glbMemInstAvail = &(computeUnit->glbMemInstAvail);
    shrMemInstAvail = &(computeUnit->shrMemInstAvail);
idle_dur = 0;
}
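// Update per-cycle statistics for one execution resource, depending on
// whether it sat idle this cycle (IdleExec), issued an instruction
// (BusyExec), or the cycle is being finalized (PostExec).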
void
ExecStage::collectStatistics(enum STAT_STATUS stage, int unitId)
{
if (stage == IdleExec) {
// count cycles of no vector ALU instruction executed
// even if one was the oldest in a WV of that vector SIMD unit
if (computeUnit->isVecAlu(unitId) && vectorAluInstAvail->at(unitId)) {
numCyclesWithNoInstrTypeIssued[unitId]++;
}
// count cycles of no global memory (vector) instruction executed
// even if one was the oldest in a WV of that vector SIMD unit
if (computeUnit->isGlbMem(unitId) && *glbMemInstAvail > 0) {
numCyclesWithNoInstrTypeIssued[unitId]++;
(*glbMemInstAvail)--;
}
// count cycles of no shared memory (vector) instruction executed
// even if one was the oldest in a WV of that vector SIMD unit
if (computeUnit->isShrMem(unitId) && *shrMemInstAvail > 0) {
numCyclesWithNoInstrTypeIssued[unitId]++;
(*shrMemInstAvail)--;
}
} else if (stage == BusyExec) {
// count the number of cycles an instruction to a specific unit
// was issued
numCyclesWithInstrTypeIssued[unitId]++;
thisTimeInstExecuted = true;
instrExecuted = true;
++executionResourcesUsed;
} else if (stage == PostExec) {
// count the number of transitions from active to idle
if (lastTimeInstExecuted && !thisTimeInstExecuted) {
++numTransActiveIdle;
}
if (!lastTimeInstExecuted && thisTimeInstExecuted) {
idleDur.sample(idle_dur);
idle_dur = 0;
} else if (!thisTimeInstExecuted) {
idle_dur++;
}
lastTimeInstExecuted = thisTimeInstExecuted;
// track the number of cycles we either issued one vector instruction
// or issued no instructions at all
if (instrExecuted) {
numCyclesWithInstrIssued++;
} else {
numCyclesWithNoIssue++;
}
spc.sample(executionResourcesUsed);
}
}
void
ExecStage::initStatistics()
{
instrExecuted = false;
executionResourcesUsed = 0;
thisTimeInstExecuted = false;
}
void
ExecStage::exec()
{
initStatistics();
for (int unitId = 0; unitId < (numSIMDs + numMemUnits); ++unitId) {
// if dispatch list for this execution resource is empty,
// skip this execution resource this cycle
if (dispatchList->at(unitId).second == EMPTY) {
collectStatistics(IdleExec, unitId);
continue;
}
collectStatistics(BusyExec, unitId);
// execute an instruction for the WF
dispatchList->at(unitId).first->exec();
// clear the dispatch list entry
dispatchList->at(unitId).second = EMPTY;
dispatchList->at(unitId).first = (Wavefront*)nullptr;
}
collectStatistics(PostExec, 0);
}
void
ExecStage::regStats()
{
numTransActiveIdle
.name(name() + ".num_transitions_active_to_idle")
.desc("number of CU transitions from active to idle")
;
numCyclesWithNoIssue
.name(name() + ".num_cycles_with_no_issue")
.desc("number of cycles the CU issues nothing")
;
numCyclesWithInstrIssued
.name(name() + ".num_cycles_with_instr_issued")
.desc("number of cycles the CU issued at least one instruction")
;
spc
.init(0, numSIMDs + numMemUnits, 1)
.name(name() + ".spc")
.desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)")
;
idleDur
        .init(0, 75, 5)
.name(name() + ".idle_duration_in_cycles")
.desc("duration of idle periods in cycles")
;
numCyclesWithInstrTypeIssued
.init(numSIMDs + numMemUnits)
.name(name() + ".num_cycles_with_instrtype_issue")
.desc("Number of cycles at least one instruction of specific type "
"issued")
;
numCyclesWithNoInstrTypeIssued
.init(numSIMDs + numMemUnits)
.name(name() + ".num_cycles_with_instr_type_no_issue")
.desc("Number of cycles no instruction of specific type issued")
;
for (int i = 0; i < numSIMDs; ++i) {
numCyclesWithInstrTypeIssued.subname(i, csprintf("ALU%d",i));
numCyclesWithNoInstrTypeIssued.subname(i, csprintf("ALU%d",i));
}
numCyclesWithInstrTypeIssued.subname(numSIMDs, csprintf("GM"));
numCyclesWithNoInstrTypeIssued.subname(numSIMDs, csprintf("GM"));
numCyclesWithInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM"));
numCyclesWithNoInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM"));
}

View file

@@ -0,0 +1,129 @@
/*
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: John Kalamatianos, Sooraj Puthoor
*/
#ifndef __EXEC_STAGE_HH__
#define __EXEC_STAGE_HH__
#include <string>
#include <utility>
#include <vector>
#include "sim/stats.hh"
class ComputeUnit;
class Wavefront;
struct ComputeUnitParams;
enum STAT_STATUS
{
IdleExec,
BusyExec,
PostExec
};
enum DISPATCH_STATUS
{
EMPTY = 0,
FILLED
};
// Execution stage.
// Each execution resource executes the
// wave which is in its dispatch list.
// The schedule stage is responsible for
// adding a wave into each execution resource's
// dispatch list.
class ExecStage
{
public:
ExecStage(const ComputeUnitParams* params);
~ExecStage() { }
void init(ComputeUnit *cu);
void exec();
std::string name() { return _name; }
void regStats();
// number of idle cycles
Stats::Scalar numCyclesWithNoIssue;
// number of busy cycles
Stats::Scalar numCyclesWithInstrIssued;
// number of cycles (per execution unit) during which at least one
// instruction was issued to that unit
Stats::Vector numCyclesWithInstrTypeIssued;
    // number of idle cycles (per execution unit) during which no instruction
    // was issued to that unit, even though at least one wavefront had such
    // an instruction as the oldest in its instruction buffer
Stats::Vector numCyclesWithNoInstrTypeIssued;
// SIMDs active per cycle
Stats::Distribution spc;
private:
void collectStatistics(enum STAT_STATUS stage, int unitId);
void initStatistics();
ComputeUnit *computeUnit;
uint32_t numSIMDs;
// Number of memory execution resources;
// both global and local memory execution resources in CU
uint32_t numMemUnits;
    // List of waves which will be dispatched to
    // each execution resource. A FILLED status implies
    // the dispatch list entry is non-empty and the
    // execution unit has something to execute
    // this cycle. Currently, the dispatch list of
// an execution resource can hold only one wave because
// an execution resource can execute only one wave in a cycle.
// dispatchList is used to communicate between schedule
// and exec stage
std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList;
// flag per vector SIMD unit that is set when there is at least one
// WV that has a vector ALU instruction as the oldest in its
// Instruction Buffer
std::vector<bool> *vectorAluInstAvail;
int *glbMemInstAvail;
int *shrMemInstAvail;
bool lastTimeInstExecuted;
bool thisTimeInstExecuted;
bool instrExecuted;
Stats::Scalar numTransActiveIdle;
Stats::Distribution idleDur;
uint32_t executionResourcesUsed;
uint64_t idle_dur;
std::string _name;
};
#endif // __EXEC_STAGE_HH__

View file

@@ -0,0 +1,106 @@
/*
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Anthony Gutierrez, Sooraj Puthoor
*/
#include "gpu-compute/fetch_stage.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/wavefront.hh"
FetchStage::FetchStage(const ComputeUnitParams* p) : numSIMDs(p->num_SIMDs),
computeUnit(nullptr)
{
for (int j = 0; j < numSIMDs; ++j) {
FetchUnit newFetchUnit(p);
fetchUnit.push_back(newFetchUnit);
}
}
FetchStage::~FetchStage()
{
fetchUnit.clear();
}
void
FetchStage::init(ComputeUnit *cu)
{
computeUnit = cu;
_name = computeUnit->name() + ".FetchStage";
for (int j = 0; j < numSIMDs; ++j) {
fetchUnit[j].bindWaveList(&computeUnit->wfList[j]);
fetchUnit[j].init(computeUnit);
}
}
void
FetchStage::exec()
{
for (int j = 0; j < numSIMDs; ++j) {
fetchUnit[j].exec();
}
}
void
FetchStage::processFetchReturn(PacketPtr pkt)
{
ComputeUnit::SQCPort::SenderState *sender_state =
safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->senderState);
Wavefront *wavefront = sender_state->wavefront;
const unsigned num_instructions = pkt->req->getSize() /
sizeof(TheGpuISA::RawMachInst);
instFetchInstReturned.sample(num_instructions);
uint32_t simdId = wavefront->simdId;
fetchUnit[simdId].processFetchReturn(pkt);
}
void
FetchStage::fetch(PacketPtr pkt, Wavefront *wavefront)
{
fetchUnit[wavefront->simdId].fetch(pkt, wavefront);
}
void
FetchStage::regStats()
{
instFetchInstReturned
.init(1, 32, 1)
.name(name() + ".inst_fetch_instr_returned")
.desc("For each instruction fetch request recieved record how many "
"instructions you got from it")
;
}

View file

@@ -0,0 +1,78 @@
/*
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Anthony Gutierrez, Sooraj Puthoor
*/
#ifndef __FETCH_STAGE_HH__
#define __FETCH_STAGE_HH__
#include <string>
#include <vector>
#include "gpu-compute/fetch_unit.hh"
// Instruction fetch stage.
// All dispatched wavefronts for all SIMDs are analyzed for the
// need to fetch instructions. From the fetch-eligible waves,
// one wave is selected from each SIMD and a fetch is initiated
// for the selected waves.
class ComputeUnit;
class Wavefront;
class FetchStage
{
public:
FetchStage(const ComputeUnitParams* params);
~FetchStage();
void init(ComputeUnit *cu);
void exec();
void processFetchReturn(PacketPtr pkt);
void fetch(PacketPtr pkt, Wavefront *wave);
// Stats related variables and methods
std::string name() { return _name; }
void regStats();
Stats::Distribution instFetchInstReturned;
private:
uint32_t numSIMDs;
ComputeUnit *computeUnit;
// List of fetch units. A fetch unit is
// instantiated per SIMD
std::vector<FetchUnit> fetchUnit;
std::string _name;
};
#endif // __FETCH_STAGE_HH__

View file

@@ -0,0 +1,293 @@
/*
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Brad Beckmann, Sooraj Puthoor
*/
#include "gpu-compute/fetch_unit.hh"
#include "debug/GPUFetch.hh"
#include "debug/GPUPort.hh"
#include "debug/GPUTLB.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/ruby/system/RubySystem.hh"
uint32_t FetchUnit::globalFetchUnitID;
FetchUnit::FetchUnit(const ComputeUnitParams* params) :
timingSim(true),
computeUnit(nullptr),
fetchScheduler(params),
waveList(nullptr)
{
}
FetchUnit::~FetchUnit()
{
fetchQueue.clear();
fetchStatusQueue.clear();
}
void
FetchUnit::init(ComputeUnit *cu)
{
computeUnit = cu;
timingSim = computeUnit->shader->timingSim;
fetchQueue.clear();
fetchStatusQueue.resize(computeUnit->shader->n_wf);
for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
fetchStatusQueue[j] = std::make_pair(waveList->at(j), false);
}
fetchScheduler.bindList(&fetchQueue);
}
void
FetchUnit::exec()
{
// re-evaluate waves which are marked as not ready for fetch
for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
        // The following code assumes 64-bit operation and that all insts are
        // represented by 64-bit pointers to inst objects.
Wavefront *curWave = fetchStatusQueue[j].first;
assert (curWave);
        // The wavefront has to be active, the IB occupancy has to be
        // 4 or fewer instructions, and it cannot have any branches
        // (to prevent speculative instruction fetches)
if (!fetchStatusQueue[j].second) {
if (curWave->status == Wavefront::S_RUNNING &&
curWave->instructionBuffer.size() <= 4 &&
!curWave->instructionBufferHasBranch() &&
!curWave->pendingFetch) {
fetchQueue.push_back(curWave);
fetchStatusQueue[j].second = true;
}
}
}
    // Fetch only if there is some wave ready to be fetched.
    // An empty fetchQueue will cause the scheduler to panic.
if (fetchQueue.size()) {
Wavefront *waveToBeFetched = fetchScheduler.chooseWave();
waveToBeFetched->pendingFetch = true;
fetchStatusQueue[waveToBeFetched->wfSlotId].second = false;
initiateFetch(waveToBeFetched);
}
}
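// Compute the next fetch address for the wavefront and issue the
// corresponding translation request to the instruction TLB, either in
// timing mode or functionally.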
void
FetchUnit::initiateFetch(Wavefront *wavefront)
{
// calculate the virtual address to fetch from the SQC
Addr vaddr = wavefront->pc() + wavefront->instructionBuffer.size();
vaddr = wavefront->base_ptr + vaddr * sizeof(GPUStaticInst*);
DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n",
computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr);
    // Since this is an instruction prefetch, if the access is split across
    // a cache-line boundary just finish out the current line.
unsigned block_size = RubySystem::getBlockSizeBytes();
// check for split accesses
Addr split_addr = roundDown(vaddr + block_size - 1, block_size);
unsigned size = block_size;
if (split_addr > vaddr) {
// misaligned access, just grab the rest of the line
size = split_addr - vaddr;
}
// set up virtual request
Request *req = new Request(0, vaddr, size, Request::INST_FETCH,
computeUnit->masterId(), 0, 0, 0);
PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
    // fetchBlock is only a placeholder for now, because the translation
    // requests do not actually return any data
uint64_t fetchBlock;
pkt->dataStatic(&fetchBlock);
if (timingSim) {
// SenderState needed on Return
pkt->senderState = new ComputeUnit::ITLBPort::SenderState(wavefront);
// Sender State needed by TLB hierarchy
pkt->senderState =
new TheISA::GpuTLB::TranslationState(BaseTLB::Execute,
computeUnit->shader->gpuTc,
false, pkt->senderState);
if (computeUnit->sqcTLBPort->isStalled()) {
assert(computeUnit->sqcTLBPort->retries.size() > 0);
DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
vaddr);
computeUnit->sqcTLBPort->retries.push_back(pkt);
} else if (!computeUnit->sqcTLBPort->sendTimingReq(pkt)) {
            // Stall the instruction TLB port;
            // no more packets are issued until Ruby indicates that
            // resources have been freed by a recvReqRetry() callback
            // on this port.
computeUnit->sqcTLBPort->stallPort();
DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
vaddr);
computeUnit->sqcTLBPort->retries.push_back(pkt);
} else {
DPRINTF(GPUTLB, "sent FETCH translation request for %#x\n", vaddr);
}
} else {
pkt->senderState =
new TheISA::GpuTLB::TranslationState(BaseTLB::Execute,
computeUnit->shader->gpuTc);
computeUnit->sqcTLBPort->sendFunctional(pkt);
TheISA::GpuTLB::TranslationState *sender_state =
safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
delete sender_state->tlbEntry;
delete sender_state;
// fetch the instructions from the SQC when we operate in
// functional mode only
fetch(pkt, wavefront);
}
}
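// Translation is complete: rebuild the packet from the translated
// request, attach a buffer for the raw instructions, and send the fetch
// to the SQC (timing or functional, as appropriate).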
void
FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront)
{
assert(pkt->req->hasPaddr());
assert(pkt->req->hasSize());
DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch Access: %#x\n",
computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
pkt->req->getPaddr());
    // This is necessary because the GPU TLB receives packets instead of
    // requests. When the translation is complete, all relevant fields in
    // the request will be populated, but not in the packet. Here we create
    // the new packet so we can set the size, addr, and proper flags.
PacketPtr oldPkt = pkt;
pkt = new Packet(oldPkt->req, oldPkt->cmd);
delete oldPkt;
TheGpuISA::RawMachInst *data =
new TheGpuISA::RawMachInst[pkt->req->getSize() /
sizeof(TheGpuISA::RawMachInst)];
pkt->dataDynamic<TheGpuISA::RawMachInst>(data);
// New SenderState for the memory access
pkt->senderState = new ComputeUnit::SQCPort::SenderState(wavefront);
if (timingSim) {
// translation is done. Send the appropriate timing memory request.
if (!computeUnit->sqcPort->sendTimingReq(pkt)) {
computeUnit->sqcPort->retries.push_back(std::make_pair(pkt,
wavefront));
DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x failed!\n",
computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
pkt->req->getPaddr());
} else {
DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x sent!\n",
computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
pkt->req->getPaddr());
}
} else {
computeUnit->sqcPort->sendFunctional(pkt);
processFetchReturn(pkt);
}
}
void
FetchUnit::processFetchReturn(PacketPtr pkt)
{
ComputeUnit::SQCPort::SenderState *sender_state =
safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->senderState);
Wavefront *wavefront = sender_state->wavefront;
DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned "
"%d bytes, %d instructions!\n", computeUnit->cu_id,
wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr(),
pkt->req->getSize(), pkt->req->getSize() /
sizeof(TheGpuISA::RawMachInst));
if (wavefront->dropFetch) {
assert(wavefront->instructionBuffer.empty());
wavefront->dropFetch = false;
} else {
TheGpuISA::RawMachInst *inst_index_ptr =
(TheGpuISA::RawMachInst*)pkt->getPtr<uint8_t>();
assert(wavefront->instructionBuffer.size() <= 4);
for (int i = 0; i < pkt->req->getSize() /
sizeof(TheGpuISA::RawMachInst); ++i) {
GPUStaticInst *inst_ptr = decoder.decode(inst_index_ptr[i]);
assert(inst_ptr);
DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: added %s\n",
computeUnit->cu_id, wavefront->simdId,
wavefront->wfSlotId, inst_ptr->disassemble());
GPUDynInstPtr gpuDynInst =
std::make_shared<GPUDynInst>(computeUnit, wavefront, inst_ptr,
computeUnit->getAndIncSeqNum());
wavefront->instructionBuffer.push_back(gpuDynInst);
}
}
wavefront->pendingFetch = false;
delete pkt->senderState;
delete pkt->req;
delete pkt;
}
void
FetchUnit::bindWaveList(std::vector<Wavefront*> *wave_list)
{
waveList = wave_list;
}

View file

@@ -0,0 +1,89 @@
/*
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Brad Beckmann, Sooraj Puthoor
*/
#ifndef __FETCH_UNIT_HH__
#define __FETCH_UNIT_HH__
#include <string>
#include <utility>
#include <vector>
#include "arch/gpu_decoder.hh"
#include "base/statistics.hh"
#include "config/the_gpu_isa.hh"
#include "gpu-compute/scheduler.hh"
#include "mem/packet.hh"
class ComputeUnit;
class Wavefront;
class FetchUnit
{
public:
FetchUnit(const ComputeUnitParams* params);
~FetchUnit();
void init(ComputeUnit *cu);
void exec();
void bindWaveList(std::vector<Wavefront*> *list);
void initiateFetch(Wavefront *wavefront);
void fetch(PacketPtr pkt, Wavefront *wavefront);
void processFetchReturn(PacketPtr pkt);
static uint32_t globalFetchUnitID;
private:
bool timingSim;
ComputeUnit *computeUnit;
TheGpuISA::Decoder decoder;
// Fetch scheduler; Selects one wave from
// the fetch queue for instruction fetching.
// The selection is made according to
// a scheduling policy
Scheduler fetchScheduler;
// Stores the list of waves that are
// ready to be fetched this cycle
std::vector<Wavefront*> fetchQueue;
// Stores the fetch status of all waves dispatched to this SIMD.
    // TRUE implies the wave is ready to fetch and has already been
    // moved to the fetchQueue
std::vector<std::pair<Wavefront*, bool>> fetchStatusQueue;
// Pointer to list of waves dispatched on to this SIMD unit
std::vector<Wavefront*> *waveList;
};
#endif // __FETCH_UNIT_HH__

View file

@@ -0,0 +1,242 @@
/*
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: John Kalamatianos, Sooraj Puthoor
*/
#include "gpu-compute/global_memory_pipeline.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUReg.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) :
computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size),
inflightStores(0), inflightLoads(0)
{
}
void
GlobalMemPipeline::init(ComputeUnit *cu)
{
computeUnit = cu;
globalMemSize = computeUnit->shader->globalMemSize;
_name = computeUnit->name() + ".GlobalMemPipeline";
}
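// One pipeline cycle: complete the oldest returned load or store whose
// VRF, bus, and wait-slot resources are ready, then try to issue the next
// queued global memory request.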
void
GlobalMemPipeline::exec()
{
// apply any returned global memory operations
GPUDynInstPtr m = !gmReturnedLoads.empty() ? gmReturnedLoads.front() :
!gmReturnedStores.empty() ? gmReturnedStores.front() : nullptr;
bool accessVrf = true;
// check the VRF to see if the operands of a load (or load component
// of an atomic) are accessible
if ((m) && (m->m_op==Enums::MO_LD || MO_A(m->m_op))) {
Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
accessVrf =
w->computeUnit->vrf[m->simdId]->
vrfOperandAccessReady(m->seqNum(), w, m,
VrfAccessType::WRITE);
}
if ((!gmReturnedStores.empty() || !gmReturnedLoads.empty()) &&
m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
accessVrf && m->statusBitVector == VectorMask(0) &&
(computeUnit->shader->coissue_return ||
computeUnit->wfWait.at(m->pipeId).rdy())) {
if (m->v_type == VT_32 && m->m_type == Enums::M_U8)
doGmReturn<uint32_t, uint8_t>(m);
else if (m->v_type == VT_32 && m->m_type == Enums::M_U16)
doGmReturn<uint32_t, uint16_t>(m);
else if (m->v_type == VT_32 && m->m_type == Enums::M_U32)
doGmReturn<uint32_t, uint32_t>(m);
else if (m->v_type == VT_32 && m->m_type == Enums::M_S8)
doGmReturn<int32_t, int8_t>(m);
else if (m->v_type == VT_32 && m->m_type == Enums::M_S16)
doGmReturn<int32_t, int16_t>(m);
else if (m->v_type == VT_32 && m->m_type == Enums::M_S32)
doGmReturn<int32_t, int32_t>(m);
else if (m->v_type == VT_32 && m->m_type == Enums::M_F16)
doGmReturn<float, Float16>(m);
else if (m->v_type == VT_32 && m->m_type == Enums::M_F32)
doGmReturn<float, float>(m);
else if (m->v_type == VT_64 && m->m_type == Enums::M_U8)
doGmReturn<uint64_t, uint8_t>(m);
else if (m->v_type == VT_64 && m->m_type == Enums::M_U16)
doGmReturn<uint64_t, uint16_t>(m);
else if (m->v_type == VT_64 && m->m_type == Enums::M_U32)
doGmReturn<uint64_t, uint32_t>(m);
else if (m->v_type == VT_64 && m->m_type == Enums::M_U64)
doGmReturn<uint64_t, uint64_t>(m);
else if (m->v_type == VT_64 && m->m_type == Enums::M_S8)
doGmReturn<int64_t, int8_t>(m);
else if (m->v_type == VT_64 && m->m_type == Enums::M_S16)
doGmReturn<int64_t, int16_t>(m);
else if (m->v_type == VT_64 && m->m_type == Enums::M_S32)
doGmReturn<int64_t, int32_t>(m);
else if (m->v_type == VT_64 && m->m_type == Enums::M_S64)
doGmReturn<int64_t, int64_t>(m);
else if (m->v_type == VT_64 && m->m_type == Enums::M_F16)
doGmReturn<double, Float16>(m);
else if (m->v_type == VT_64 && m->m_type == Enums::M_F32)
doGmReturn<double, float>(m);
else if (m->v_type == VT_64 && m->m_type == Enums::M_F64)
doGmReturn<double, double>(m);
}
    // If the pipeline has an issued global memory instruction pending,
    // initiate its access and send its memory packets to the DTLB
if (!gmIssuedRequests.empty()) {
GPUDynInstPtr mp = gmIssuedRequests.front();
if (mp->m_op == Enums::MO_LD ||
(mp->m_op >= Enums::MO_AAND && mp->m_op <= Enums::MO_AMIN) ||
(mp->m_op >= Enums::MO_ANRAND && mp->m_op <= Enums::MO_ANRMIN)) {
if (inflightLoads >= gmQueueSize) {
return;
} else {
++inflightLoads;
}
} else {
if (inflightStores >= gmQueueSize) {
return;
} else {
++inflightStores;
}
}
mp->initiateAcc(mp);
gmIssuedRequests.pop();
DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = %s\n",
computeUnit->cu_id, mp->simdId, mp->wfSlotId,
Enums::MemOpTypeStrings[mp->m_op]);
}
}
template<typename c0, typename c1>
void
GlobalMemPipeline::doGmReturn(GPUDynInstPtr m)
{
Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
// Return data to registers
if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) {
gmReturnedLoads.pop();
assert(inflightLoads > 0);
--inflightLoads;
if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) {
std::vector<uint32_t> regVec;
// iterate over number of destination register operands since
// this is a load or atomic operation
for (int k = 0; k < m->n_reg; ++k) {
assert((sizeof(c1) * m->n_reg) <= MAX_WIDTH_FOR_MEM_INST);
int dst = m->dst_reg + k;
if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
dst = m->dst_reg_vec[k];
// virtual->physical VGPR mapping
int physVgpr = w->remap(dst, sizeof(c0), 1);
// save the physical VGPR index
regVec.push_back(physVgpr);
c1 *p1 = &((c1*)m->d_data)[k * VSZ];
for (int i = 0; i < VSZ; ++i) {
if (m->exec_mask[i]) {
DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
"$%s%d <- %d global ld done (src = wavefront "
"ld inst)\n", w->computeUnit->cu_id, w->simdId,
w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d",
dst, *p1);
// write the value into the physical VGPR. This is a
// purely functional operation. No timing is modeled.
w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
*p1, i);
}
++p1;
}
}
// Schedule the write operation of the load data on the VRF.
// This simply models the timing aspect of the VRF write operation.
// It does not modify the physical VGPR.
loadVrfBankConflictCycles +=
w->computeUnit->vrf[w->simdId]->exec(m->seqNum(),
w, regVec, sizeof(c0),
m->time);
}
} else {
gmReturnedStores.pop();
assert(inflightStores > 0);
--inflightStores;
}
// Decrement outstanding register count
computeUnit->shader->ScheduleAdd(&w->outstanding_reqs, m->time, -1);
if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op) ||
MO_H(m->m_op)) {
computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_wr_gm, m->time,
-1);
}
if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) {
computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_rd_gm, m->time,
-1);
}
// Mark write bus busy for appropriate amount of time
computeUnit->glbMemToVrfBus.set(m->time);
if (!computeUnit->shader->coissue_return)
w->computeUnit->wfWait.at(m->pipeId).set(m->time);
}
void
GlobalMemPipeline::regStats()
{
loadVrfBankConflictCycles
.name(name() + ".load_vrf_bank_conflict_cycles")
.desc("total number of cycles GM data are delayed before updating "
"the VRF")
;
}
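
For readers tracing the issue path above, the back-pressure rule is simply that a request leaves gmIssuedRequests only while the matching inflight counter is below gmQueueSize, and the counter is released when the response is retired. The standalone sketch below restates that pattern in isolation; it is not code from this commit, and FakeRequest/IssueStage are illustrative names only.

// Standalone sketch (not code from this commit): the issue-side back-pressure
// pattern used in GlobalMemPipeline::exec().
#include <cassert>
#include <queue>

struct FakeRequest { bool isLoad; };

class IssueStage
{
  public:
    explicit IssueStage(int queue_size) : queueSize(queue_size) { }

    // Returns true if a request was taken off the issue FIFO this cycle.
    bool
    tryIssue()
    {
        if (issued.empty())
            return false;

        const FakeRequest &req = issued.front();
        int &inflight = req.isLoad ? inflightLoads : inflightStores;

        if (inflight >= queueSize)
            return false;   // back-pressure: the response FIFO could overflow

        ++inflight;
        issued.pop();
        return true;
    }

    // Called once the corresponding response has been consumed.
    void
    retire(bool is_load)
    {
        int &inflight = is_load ? inflightLoads : inflightStores;
        assert(inflight > 0);
        --inflight;
    }

    std::queue<FakeRequest> issued;

  private:
    const int queueSize;
    int inflightLoads = 0;
    int inflightStores = 0;
};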

View file

@ -0,0 +1,123 @@
/*
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: John Kalamatianos, Sooraj Puthoor
*/
#ifndef __GLOBAL_MEMORY_PIPELINE_HH__
#define __GLOBAL_MEMORY_PIPELINE_HH__
#include <queue>
#include <string>
#include "gpu-compute/misc.hh"
#include "params/ComputeUnit.hh"
#include "sim/stats.hh"
/*
* @file global_memory_pipeline.hh
*
 * The global memory pipeline issues newly created global memory packets
 * from the pipeline to the DTLB. The exec() method of this pipeline issues
 * a packet to the DTLB only if there is space available in the return FIFO.
* This stage also retires previously issued loads and stores that have
* returned from the memory sub-system.
*/
class ComputeUnit;
class GlobalMemPipeline
{
public:
GlobalMemPipeline(const ComputeUnitParams *params);
void init(ComputeUnit *cu);
void exec();
template<typename c0, typename c1> void doGmReturn(GPUDynInstPtr m);
std::queue<GPUDynInstPtr> &getGMReqFIFO() { return gmIssuedRequests; }
std::queue<GPUDynInstPtr> &getGMStRespFIFO() { return gmReturnedStores; }
std::queue<GPUDynInstPtr> &getGMLdRespFIFO() { return gmReturnedLoads; }
bool
isGMLdRespFIFOWrRdy() const
{
return gmReturnedLoads.size() < gmQueueSize;
}
bool
isGMStRespFIFOWrRdy() const
{
return gmReturnedStores.size() < gmQueueSize;
}
bool
isGMReqFIFOWrRdy(uint32_t pendReqs=0) const
{
return (gmIssuedRequests.size() + pendReqs) < gmQueueSize;
}
const std::string &name() const { return _name; }
void regStats();
private:
ComputeUnit *computeUnit;
std::string _name;
int gmQueueSize;
// number of cycles of delaying the update of a VGPR that is the
// target of a load instruction (or the load component of an atomic)
// The delay is due to VRF bank conflicts
Stats::Scalar loadVrfBankConflictCycles;
// Counters to track the inflight loads and stores
// so that we can provide the proper backpressure
// on the number of inflight memory operations.
int inflightStores;
int inflightLoads;
// The size of global memory.
int globalMemSize;
// Global Memory Request FIFO: all global memory requests
// are issued to this FIFO from the memory pipelines
std::queue<GPUDynInstPtr> gmIssuedRequests;
    // Global Store Response FIFO: all responses of global memory
// stores are sent to this FIFO from TCP
std::queue<GPUDynInstPtr> gmReturnedStores;
// Global Load Response FIFO: all responses of global memory
// loads are sent to this FIFO from TCP
std::queue<GPUDynInstPtr> gmReturnedLoads;
};
#endif // __GLOBAL_MEMORY_PIPELINE_HH__
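
As a usage note, a producer stage is expected to test the write-ready accessors before pushing into the request FIFO exposed above. The fragment below is a hedged sketch of that handshake, not code from this commit; issueToGlobalMemory and its pending_reqs argument are hypothetical.

// Sketch only: a hypothetical producer guarding its push with the accessors
// declared above. If the FIFO cannot absorb the entry, the caller simply
// stalls and retries on a later cycle.
void
issueToGlobalMemory(GlobalMemPipeline &gm_pipe, GPUDynInstPtr mem_inst,
                    uint32_t pending_reqs)
{
    if (gm_pipe.isGMReqFIFOWrRdy(pending_reqs)) {
        gm_pipe.getGMReqFIFO().push(mem_inst);
    }
}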

View file

@ -0,0 +1,198 @@
/*
* Copyright (c) 2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Anthony Gutierrez
*/
#include "gpu-compute/gpu_dyn_inst.hh"
#include "debug/GPUMem.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf,
GPUStaticInst *_staticInst, uint64_t instSeqNum)
: GPUExecContext(_cu, _wf), m_op(Enums::MO_UNDEF),
memoryOrder(Enums::MEMORY_ORDER_NONE), useContinuation(false),
statusBitVector(0), staticInst(_staticInst), _seqNum(instSeqNum)
{
tlbHitLevel.assign(VSZ, -1);
}
void
GPUDynInst::execute()
{
GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(cu, wf, staticInst,
_seqNum);
staticInst->execute(gpuDynInst);
}
int
GPUDynInst::numSrcRegOperands()
{
return staticInst->numSrcRegOperands();
}
int
GPUDynInst::numDstRegOperands()
{
return staticInst->numDstRegOperands();
}
int
GPUDynInst::getNumOperands()
{
return staticInst->getNumOperands();
}
bool
GPUDynInst::isVectorRegister(int operandIdx)
{
return staticInst->isVectorRegister(operandIdx);
}
bool
GPUDynInst::isScalarRegister(int operandIdx)
{
    return staticInst->isScalarRegister(operandIdx);
}
int
GPUDynInst::getRegisterIndex(int operandIdx)
{
return staticInst->getRegisterIndex(operandIdx);
}
int
GPUDynInst::getOperandSize(int operandIdx)
{
return staticInst->getOperandSize(operandIdx);
}
bool
GPUDynInst::isDstOperand(int operandIdx)
{
return staticInst->isDstOperand(operandIdx);
}
bool
GPUDynInst::isSrcOperand(int operandIdx)
{
return staticInst->isSrcOperand(operandIdx);
}
bool
GPUDynInst::isArgLoad()
{
return staticInst->isArgLoad();
}
const std::string&
GPUDynInst::disassemble() const
{
return staticInst->disassemble();
}
uint64_t
GPUDynInst::seqNum() const
{
return _seqNum;
}
Enums::OpType
GPUDynInst::opType()
{
return staticInst->o_type;
}
Enums::StorageClassType
GPUDynInst::executedAs()
{
return staticInst->executed_as;
}
// Process a memory instruction and (if necessary) submit timing request
void
GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst)
{
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector=%#x\n",
cu->cu_id, simdId, wfSlotId, exec_mask);
staticInst->initiateAcc(gpuDynInst);
time = 0;
}
bool
GPUDynInst::scalarOp() const
{
return staticInst->scalarOp();
}
void
GPUDynInst::updateStats()
{
if (staticInst->isLocalMem()) {
// access to LDS (shared) memory
cu->dynamicLMemInstrCnt++;
} else {
// access to global memory
// update PageDivergence histogram
int number_pages_touched = cu->pagesTouched.size();
assert(number_pages_touched);
cu->pageDivergenceDist.sample(number_pages_touched);
std::pair<ComputeUnit::pageDataStruct::iterator, bool> ret;
for (auto it : cu->pagesTouched) {
// see if this page has been touched before. if not, this also
// inserts the page into the table.
ret = cu->pageAccesses
.insert(ComputeUnit::pageDataStruct::value_type(it.first,
std::make_pair(1, it.second)));
// if yes, then update the stats
if (!ret.second) {
ret.first->second.first++;
ret.first->second.second += it.second;
}
}
cu->pagesTouched.clear();
// total number of memory instructions (dynamic)
// Atomics are counted as a single memory instruction.
// this is # memory instructions per wavefronts, not per workitem
cu->dynamicGMemInstrCnt++;
}
}
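
The page-access bookkeeping in updateStats() relies on the insert-or-update idiom of std::unordered_map: insert() either creates a fresh (accesses, lanes) pair for a page or hands back the existing entry so its counters can be bumped. A minimal standalone restatement of that idiom follows; the type aliases and the recordPageTouch helper are illustrative only.

// Standalone sketch (illustrative names only) of the insert-or-update idiom
// used in GPUDynInst::updateStats().
#include <cstdint>
#include <unordered_map>
#include <utility>

using Addr = uint64_t;
// first = number of accesses to the page, second = accumulated lane count
using PageData = std::unordered_map<Addr, std::pair<int, int>>;

void
recordPageTouch(PageData &page_accesses, Addr page, int lanes_touching)
{
    auto ret = page_accesses.insert({page, {1, lanes_touching}});
    if (!ret.second) {
        // the page was already present: update its counters instead
        ret.first->second.first++;
        ret.first->second.second += lanes_touching;
    }
}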

View file

@ -0,0 +1,464 @@
/*
* Copyright (c) 2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Anthony Gutierrez
*/
#ifndef __GPU_DYN_INST_HH__
#define __GPU_DYN_INST_HH__
#include <cstdint>
#include <string>
#include "enums/GenericMemoryOrder.hh"
#include "enums/GenericMemoryScope.hh"
#include "enums/MemOpType.hh"
#include "enums/MemType.hh"
#include "enums/OpType.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"
class GPUStaticInst;
template<typename T>
class AtomicOpAnd : public TypedAtomicOpFunctor<T>
{
public:
T a;
AtomicOpAnd(T _a) : a(_a) { }
void execute(T *b) { *b &= a; }
};
template<typename T>
class AtomicOpOr : public TypedAtomicOpFunctor<T>
{
public:
T a;
AtomicOpOr(T _a) : a(_a) { }
void execute(T *b) { *b |= a; }
};
template<typename T>
class AtomicOpXor : public TypedAtomicOpFunctor<T>
{
public:
T a;
AtomicOpXor(T _a) : a(_a) {}
void execute(T *b) { *b ^= a; }
};
template<typename T>
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
{
public:
T c;
T s;
ComputeUnit *computeUnit;
AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
: c(_c), s(_s), computeUnit(compute_unit) { }
void
execute(T *b)
{
computeUnit->numCASOps++;
if (*b == c) {
*b = s;
} else {
computeUnit->numFailedCASOps++;
}
if (computeUnit->xact_cas_mode) {
computeUnit->xactCasLoadMap.clear();
}
}
};
template<typename T>
class AtomicOpExch : public TypedAtomicOpFunctor<T>
{
public:
T a;
AtomicOpExch(T _a) : a(_a) { }
void execute(T *b) { *b = a; }
};
template<typename T>
class AtomicOpAdd : public TypedAtomicOpFunctor<T>
{
public:
T a;
AtomicOpAdd(T _a) : a(_a) { }
void execute(T *b) { *b += a; }
};
template<typename T>
class AtomicOpSub : public TypedAtomicOpFunctor<T>
{
public:
T a;
AtomicOpSub(T _a) : a(_a) { }
void execute(T *b) { *b -= a; }
};
template<typename T>
class AtomicOpInc : public TypedAtomicOpFunctor<T>
{
public:
AtomicOpInc() { }
void execute(T *b) { *b += 1; }
};
template<typename T>
class AtomicOpDec : public TypedAtomicOpFunctor<T>
{
public:
AtomicOpDec() {}
void execute(T *b) { *b -= 1; }
};
template<typename T>
class AtomicOpMax : public TypedAtomicOpFunctor<T>
{
public:
T a;
AtomicOpMax(T _a) : a(_a) { }
void
execute(T *b)
{
if (a > *b)
*b = a;
}
};
template<typename T>
class AtomicOpMin : public TypedAtomicOpFunctor<T>
{
public:
T a;
AtomicOpMin(T _a) : a(_a) {}
void
execute(T *b)
{
if (a < *b)
*b = a;
}
};
#define MO_A(a) ((a)>=Enums::MO_AAND && (a)<=Enums::MO_AMIN)
#define MO_ANR(a) ((a)>=Enums::MO_ANRAND && (a)<=Enums::MO_ANRMIN)
#define MO_H(a) ((a)>=Enums::MO_HAND && (a)<=Enums::MO_HMIN)
typedef enum
{
VT_32,
VT_64,
} vgpr_type;
typedef enum
{
SEG_PRIVATE,
SEG_SPILL,
SEG_GLOBAL,
SEG_SHARED,
SEG_READONLY,
SEG_FLAT
} seg_type;
class GPUDynInst : public GPUExecContext
{
public:
GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst,
uint64_t instSeqNum);
void execute();
int numSrcRegOperands();
int numDstRegOperands();
int getNumOperands();
bool isVectorRegister(int operandIdx);
bool isScalarRegister(int operandIdx);
int getRegisterIndex(int operandIdx);
int getOperandSize(int operandIdx);
bool isDstOperand(int operandIdx);
bool isSrcOperand(int operandIdx);
bool isArgLoad();
const std::string &disassemble() const;
uint64_t seqNum() const;
Enums::OpType opType();
Enums::StorageClassType executedAs();
// The address of the memory operation
Addr addr[VSZ];
Addr pAddr;
// The data to get written
uint8_t d_data[VSZ * 16];
// Additional data (for atomics)
uint8_t a_data[VSZ * 8];
// Additional data (for atomics)
uint8_t x_data[VSZ * 8];
// The execution mask
VectorMask exec_mask;
// The memory type (M_U32, M_S32, ...)
Enums::MemType m_type;
// The memory operation (MO_LD, MO_ST, ...)
Enums::MemOpType m_op;
Enums::GenericMemoryOrder memoryOrder;
// Scope of the request
Enums::GenericMemoryScope scope;
// The memory segment (SEG_SHARED, SEG_GLOBAL, ...)
seg_type s_type;
// The equivalency class
int equiv;
// The return VGPR type (VT_32 or VT_64)
vgpr_type v_type;
// Number of VGPR's accessed (1, 2, or 4)
int n_reg;
// The return VGPR index
int dst_reg;
    // There can be at most 4 dest regs
int dst_reg_vec[4];
// SIMD where the WF of the memory instruction has been mapped to
int simdId;
// unique id of the WF where the memory instruction belongs to
int wfDynId;
// The kernel id of the requesting wf
int kern_id;
// The CU id of the requesting wf
int cu_id;
// HW slot id where the WF is mapped to inside a SIMD unit
int wfSlotId;
// execution pipeline id where the memory instruction has been scheduled
int pipeId;
// The execution time of this operation
Tick time;
// The latency of this operation
WaitClass latency;
// A list of bank conflicts for the 4 cycles.
uint32_t bc[4];
// A pointer to ROM
uint8_t *rom;
// The size of the READONLY segment
int sz_rom;
// Initiate the specified memory operation, by creating a
// memory request and sending it off to the memory system.
void initiateAcc(GPUDynInstPtr gpuDynInst);
void updateStats();
GPUStaticInst* staticInstruction() { return staticInst; }
// Is the instruction a scalar or vector op?
bool scalarOp() const;
/*
* Loads/stores/atomics may have acquire/release semantics associated
     * with them. Some protocols want to see the acquire/release as separate
* requests from the load/store/atomic. We implement that separation
* using continuations (i.e., a function pointer with an object associated
* with it). When, for example, the front-end generates a store with
* release semantics, we will first issue a normal store and set the
     * continuation in the GPUDynInst to a function that generates a
* release request. That continuation will be called when the normal
* store completes (in ComputeUnit::DataPort::recvTimingResponse). The
* continuation will be called in the context of the same GPUDynInst
* that generated the initial store.
*/
std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation;
// when true, call execContinuation when response arrives
bool useContinuation;
template<typename c0> AtomicOpFunctor*
makeAtomicOpFunctor(c0 *reg0, c0 *reg1, Enums::MemOpType op)
{
using namespace Enums;
switch(op) {
case MO_AAND:
case MO_ANRAND:
return new AtomicOpAnd<c0>(*reg0);
case MO_AOR:
case MO_ANROR:
return new AtomicOpOr<c0>(*reg0);
case MO_AXOR:
case MO_ANRXOR:
return new AtomicOpXor<c0>(*reg0);
case MO_ACAS:
case MO_ANRCAS:
return new AtomicOpCAS<c0>(*reg0, *reg1, cu);
case MO_AEXCH:
case MO_ANREXCH:
return new AtomicOpExch<c0>(*reg0);
case MO_AADD:
case MO_ANRADD:
return new AtomicOpAdd<c0>(*reg0);
case MO_ASUB:
case MO_ANRSUB:
return new AtomicOpSub<c0>(*reg0);
case MO_AINC:
case MO_ANRINC:
return new AtomicOpInc<c0>();
case MO_ADEC:
case MO_ANRDEC:
return new AtomicOpDec<c0>();
case MO_AMAX:
case MO_ANRMAX:
return new AtomicOpMax<c0>(*reg0);
case MO_AMIN:
case MO_ANRMIN:
return new AtomicOpMin<c0>(*reg0);
default:
panic("Unrecognized atomic operation");
}
}
void
setRequestFlags(Request *req, bool setMemOrder=true)
{
// currently these are the easy scopes to deduce
switch (s_type) {
case SEG_PRIVATE:
req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT);
break;
case SEG_SPILL:
req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT);
break;
case SEG_GLOBAL:
req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
break;
case SEG_READONLY:
req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT);
break;
case SEG_SHARED:
req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT);
break;
case SEG_FLAT:
// TODO: translate to correct scope
assert(false);
default:
panic("Bad segment type");
break;
}
switch (scope) {
case Enums::MEMORY_SCOPE_NONE:
case Enums::MEMORY_SCOPE_WORKITEM:
break;
case Enums::MEMORY_SCOPE_WAVEFRONT:
req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
Request::WAVEFRONT_SCOPE);
break;
case Enums::MEMORY_SCOPE_WORKGROUP:
req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
Request::WORKGROUP_SCOPE);
break;
case Enums::MEMORY_SCOPE_DEVICE:
req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
Request::DEVICE_SCOPE);
break;
case Enums::MEMORY_SCOPE_SYSTEM:
req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
Request::SYSTEM_SCOPE);
break;
default:
panic("Bad scope type");
break;
}
if (setMemOrder) {
// set acquire and release flags
switch (memoryOrder){
case Enums::MEMORY_ORDER_SC_ACQUIRE:
req->setFlags(Request::ACQUIRE);
break;
case Enums::MEMORY_ORDER_SC_RELEASE:
req->setFlags(Request::RELEASE);
break;
case Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE:
req->setFlags(Request::ACQUIRE | Request::RELEASE);
break;
default:
break;
}
}
// set atomic type
        // currently, the instruction generator only produces atomic return
// but a magic instruction can produce atomic no return
if (m_op == Enums::MO_AADD || m_op == Enums::MO_ASUB ||
m_op == Enums::MO_AAND || m_op == Enums::MO_AOR ||
m_op == Enums::MO_AXOR || m_op == Enums::MO_AMAX ||
m_op == Enums::MO_AMIN || m_op == Enums::MO_AINC ||
m_op == Enums::MO_ADEC || m_op == Enums::MO_AEXCH ||
m_op == Enums::MO_ACAS) {
req->setFlags(Request::ATOMIC_RETURN_OP);
} else if (m_op == Enums::MO_ANRADD || m_op == Enums::MO_ANRSUB ||
m_op == Enums::MO_ANRAND || m_op == Enums::MO_ANROR ||
m_op == Enums::MO_ANRXOR || m_op == Enums::MO_ANRMAX ||
m_op == Enums::MO_ANRMIN || m_op == Enums::MO_ANRINC ||
m_op == Enums::MO_ANRDEC || m_op == Enums::MO_ANREXCH ||
m_op == Enums::MO_ANRCAS) {
req->setFlags(Request::ATOMIC_NO_RETURN_OP);
}
}
// Map returned packets and the addresses they satisfy with which lane they
// were requested from
typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
StatusVector memStatusVector;
// Track the status of memory requests per lane, a bit per lane
VectorMask statusBitVector;
// for ld_v# or st_v#
std::vector<int> statusVector;
std::vector<int> tlbHitLevel;
private:
GPUStaticInst *staticInst;
uint64_t _seqNum;
};
#endif // __GPU_DYN_INST_HH__
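
To make the continuation mechanism documented in the class above concrete, the sketch below shows one plausible way a front-end could arm execContinuation on a store with release semantics. armReleaseContinuation is a hypothetical helper and the lambda body is deliberately left as a comment; the real release request is built by the ISA instruction implementations and triggered from ComputeUnit::DataPort::recvTimingResponse.

// Sketch only: arming the continuation on a store with release semantics.
void
armReleaseContinuation(GPUDynInstPtr store_inst)
{
    store_inst->useContinuation = true;
    store_inst->execContinuation =
        [](GPUStaticInst *, GPUDynInstPtr)
        {
            // a real continuation would build and send the separate release
            // request here, in the context of the GPUDynInst that issued the
            // original store
        };
}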

View file

@ -0,0 +1,53 @@
/*
* Copyright (c) 2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Anthony Gutierrez
*/
#include "gpu-compute/gpu_exec_context.hh"
GPUExecContext::GPUExecContext(ComputeUnit *_cu, Wavefront *_wf)
: cu(_cu), wf(_wf)
{
}
ComputeUnit*
GPUExecContext::computeUnit()
{
return cu;
}
Wavefront*
GPUExecContext::wavefront()
{
return wf;
}

View file

@ -0,0 +1,54 @@
/*
* Copyright (c) 2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Anthony Gutierrez
*/
#ifndef __GPU_EXEC_CONTEXT_HH__
#define __GPU_EXEC_CONTEXT_HH__
class ComputeUnit;
class Wavefront;
class GPUExecContext
{
public:
GPUExecContext(ComputeUnit *_cu, Wavefront *_wf);
Wavefront* wavefront();
ComputeUnit* computeUnit();
protected:
ComputeUnit *cu;
Wavefront *wf;
};
#endif // __GPU_EXEC_CONTEXT_HH__

View file

@ -0,0 +1,42 @@
/*
* Copyright (c) 2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Anthony Gutierrez
*/
#include "gpu-compute/gpu_static_inst.hh"
GPUStaticInst::GPUStaticInst(const std::string &opcode)
: o_type(Enums::OT_ALU), executed_as(Enums::SC_NONE), opcode(opcode),
_instNum(0), _scalarOp(false)
{
}

View file

@ -0,0 +1,166 @@
/*
* Copyright (c) 2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Anthony Gutierrez
*/
#ifndef __GPU_STATIC_INST_HH__
#define __GPU_STATIC_INST_HH__
/*
* @file gpu_static_inst.hh
*
* Defines the base class representing static instructions for the GPU. The
* instructions are "static" because they contain no dynamic instruction
* information. GPUStaticInst corresponds to the StaticInst class for the CPU
* models.
*/
#include <cstdint>
#include <string>
#include "enums/OpType.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/misc.hh"
class BaseOperand;
class BaseRegOperand;
class Wavefront;
class GPUStaticInst
{
public:
GPUStaticInst(const std::string &opcode);
void instNum(int num) { _instNum = num; }
int instNum() { return _instNum; }
void ipdInstNum(int num) { _ipdInstNum = num; }
int ipdInstNum() const { return _ipdInstNum; }
virtual void execute(GPUDynInstPtr gpuDynInst) = 0;
virtual void generateDisassembly() = 0;
virtual const std::string &disassemble() = 0;
virtual int getNumOperands() = 0;
virtual bool isCondRegister(int operandIndex) = 0;
virtual bool isScalarRegister(int operandIndex) = 0;
virtual bool isVectorRegister(int operandIndex) = 0;
virtual bool isSrcOperand(int operandIndex) = 0;
virtual bool isDstOperand(int operandIndex) = 0;
virtual int getOperandSize(int operandIndex) = 0;
virtual int getRegisterIndex(int operandIndex) = 0;
virtual int numDstRegOperands() = 0;
virtual int numSrcRegOperands() = 0;
/*
* Most instructions (including all HSAIL instructions)
* are vector ops, so _scalarOp will be false by default.
* Derived instruction objects that are scalar ops must
* set _scalarOp to true in their constructors.
*/
bool scalarOp() const { return _scalarOp; }
virtual bool isLocalMem() const
{
fatal("calling isLocalMem() on non-memory instruction.\n");
return false;
}
bool isArgLoad() { return false; }
virtual uint32_t instSize() = 0;
// only used for memory instructions
virtual void
initiateAcc(GPUDynInstPtr gpuDynInst)
{
fatal("calling initiateAcc() on a non-memory instruction.\n");
}
virtual uint32_t getTargetPc() { return 0; }
/**
* Query whether the instruction is an unconditional jump i.e., the jump
* is always executed because there is no condition to be evaluated.
*
* If the instruction is not of branch type, the result is always false.
*
* @return True if the instruction is an unconditional jump.
*/
virtual bool unconditionalJumpInstruction() { return false; }
static uint64_t dynamic_id_count;
Enums::OpType o_type;
// For flat memory accesses
Enums::StorageClassType executed_as;
protected:
virtual void
execLdAcq(GPUDynInstPtr gpuDynInst)
{
fatal("calling execLdAcq() on a non-load instruction.\n");
}
virtual void
execSt(GPUDynInstPtr gpuDynInst)
{
        fatal("calling execSt() on a non-store instruction.\n");
}
virtual void
execAtomic(GPUDynInstPtr gpuDynInst)
{
fatal("calling execAtomic() on a non-atomic instruction.\n");
}
virtual void
execAtomicAcq(GPUDynInstPtr gpuDynInst)
{
fatal("calling execAtomicAcq() on a non-atomic instruction.\n");
}
const std::string opcode;
std::string disassembly;
int _instNum;
/**
* Identifier of the immediate post-dominator instruction.
*/
int _ipdInstNum;
bool _scalarOp;
};
#endif // __GPU_STATIC_INST_HH__
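
For orientation, the sketch below shows the smallest shape a concrete subclass could take under the interface declared above. NopInst is purely illustrative (the real instruction classes live in the ISA code added elsewhere in this commit); every operand query is stubbed out simply to show which hooks must be supplied.

// Sketch only: a minimal, do-nothing GPUStaticInst subclass.
class NopInst : public GPUStaticInst
{
  public:
    NopInst() : GPUStaticInst("nop") { o_type = Enums::OT_ALU; }

    void execute(GPUDynInstPtr gpuDynInst) override { }
    void generateDisassembly() override { disassembly = opcode; }
    const std::string &disassemble() override { return disassembly; }

    int getNumOperands() override { return 0; }
    bool isCondRegister(int) override { return false; }
    bool isScalarRegister(int) override { return false; }
    bool isVectorRegister(int) override { return false; }
    bool isSrcOperand(int) override { return false; }
    bool isDstOperand(int) override { return false; }
    int getOperandSize(int) override { return 0; }
    int getRegisterIndex(int) override { return 0; }
    int numDstRegOperands() override { return 0; }
    int numSrcRegOperands() override { return 0; }
    uint32_t instSize() override { return 0; }
};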

1801
src/gpu-compute/gpu_tlb.cc Normal file

File diff suppressed because it is too large

465
src/gpu-compute/gpu_tlb.hh Normal file
View file

@ -0,0 +1,465 @@
/*
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Lisa Hsu
*/
#ifndef __GPU_TLB_HH__
#define __GPU_TLB_HH__
#include <fstream>
#include <list>
#include <queue>
#include <string>
#include <vector>
#include "arch/generic/tlb.hh"
#include "arch/x86/pagetable.hh"
#include "arch/x86/pagetable_walker.hh"
#include "arch/x86/regs/segment.hh"
#include "base/callback.hh"
#include "base/misc.hh"
#include "base/statistics.hh"
#include "gpu-compute/compute_unit.hh"
#include "mem/mem_object.hh"
#include "mem/port.hh"
#include "mem/request.hh"
#include "params/X86GPUTLB.hh"
#include "sim/sim_object.hh"
class BaseTLB;
class Packet;
class ThreadContext;
namespace X86ISA
{
class GpuTlbEntry : public TlbEntry
{
public:
GpuTlbEntry(Addr asn, Addr _vaddr, Addr _paddr, bool _valid)
: TlbEntry(asn, _vaddr, _paddr, false, false), valid(_valid) { }
GpuTlbEntry() : TlbEntry() { }
bool valid;
};
class GpuTLB : public MemObject
{
protected:
friend class Walker;
typedef std::list<GpuTlbEntry*> EntryList;
uint32_t configAddress;
// TLB clock: will inherit clock from shader's clock period in terms
    // of number of ticks of curTime (aka global simulation clock)
// The assignment of TLB clock from shader clock is done in the python
// config files.
int clock;
public:
    // clock related functions; map to and from simulation ticks and
// object clocks.
Tick frequency() const { return SimClock::Frequency / clock; }
Tick
ticks(int numCycles) const
{
return (Tick)clock * numCycles;
}
Tick curCycle() const { return curTick() / clock; }
Tick tickToCycles(Tick val) const { return val / clock;}
typedef X86GPUTLBParams Params;
GpuTLB(const Params *p);
~GpuTLB();
typedef enum BaseTLB::Mode Mode;
class Translation
{
public:
virtual ~Translation() { }
/**
* Signal that the translation has been delayed due to a hw page
* table walk.
*/
virtual void markDelayed() = 0;
/**
* The memory for this object may be dynamically allocated, and it
         * may be responsible for cleaning itself up, which will happen in
* this function. Once it's called the object is no longer valid.
*/
virtual void finish(Fault fault, RequestPtr req, ThreadContext *tc,
Mode mode) = 0;
};
void dumpAll();
GpuTlbEntry *lookup(Addr va, bool update_lru=true);
void setConfigAddress(uint32_t addr);
protected:
EntryList::iterator lookupIt(Addr va, bool update_lru=true);
Walker *walker;
public:
Walker *getWalker();
void invalidateAll();
void invalidateNonGlobal();
void demapPage(Addr va, uint64_t asn);
protected:
int size;
int assoc;
int numSets;
/**
* true if this is a fully-associative TLB
*/
bool FA;
Addr setMask;
/**
* Allocation Policy: true if we always allocate on a hit, false
* otherwise. Default is true.
*/
bool allocationPolicy;
/**
* if true, then this is not the last level TLB
*/
bool hasMemSidePort;
/**
* Print out accessDistance stats. One stat file
* per TLB.
*/
bool accessDistance;
GpuTlbEntry *tlb;
/*
* It's a per-set list. As long as we have not reached
* the full capacity of the given set, grab an entry from
* the freeList.
*/
std::vector<EntryList> freeList;
/**
* An entryList per set is the equivalent of an LRU stack;
* it's used to guide replacement decisions. The head of the list
* contains the MRU TLB entry of the given set. If the freeList
* for this set is empty, the last element of the list
* is evicted (i.e., dropped on the floor).
*/
std::vector<EntryList> entryList;
Fault translateInt(RequestPtr req, ThreadContext *tc);
Fault translate(RequestPtr req, ThreadContext *tc,
Translation *translation, Mode mode, bool &delayedResponse,
bool timing, int &latency);
public:
// latencies for a TLB hit, miss and page fault
int hitLatency;
int missLatency1;
int missLatency2;
// local_stats are as seen from the TLB
// without taking into account coalescing
Stats::Scalar localNumTLBAccesses;
Stats::Scalar localNumTLBHits;
Stats::Scalar localNumTLBMisses;
Stats::Formula localTLBMissRate;
// global_stats are as seen from the
// CU's perspective taking into account
// all coalesced requests.
Stats::Scalar globalNumTLBAccesses;
Stats::Scalar globalNumTLBHits;
Stats::Scalar globalNumTLBMisses;
Stats::Formula globalTLBMissRate;
// from the CU perspective (global)
Stats::Scalar accessCycles;
// from the CU perspective (global)
Stats::Scalar pageTableCycles;
Stats::Scalar numUniquePages;
// from the perspective of this TLB
Stats::Scalar localCycles;
// from the perspective of this TLB
Stats::Formula localLatency;
// I take the avg. per page and then
// the avg. over all pages.
Stats::Scalar avgReuseDistance;
void regStats();
void updatePageFootprint(Addr virt_page_addr);
void printAccessPattern();
Fault translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode,
int &latency);
void translateTiming(RequestPtr req, ThreadContext *tc,
Translation *translation, Mode mode,
int &latency);
Tick doMmuRegRead(ThreadContext *tc, Packet *pkt);
Tick doMmuRegWrite(ThreadContext *tc, Packet *pkt);
GpuTlbEntry *insert(Addr vpn, GpuTlbEntry &entry);
// Checkpointing
virtual void serialize(CheckpointOut& cp) const;
virtual void unserialize(CheckpointIn& cp);
void issueTranslation();
enum tlbOutcome {TLB_HIT, TLB_MISS, PAGE_WALK, MISS_RETURN};
bool tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats);
void handleTranslationReturn(Addr addr, tlbOutcome outcome,
PacketPtr pkt);
void handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome outcome);
void pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
GpuTlbEntry *tlb_entry, Mode mode);
void updatePhysAddresses(Addr virt_page_addr, GpuTlbEntry *tlb_entry,
Addr phys_page_addr);
void issueTLBLookup(PacketPtr pkt);
// CpuSidePort is the TLB Port closer to the CPU/CU side
class CpuSidePort : public SlavePort
{
public:
CpuSidePort(const std::string &_name, GpuTLB * gpu_TLB,
PortID _index)
: SlavePort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }
protected:
GpuTLB *tlb;
int index;
virtual bool recvTimingReq(PacketPtr pkt);
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
virtual void recvFunctional(PacketPtr pkt);
virtual void recvRangeChange() { }
virtual void recvReqRetry();
virtual void recvRespRetry() { assert(false); }
virtual AddrRangeList getAddrRanges() const;
};
/**
* MemSidePort is the TLB Port closer to the memory side
* If this is a last level TLB then this port will not be connected.
*
* Future action item: if we ever do real page walks, then this port
* should be connected to a RubyPort.
*/
class MemSidePort : public MasterPort
{
public:
MemSidePort(const std::string &_name, GpuTLB * gpu_TLB,
PortID _index)
: MasterPort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }
std::deque<PacketPtr> retries;
protected:
GpuTLB *tlb;
int index;
virtual bool recvTimingResp(PacketPtr pkt);
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
virtual void recvFunctional(PacketPtr pkt) { }
virtual void recvRangeChange() { }
virtual void recvReqRetry();
};
// TLB ports on the cpu Side
std::vector<CpuSidePort*> cpuSidePort;
// TLB ports on the memory side
std::vector<MemSidePort*> memSidePort;
BaseMasterPort &getMasterPort(const std::string &if_name,
PortID idx=InvalidPortID);
BaseSlavePort &getSlavePort(const std::string &if_name,
PortID idx=InvalidPortID);
/**
     * TLB TranslationState: this is currently a somewhat unusual use of
     * SenderState, in that the receiver of a packet is not normally
     * supposed to look at the contents of the senderState; you are really
     * only supposed to look at what you pushed on, pop it off, and send
     * it back.
*
* However, since there is state that we want to pass to the TLBs using
* the send/recv Timing/Functional/etc. APIs, which don't allow for new
* arguments, we need a common TLB senderState to pass between TLBs,
* both "forwards" and "backwards."
*
* So, basically, the rule is that any packet received by a TLB port
* (cpuside OR memside) must be safely castable to a TranslationState.
*/
struct TranslationState : public Packet::SenderState
{
// TLB mode, read or write
Mode tlbMode;
// Thread context associated with this req
ThreadContext *tc;
/*
* TLB entry to be populated and passed back and filled in
* previous TLBs. Equivalent to the data cache concept of
* "data return."
*/
GpuTlbEntry *tlbEntry;
// Is this a TLB prefetch request?
bool prefetch;
// When was the req for this translation issued
uint64_t issueTime;
// Remember where this came from
        std::vector<SlavePort*> ports;
// keep track of #uncoalesced reqs per packet per TLB level;
// reqCnt per level >= reqCnt higher level
std::vector<int> reqCnt;
// TLB level this packet hit in; 0 if it hit in the page table
int hitLevel;
Packet::SenderState *saved;
TranslationState(Mode tlb_mode, ThreadContext *_tc,
bool _prefetch=false,
Packet::SenderState *_saved=nullptr)
: tlbMode(tlb_mode), tc(_tc), tlbEntry(nullptr),
prefetch(_prefetch), issueTime(0),
              hitLevel(0), saved(_saved) { }
};
// maximum number of permitted coalesced requests per cycle
int maxCoalescedReqs;
    // Current number of outstanding coalesced requests.
// Should be <= maxCoalescedReqs
int outstandingReqs;
/**
* A TLBEvent is scheduled after the TLB lookup and helps us take the
* appropriate actions:
* (e.g., update TLB on a hit,
* send request to lower level TLB on a miss,
* or start a page walk if this was the last-level TLB).
*/
void translationReturn(Addr virtPageAddr, tlbOutcome outcome,
PacketPtr pkt);
class TLBEvent : public Event
{
private:
GpuTLB *tlb;
Addr virtPageAddr;
/**
* outcome can be TLB_HIT, TLB_MISS, or PAGE_WALK
*/
tlbOutcome outcome;
PacketPtr pkt;
public:
TLBEvent(GpuTLB *_tlb, Addr _addr, tlbOutcome outcome,
PacketPtr _pkt);
void process();
const char *description() const;
// updateOutcome updates the tlbOutcome of a TLBEvent
void updateOutcome(tlbOutcome _outcome);
Addr getTLBEventVaddr();
};
std::unordered_map<Addr, TLBEvent*> translationReturnEvent;
// this FIFO queue keeps track of the virt. page addresses
// that are pending cleanup
std::queue<Addr> cleanupQueue;
// the cleanupEvent is scheduled after a TLBEvent triggers in order to
// free memory and do the required clean-up
void cleanup();
EventWrapper<GpuTLB, &GpuTLB::cleanup> cleanupEvent;
/**
* This hash map will use the virtual page address as a key
* and will keep track of total number of accesses per page
*/
struct AccessInfo
{
unsigned int lastTimeAccessed; // last access to this page
unsigned int accessesPerPage;
// need to divide it by accessesPerPage at the end
unsigned int totalReuseDistance;
/**
* The field below will help us compute the access distance,
* that is the number of (coalesced) TLB accesses that
* happened in between each access to this page
*
* localTLBAccesses[x] is the value of localTLBNumAccesses
* when the page <Addr> was accessed for the <x>th time
*/
std::vector<unsigned int> localTLBAccesses;
unsigned int sumDistance;
unsigned int meanDistance;
};
typedef std::unordered_map<Addr, AccessInfo> AccessPatternTable;
AccessPatternTable TLBFootprint;
// Called at the end of simulation to dump page access stats.
void exitCallback();
EventWrapper<GpuTLB, &GpuTLB::exitCallback> exitEvent;
};
}
#endif // __GPU_TLB_HH__
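
The freeList/entryList comments above describe a per-set arrangement in which free entries are consumed first and, once a set is full, the tail of that set's LRU list is evicted. The standalone sketch below restates that policy with simplified types; SetAssocArray and Entry are illustrative names, not part of the TLB code.

// Standalone sketch (not code from this commit) of the per-set
// free-list / LRU-list replacement arrangement described above.
#include <cstdint>
#include <list>
#include <vector>

struct Entry { uint64_t vpn = 0; bool valid = false; };

class SetAssocArray
{
  public:
    SetAssocArray(int num_sets, int assoc)
        : storage(num_sets * assoc), freeList(num_sets), lruList(num_sets)
    {
        for (int set = 0; set < num_sets; ++set)
            for (int way = 0; way < assoc; ++way)
                freeList[set].push_back(&storage[set * assoc + way]);
    }

    Entry *
    allocate(int set)
    {
        Entry *victim;
        if (!freeList[set].empty()) {
            // grab a never-used entry while the set still has one
            victim = freeList[set].front();
            freeList[set].pop_front();
        } else {
            // otherwise evict the least-recently used entry of the set
            victim = lruList[set].back();
            lruList[set].pop_back();
        }
        lruList[set].push_front(victim);   // the new entry becomes MRU
        return victim;
    }

  private:
    std::vector<Entry> storage;
    std::vector<std::list<Entry*>> freeList;
    std::vector<std::list<Entry*>> lruList;
};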

101
src/gpu-compute/hsa_code.hh Normal file
View file

@ -0,0 +1,101 @@
/*
* Copyright (c) 2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Anthony Gutierrez
*/
#ifndef __HSA_CODE_HH__
#define __HSA_CODE_HH__
#include <string>
#include <vector>
#include "arch/gpu_types.hh"
#include "config/the_gpu_isa.hh"
class HsaKernelInfo;
/* @class HsaCode
* base code object for the set of HSA kernels associated
 * with a single application. This class provides the common
* methods for creating, accessing, and storing information
* about kernel and variable symbols, symbol name, memory
* segment sizes, and instruction count, etc.
*/
class HsaCode
{
public:
HsaCode(const std::string &name) : readonly_data(nullptr), funcarg_size(0),
_name(name)
{
}
enum class MemorySegment {
NONE,
FLAT,
GLOBAL,
READONLY,
KERNARG,
GROUP,
PRIVATE,
SPILL,
ARG,
EXTSPACE0
};
const std::string& name() const { return _name; }
int numInsts() const { return _insts.size(); }
std::vector<TheGpuISA::RawMachInst>* insts() { return &_insts; }
void
setReadonlyData(uint8_t *_readonly_data)
{
readonly_data = _readonly_data;
}
virtual int getSize(MemorySegment segment) const = 0;
virtual void generateHsaKernelInfo(HsaKernelInfo *hsaKernelInfo) const = 0;
uint8_t *readonly_data;
int funcarg_size;
protected:
// An array that stores instruction indices (0 through kernel size)
    // for a kernel passed to the code object constructor as an argument.
std::vector<TheGpuISA::RawMachInst> _insts;
private:
const std::string _name;
};
#endif // __HSA_CODE_HH__

View file

@ -0,0 +1,79 @@
/*
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Steve Reinhardt
*/
#ifndef __HSA_KERNEL_INFO_HH__
#define __HSA_KERNEL_INFO_HH__
// This file defines the public interface between the HSA emulated
// driver and application programs.
#include <cstdint>
static const int HSA_GET_SIZES = 0x4801;
static const int HSA_GET_KINFO = 0x4802;
static const int HSA_GET_STRINGS = 0x4803;
static const int HSA_GET_CODE = 0x4804;
static const int HSA_GET_READONLY_DATA = 0x4805;
static const int HSA_GET_CU_CNT = 0x4806;
static const int HSA_GET_VSZ = 0x4807;
// Return value (via buffer ptr) for HSA_GET_SIZES
struct HsaDriverSizes
{
uint32_t num_kernels;
uint32_t string_table_size;
uint32_t code_size;
uint32_t readonly_size;
};
// HSA_GET_KINFO returns an array of num_kernels of these structs
struct HsaKernelInfo
{
// byte offset into string table
uint32_t name_offs;
// byte offset into code array
uint32_t code_offs;
uint32_t static_lds_size;
uint32_t private_mem_size;
uint32_t spill_mem_size;
// Number of s registers
uint32_t sRegCount;
// Number of d registers
uint32_t dRegCount;
// Number of c registers
uint32_t cRegCount;
};
#endif // __HSA_KERNEL_INFO_HH__
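
These request codes imply a two-step query pattern: an application first asks the emulated driver for the aggregate sizes, then allocates its buffers before issuing the follow-up requests. The fragment below sketches that first step; treating the interface as an ioctl() on an already-opened driver descriptor is an assumption made here purely for illustration.

// Hypothetical query helper: 'driver_fd' is assumed to be an open descriptor
// for the emulated driver; the driver fills in the HsaDriverSizes struct.
#include <sys/ioctl.h>

bool
queryDriverSizes(int driver_fd, HsaDriverSizes &sizes)
{
    return ioctl(driver_fd, HSA_GET_SIZES, &sizes) == 0;
}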

View file

@ -0,0 +1,76 @@
/*
* Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Anthony Gutierrez
*/
#include "gpu-compute/hsa_object.hh"
#include <fstream>
#include "gpu-compute/brig_object.hh"
HsaObject::HsaObject(const std::string &fname)
: readonlyData(nullptr), filename(fname)
{
}
HsaObject*
HsaObject::createHsaObject(const std::string &fname)
{
HsaObject *hsaObj = nullptr;
uint8_t *file_data = nullptr;
int file_length = 0;
std::ifstream code_file(fname, std::ifstream::ate | std::ifstream::in |
std::ifstream::binary);
assert(code_file.is_open());
assert(code_file.good());
file_length = code_file.tellg();
code_file.seekg(0, code_file.beg);
file_data = new uint8_t[file_length];
code_file.read((char*)file_data, file_length);
code_file.close();
for (const auto &tryFile : tryFileFuncs) {
if ((hsaObj = tryFile(fname, file_length, file_data))) {
return hsaObj;
}
}
delete[] file_data;
fatal("Unknown HSA object type for file: %s.\n", fname);
return nullptr;
}
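
createHsaObject() above is a small format-detection registry: each entry of tryFileFuncs inspects the raw file bytes and either claims the file or defers to the next probe. The sketch below shows the shape such a probe could take; the magic number and the name tryMyFormat are hypothetical, and a real probe (like the BRIG loader this commit adds) would return a concrete HsaObject subclass instead of nullptr when it recognizes the file.

// Sketch only: the shape of a probe function that could sit in tryFileFuncs.
#include <cstring>

HsaObject*
tryMyFormat(const std::string &fname, int len, uint8_t *data)
{
    // hypothetical 4-byte magic number for the format this probe recognizes
    static const uint8_t magic[4] = {'M', 'Y', 'F', 'T'};

    if (len < (int)sizeof(magic) || std::memcmp(data, magic, sizeof(magic)))
        return nullptr;   // not ours; createHsaObject() tries the next probe

    // a real probe would construct and return its format-specific HsaObject
    // subclass here; this sketch stops short of that
    return nullptr;
}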

View file

@ -0,0 +1,74 @@
/*
* Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Anthony Gutierrez
*/
#ifndef __HSA_OBJECT_HH__
#define __HSA_OBJECT_HH__
#include <functional>
#include <string>
#include <vector>
class HsaCode;
/* @class HsaObject
 * Base loader object for HSA kernels. This class provides
 * the base method definitions for loading HSA kernel objects
 * into the simulator and for storing and accessing them.
*/
class HsaObject
{
public:
HsaObject(const std::string &fileName);
static HsaObject* createHsaObject(const std::string &fname);
static std::vector<std::function<HsaObject*(const std::string&, int,
uint8_t*)>> tryFileFuncs;
virtual HsaCode* getKernel(const std::string &name) const = 0;
virtual HsaCode* getKernel(int i) const = 0;
virtual HsaCode* getFunction(const std::string &name) const = 0;
virtual int numKernels() const = 0;
const std::string& name() const { return filename; }
uint8_t *readonlyData;
protected:
const std::string filename;
};
#endif // __HSA_OBJECT_HH__

View file

@ -0,0 +1,453 @@
/*
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Steve Reinhardt
*/
#include "gpu-compute/hsail_code.hh"
#include "arch/gpu_types.hh"
#include "arch/hsail/Brig.h"
#include "arch/hsail/operand.hh"
#include "config/the_gpu_isa.hh"
#include "debug/BRIG.hh"
#include "debug/HSAILObject.hh"
#include "gpu-compute/brig_object.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/kernel_cfg.hh"
using namespace Brig;
int getBrigDataTypeBytes(BrigType16_t t);
HsailCode::HsailCode(const std::string &name_str)
: HsaCode(name_str), private_size(-1), readonly_size(-1)
{
}
void
HsailCode::init(const BrigDirectiveExecutable *code_dir, const BrigObject *obj,
StorageMap *objStorageMap)
{
storageMap = objStorageMap;
// set pointer so that decoding process can find this kernel context when
// needed
obj->currentCode = this;
if (code_dir->base.kind != BRIG_KIND_DIRECTIVE_FUNCTION &&
code_dir->base.kind != BRIG_KIND_DIRECTIVE_KERNEL) {
fatal("unexpected directive kind %d inside kernel/function init\n",
code_dir->base.kind);
}
DPRINTF(HSAILObject, "Initializing code, first code block entry is: %d\n",
code_dir->firstCodeBlockEntry);
// clear these static vars so we can properly track the max index
// for this kernel
SRegOperand::maxRegIdx = 0;
DRegOperand::maxRegIdx = 0;
CRegOperand::maxRegIdx = 0;
setPrivateSize(0);
const BrigBase *entryPtr = brigNext((BrigBase*)code_dir);
const BrigBase *endPtr =
obj->getCodeSectionEntry(code_dir->nextModuleEntry);
int inst_idx = 0;
std::vector<GPUStaticInst*> instructions;
int funcarg_size_scope = 0;
// walk through instructions in code section and directives in
// directive section in parallel, processing directives that apply
// when we reach the relevant code point.
while (entryPtr < endPtr) {
switch (entryPtr->kind) {
case BRIG_KIND_DIRECTIVE_VARIABLE:
{
const BrigDirectiveVariable *sym =
(const BrigDirectiveVariable*)entryPtr;
DPRINTF(HSAILObject,"Initializing code, directive is "
"kind_variable, symbol is: %s\n",
obj->getString(sym->name));
StorageElement *se = storageMap->addSymbol(sym, obj);
if (sym->segment == BRIG_SEGMENT_PRIVATE) {
setPrivateSize(se->size);
} else { // spill
funcarg_size_scope += se->size;
}
}
break;
case BRIG_KIND_DIRECTIVE_LABEL:
{
const BrigDirectiveLabel *lbl =
(const BrigDirectiveLabel*)entryPtr;
DPRINTF(HSAILObject,"Initializing code, directive is "
"kind_label, label is: %s \n",
obj->getString(lbl->name));
labelMap.addLabel(lbl, inst_idx, obj);
}
break;
case BRIG_KIND_DIRECTIVE_PRAGMA:
{
DPRINTF(HSAILObject, "Initializing code, directive "
"is kind_pragma\n");
}
break;
case BRIG_KIND_DIRECTIVE_COMMENT:
{
DPRINTF(HSAILObject, "Initializing code, directive is "
"kind_comment\n");
}
break;
case BRIG_KIND_DIRECTIVE_ARG_BLOCK_START:
{
DPRINTF(HSAILObject, "Initializing code, directive is "
"kind_arg_block_start\n");
storageMap->resetOffset(BRIG_SEGMENT_ARG);
funcarg_size_scope = 0;
}
break;
case BRIG_KIND_DIRECTIVE_ARG_BLOCK_END:
{
DPRINTF(HSAILObject, "Initializing code, directive is "
"kind_arg_block_end\n");
funcarg_size = funcarg_size < funcarg_size_scope ?
funcarg_size_scope : funcarg_size;
}
break;
case BRIG_KIND_DIRECTIVE_END:
DPRINTF(HSAILObject, "Initializing code, dircetive is "
"kind_end\n");
break;
default:
if (entryPtr->kind >= BRIG_KIND_INST_BEGIN &&
entryPtr->kind <= BRIG_KIND_INST_END) {
BrigInstBase *instPtr = (BrigInstBase*)entryPtr;
TheGpuISA::MachInst machInst = { instPtr, obj };
GPUStaticInst *iptr = decoder.decode(machInst);
if (iptr) {
DPRINTF(HSAILObject, "Initializing code, processing inst "
"#%d idx %d: OPCODE=%d\n",
inst_idx, _insts.size(), instPtr->opcode);
TheGpuISA::RawMachInst inst_num = decoder.saveInst(iptr);
iptr->instNum(inst_idx);
_insts.push_back(inst_num);
instructions.push_back(iptr);
}
++inst_idx;
} else if (entryPtr->kind >= BRIG_KIND_OPERAND_BEGIN &&
entryPtr->kind < BRIG_KIND_OPERAND_END) {
warn("unexpected operand entry in code segment\n");
} else {
// there are surely some more cases we will need to handle,
// but we'll deal with them as we find them.
fatal("unexpected directive kind %d inside kernel scope\n",
entryPtr->kind);
}
}
entryPtr = brigNext(entryPtr);
}
// compute Control Flow Graph for current kernel
ControlFlowInfo::assignImmediatePostDominators(instructions);
max_sreg = SRegOperand::maxRegIdx;
max_dreg = DRegOperand::maxRegIdx;
max_creg = CRegOperand::maxRegIdx;
obj->currentCode = nullptr;
}
HsailCode::HsailCode(const std::string &name_str,
const BrigDirectiveExecutable *code_dir,
const BrigObject *obj, StorageMap *objStorageMap)
: HsaCode(name_str), private_size(-1), readonly_size(-1)
{
init(code_dir, obj, objStorageMap);
}
void
LabelMap::addLabel(const Brig::BrigDirectiveLabel *lblDir, int inst_index,
const BrigObject *obj)
{
std::string lbl_name = obj->getString(lblDir->name);
Label &lbl = map[lbl_name];
if (lbl.defined()) {
fatal("Attempt to redefine existing label %s\n", lbl_name);
}
lbl.define(lbl_name, inst_index);
DPRINTF(HSAILObject, "label %s = %d\n", lbl_name, inst_index);
}
Label*
LabelMap::refLabel(const Brig::BrigDirectiveLabel *lblDir,
const BrigObject *obj)
{
std::string name = obj->getString(lblDir->name);
Label &lbl = map[name];
lbl.checkName(name);
return &lbl;
}
int
getBrigDataTypeBytes(BrigType16_t t)
{
switch (t) {
case BRIG_TYPE_S8:
case BRIG_TYPE_U8:
case BRIG_TYPE_B8:
return 1;
case BRIG_TYPE_S16:
case BRIG_TYPE_U16:
case BRIG_TYPE_B16:
case BRIG_TYPE_F16:
return 2;
case BRIG_TYPE_S32:
case BRIG_TYPE_U32:
case BRIG_TYPE_B32:
case BRIG_TYPE_F32:
return 4;
case BRIG_TYPE_S64:
case BRIG_TYPE_U64:
case BRIG_TYPE_B64:
case BRIG_TYPE_F64:
return 8;
case BRIG_TYPE_B1:
default:
fatal("unhandled symbol data type %d", t);
return 0;
}
}
StorageElement*
StorageSpace::addSymbol(const BrigDirectiveVariable *sym,
const BrigObject *obj)
{
const char *sym_name = obj->getString(sym->name);
uint64_t size = 0;
uint64_t offset = 0;
if (sym->type & BRIG_TYPE_ARRAY) {
size = getBrigDataTypeBytes(sym->type & ~BRIG_TYPE_ARRAY);
size *= (((uint64_t)sym->dim.hi) << 32 | (uint64_t)sym->dim.lo);
offset = roundUp(nextOffset, getBrigDataTypeBytes(sym->type &
~BRIG_TYPE_ARRAY));
} else {
size = getBrigDataTypeBytes(sym->type);
offset = roundUp(nextOffset, getBrigDataTypeBytes(sym->type));
}
nextOffset = offset + size;
DPRINTF(HSAILObject, "Adding %s SYMBOL %s size %d offset 0x%x, init: %d\n",
segmentNames[segment], sym_name, size, offset, sym->init);
StorageElement* se = new StorageElement(sym_name, offset, size, sym);
elements.push_back(se);
elements_by_addr.insert(AddrRange(offset, offset + size - 1), se);
elements_by_brigptr[sym] = se;
return se;
}
StorageElement*
StorageSpace::findSymbol(std::string name)
{
for (auto it : elements) {
if (it->name == name) {
return it;
}
}
return nullptr;
}
StorageElement*
StorageSpace::findSymbol(uint64_t addr)
{
assert(elements_by_addr.size() > 0);
auto se = elements_by_addr.find(addr);
if (se == elements_by_addr.end()) {
return nullptr;
} else {
return se->second;
}
}
StorageElement*
StorageSpace::findSymbol(const BrigDirectiveVariable *brigptr)
{
assert(elements_by_brigptr.size() > 0);
auto se = elements_by_brigptr.find(brigptr);
if (se == elements_by_brigptr.end()) {
return nullptr;
} else {
return se->second;
}
}
StorageMap::StorageMap(StorageMap *outerScope)
: outerScopeMap(outerScope)
{
for (int i = 0; i < NumSegments; ++i)
space[i] = new StorageSpace((BrigSegment)i);
}
StorageElement*
StorageMap::addSymbol(const BrigDirectiveVariable *sym, const BrigObject *obj)
{
BrigSegment8_t segment = sym->segment;
assert(segment >= Brig::BRIG_SEGMENT_FLAT);
assert(segment < NumSegments);
return space[segment]->addSymbol(sym, obj);
}
int
StorageMap::getSize(Brig::BrigSegment segment)
{
assert(segment > Brig::BRIG_SEGMENT_GLOBAL);
assert(segment < NumSegments);
if (segment != Brig::BRIG_SEGMENT_GROUP &&
segment != Brig::BRIG_SEGMENT_READONLY) {
return space[segment]->getSize();
} else {
int ret = space[segment]->getSize();
if (outerScopeMap) {
ret += outerScopeMap->getSize(segment);
}
return ret;
}
}
void
StorageMap::resetOffset(Brig::BrigSegment segment)
{
space[segment]->resetOffset();
}
StorageElement*
StorageMap::findSymbol(BrigSegment segment, std::string name)
{
StorageElement *se = space[segment]->findSymbol(name);
if (se)
return se;
if (outerScopeMap)
return outerScopeMap->findSymbol(segment, name);
return nullptr;
}
StorageElement*
StorageMap::findSymbol(Brig::BrigSegment segment, uint64_t addr)
{
StorageSpace *sp = space[segment];
if (!sp) {
// there is no memory in segment?
return nullptr;
}
StorageElement *se = sp->findSymbol(addr);
if (se)
return se;
if (outerScopeMap)
return outerScopeMap->findSymbol(segment, addr);
return nullptr;
}
StorageElement*
StorageMap::findSymbol(Brig::BrigSegment segment,
const BrigDirectiveVariable *brigptr)
{
StorageSpace *sp = space[segment];
if (!sp) {
// there is no memory in segment?
return nullptr;
}
StorageElement *se = sp->findSymbol(brigptr);
if (se)
return se;
if (outerScopeMap)
return outerScopeMap->findSymbol(segment, brigptr);
return nullptr;
}
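
The symbol-placement rule in StorageSpace::addSymbol() above is simple: round the running segment offset up to the element size, then advance it by the element size (times the 64-bit dim field for arrays). A standalone sketch of that arithmetic, with roundUpTo standing in for gem5's roundUp() from base/intmath.hh:

    #include <cstdint>
    #include <cstdio>

    // local stand-in for roundUp(): round v up to the next multiple of align
    static uint64_t
    roundUpTo(uint64_t v, uint64_t align)
    {
        return ((v + align - 1) / align) * align;
    }

    int
    main()
    {
        uint64_t nextOffset = 0;
        // a u16 scalar: element size 2 -> placed at offset 0, nextOffset = 2
        uint64_t off_a = roundUpTo(nextOffset, 2);
        nextOffset = off_a + 2;
        // a u32 array with dim = 4: element size 4, total size 16
        // -> offset rounds 2 up to 4, nextOffset = 20
        uint64_t off_b = roundUpTo(nextOffset, 4);
        nextOffset = off_b + 4 * 4;
        std::printf("a@%llu b@%llu next=%llu\n",
                    (unsigned long long)off_a, (unsigned long long)off_b,
                    (unsigned long long)nextOffset);
        return 0;
    }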

View file

@ -0,0 +1,447 @@
/*
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Steve Reinhardt
*/
#ifndef __HSAIL_CODE_HH__
#define __HSAIL_CODE_HH__
#include <cassert>
#include <list>
#include <map>
#include <string>
#include <vector>
#include "arch/gpu_decoder.hh"
#include "arch/hsail/Brig.h"
#include "base/addr_range_map.hh"
#include "base/intmath.hh"
#include "config/the_gpu_isa.hh"
#include "gpu-compute/hsa_code.hh"
#include "gpu-compute/hsa_kernel_info.hh"
#include "gpu-compute/misc.hh"
class BrigObject;
class GPUStaticInst;
inline int
popcount(uint64_t src, int sz)
{
int cnt = 0;
for (int i = 0; i < sz; ++i) {
if (src & 1)
++cnt;
src >>= 1;
}
return cnt;
}
inline int
firstbit(uint64_t src, int sz)
{
int i;
for (i = 0; i < sz; ++i) {
if (src & 1)
break;
src >>= 1;
}
return i;
}
inline int
lastbit(uint64_t src, int sz)
{
int i0 = -1;
for (int i = 0; i < sz; ++i) {
if (src & 1)
i0 = i;
src >>= 1;
}
return i0;
}
inline int
signbit(uint64_t src, int sz)
{
int i0 = -1;
if (src & (1 << (sz - 1))) {
for (int i = 0; i < sz - 1; ++i) {
if (!(src & 1))
i0 = i;
src >>= 1;
}
} else {
for (int i = 0; i < sz - 1; ++i) {
if (src & 1)
i0 = i;
src >>= 1;
}
}
return i0;
}
inline uint64_t
bitrev(uint64_t src, int sz)
{
uint64_t r = 0;
for (int i = 0; i < sz; ++i) {
r <<= 1;
if (src & 1)
r |= 1;
src >>= 1;
}
return r;
}
inline uint64_t
mul_hi(uint32_t a, uint32_t b)
{
return ((uint64_t)a * (uint64_t)b) >> 32;
}
inline uint64_t
mul_hi(int32_t a, int32_t b)
{
return ((int64_t)a * (int64_t)b) >> 32;
}
inline uint64_t
mul_hi(uint64_t a, uint64_t b)
{
return ((uint64_t)a * (uint64_t)b) >> 32;
}
inline uint64_t
mul_hi(int64_t a, int64_t b)
{
return ((int64_t)a * (int64_t)b) >> 32;
}
inline uint64_t
mul_hi(double a, double b)
{
return 0;
}
class Label
{
public:
std::string name;
int value;
Label() : value(-1)
{
}
bool defined() { return value != -1; }
void
checkName(std::string &_name)
{
if (name.empty()) {
name = _name;
} else {
assert(name == _name);
}
}
void
define(std::string &_name, int _value)
{
assert(!defined());
assert(_value != -1);
value = _value;
checkName(_name);
}
int
get()
{
assert(defined());
return value;
}
};
class LabelMap
{
std::map<std::string, Label> map;
public:
LabelMap() { }
void addLabel(const Brig::BrigDirectiveLabel *lbl, int inst_index,
const BrigObject *obj);
Label *refLabel(const Brig::BrigDirectiveLabel *lbl,
const BrigObject *obj);
};
const int NumSegments = Brig::BRIG_SEGMENT_AMD_GCN;
extern const char *segmentNames[];
class StorageElement
{
public:
std::string name;
uint64_t offset;
uint64_t size;
const Brig::BrigDirectiveVariable *brigSymbol;
StorageElement(const char *_name, uint64_t _offset, int _size,
const Brig::BrigDirectiveVariable *sym)
: name(_name), offset(_offset), size(_size), brigSymbol(sym)
{
}
};
class StorageSpace
{
typedef std::map<const Brig::BrigDirectiveVariable*, StorageElement*>
DirVarToSE_map;
std::list<StorageElement*> elements;
AddrRangeMap<StorageElement*> elements_by_addr;
DirVarToSE_map elements_by_brigptr;
uint64_t nextOffset;
Brig::BrigSegment segment;
public:
StorageSpace(Brig::BrigSegment _class)
: nextOffset(0), segment(_class)
{
}
StorageElement *addSymbol(const Brig::BrigDirectiveVariable *sym,
const BrigObject *obj);
StorageElement* findSymbol(std::string name);
StorageElement* findSymbol(uint64_t addr);
StorageElement* findSymbol(const Brig::BrigDirectiveVariable *brigptr);
int getSize() { return nextOffset; }
void resetOffset() { nextOffset = 0; }
};
class StorageMap
{
StorageMap *outerScopeMap;
StorageSpace *space[NumSegments];
public:
StorageMap(StorageMap *outerScope = nullptr);
StorageElement *addSymbol(const Brig::BrigDirectiveVariable *sym,
const BrigObject *obj);
StorageElement* findSymbol(Brig::BrigSegment segment, std::string name);
StorageElement* findSymbol(Brig::BrigSegment segment, uint64_t addr);
StorageElement* findSymbol(Brig::BrigSegment segment,
const Brig::BrigDirectiveVariable *brigptr);
// overloaded version to avoid casting
StorageElement*
findSymbol(Brig::BrigSegment8_t segment, std::string name)
{
return findSymbol((Brig::BrigSegment)segment, name);
}
int getSize(Brig::BrigSegment segment);
void resetOffset(Brig::BrigSegment segment);
};
typedef enum
{
BT_DEFAULT,
BT_B8,
BT_U8,
BT_U16,
BT_U32,
BT_U64,
BT_S8,
BT_S16,
BT_S32,
BT_S64,
BT_F16,
BT_F32,
BT_F64,
BT_NULL
} base_type_e;
/* @class HsailCode
 * The HsailCode class stores information about HSA kernels
 * in the BRIG format. It holds all information about a kernel,
 * function, or variable symbol and provides methods for
 * accessing that information.
*/
class HsailCode final : public HsaCode
{
public:
TheGpuISA::Decoder decoder;
StorageMap *storageMap;
LabelMap labelMap;
uint32_t kernarg_start;
uint32_t kernarg_end;
int32_t private_size;
int32_t readonly_size;
// We track the maximum register index used for each register
// class when we load the code so we can size the register files
// appropriately (i.e., one more than the max index).
uint32_t max_creg; // maximum c-register index
uint32_t max_sreg; // maximum s-register index
uint32_t max_dreg; // maximum d-register index
HsailCode(const std::string &name_str,
const Brig::BrigDirectiveExecutable *code_dir,
const BrigObject *obj,
StorageMap *objStorageMap);
// this version is used to create a placeholder when
// we encounter a kernel-related directive before the
// kernel itself
HsailCode(const std::string &name_str);
void init(const Brig::BrigDirectiveExecutable *code_dir,
const BrigObject *obj, StorageMap *objStorageMap);
void
generateHsaKernelInfo(HsaKernelInfo *hsaKernelInfo) const
{
hsaKernelInfo->sRegCount = max_sreg + 1;
hsaKernelInfo->dRegCount = max_dreg + 1;
hsaKernelInfo->cRegCount = max_creg + 1;
hsaKernelInfo->static_lds_size = getSize(Brig::BRIG_SEGMENT_GROUP);
hsaKernelInfo->private_mem_size =
roundUp(getSize(Brig::BRIG_SEGMENT_PRIVATE), 8);
hsaKernelInfo->spill_mem_size =
roundUp(getSize(Brig::BRIG_SEGMENT_SPILL), 8);
}
int
getSize(MemorySegment segment) const
{
Brig::BrigSegment brigSeg;
switch (segment) {
case MemorySegment::NONE:
brigSeg = Brig::BRIG_SEGMENT_NONE;
break;
case MemorySegment::FLAT:
brigSeg = Brig::BRIG_SEGMENT_FLAT;
break;
case MemorySegment::GLOBAL:
brigSeg = Brig::BRIG_SEGMENT_GLOBAL;
break;
case MemorySegment::READONLY:
brigSeg = Brig::BRIG_SEGMENT_READONLY;
break;
case MemorySegment::KERNARG:
brigSeg = Brig::BRIG_SEGMENT_KERNARG;
break;
case MemorySegment::GROUP:
brigSeg = Brig::BRIG_SEGMENT_GROUP;
break;
case MemorySegment::PRIVATE:
brigSeg = Brig::BRIG_SEGMENT_PRIVATE;
break;
case MemorySegment::SPILL:
brigSeg = Brig::BRIG_SEGMENT_SPILL;
break;
case MemorySegment::ARG:
brigSeg = Brig::BRIG_SEGMENT_ARG;
break;
case MemorySegment::EXTSPACE0:
brigSeg = Brig::BRIG_SEGMENT_AMD_GCN;
break;
default:
fatal("Unknown BrigSegment type.\n");
}
return getSize(brigSeg);
}
private:
int
getSize(Brig::BrigSegment segment) const
{
if (segment == Brig::BRIG_SEGMENT_PRIVATE) {
            // with the code generated by the new HSA compiler the assertion
            // below no longer holds, so it is disabled
//assert(private_size != -1);
return private_size;
} else {
return storageMap->getSize(segment);
}
}
public:
StorageElement*
findSymbol(Brig::BrigSegment segment, uint64_t addr)
{
return storageMap->findSymbol(segment, addr);
}
void
setPrivateSize(int32_t _private_size)
{
private_size = _private_size;
}
Label*
refLabel(const Brig::BrigDirectiveLabel *lbl, const BrigObject *obj)
{
return labelMap.refLabel(lbl, obj);
}
};
#endif // __HSAIL_CODE_HH__

View file

@ -0,0 +1,296 @@
/*
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Steve Reinhardt
*/
#include "gpu-compute/kernel_cfg.hh"
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <iterator>
#include <map>
#include <string>
#include "gpu-compute/gpu_static_inst.hh"
void
ControlFlowInfo::assignImmediatePostDominators(
const std::vector<GPUStaticInst*>& instructions)
{
ControlFlowInfo cfg(instructions);
cfg.findImmediatePostDominators();
}
ControlFlowInfo::ControlFlowInfo(const std::vector<GPUStaticInst*>& insts) :
instructions(insts)
{
createBasicBlocks();
connectBasicBlocks();
}
BasicBlock*
ControlFlowInfo::basicBlock(int inst_num) const
{
for (auto& block: basicBlocks) {
int first_block_id = block->firstInstruction->instNum();
if (inst_num >= first_block_id &&
inst_num < first_block_id + block->size) {
return block.get();
}
}
return nullptr;
}
GPUStaticInst*
ControlFlowInfo::lastInstruction(const BasicBlock* block) const
{
if (block->isExit()) {
return nullptr;
}
return instructions.at(block->firstInstruction->instNum() +
block->size - 1);
}
BasicBlock*
ControlFlowInfo::postDominator(const BasicBlock* block) const
{
if (block->isExit()) {
return nullptr;
}
return basicBlock(lastInstruction(block)->ipdInstNum());
}
void
ControlFlowInfo::createBasicBlocks()
{
assert(!instructions.empty());
std::set<int> leaders;
// first instruction is a leader
leaders.insert(0);
for (int i = 1; i < instructions.size(); i++) {
GPUStaticInst* instruction = instructions[i];
if (instruction->o_type == Enums::OT_BRANCH) {
const int target_pc = instruction->getTargetPc();
leaders.insert(target_pc);
leaders.insert(i + 1);
}
}
size_t block_size = 0;
for (int i = 0; i < instructions.size(); i++) {
if (leaders.find(i) != leaders.end()) {
uint32_t id = basicBlocks.size();
if (id > 0) {
basicBlocks.back()->size = block_size;
}
block_size = 0;
basicBlocks.emplace_back(new BasicBlock(id, instructions[i]));
}
block_size++;
}
basicBlocks.back()->size = block_size;
// exit basic block
basicBlocks.emplace_back(new BasicBlock(basicBlocks.size(), nullptr));
}
void
ControlFlowInfo::connectBasicBlocks()
{
BasicBlock* exit_bb = basicBlocks.back().get();
for (auto& bb : basicBlocks) {
if (bb->isExit()) {
break;
}
GPUStaticInst* last = lastInstruction(bb.get());
if (last->o_type == Enums::OT_RET) {
bb->successorIds.insert(exit_bb->id);
break;
}
if (last->o_type == Enums::OT_BRANCH) {
const uint32_t target_pc = last->getTargetPc();
BasicBlock* target_bb = basicBlock(target_pc);
bb->successorIds.insert(target_bb->id);
}
// Unconditional jump instructions have a unique successor
if (!last->unconditionalJumpInstruction()) {
BasicBlock* next_bb = basicBlock(last->instNum() + 1);
bb->successorIds.insert(next_bb->id);
}
}
}
// In-place set intersection
static void
intersect(std::set<uint32_t>& a, const std::set<uint32_t>& b)
{
std::set<uint32_t>::iterator it = a.begin();
while (it != a.end()) {
it = b.find(*it) != b.end() ? ++it : a.erase(it);
}
}
void
ControlFlowInfo::findPostDominators()
{
// the only postdominator of the exit block is itself
basicBlocks.back()->postDominatorIds.insert(basicBlocks.back()->id);
//copy all basic blocks to all postdominator lists except for exit block
for (auto& block : basicBlocks) {
if (!block->isExit()) {
for (uint32_t i = 0; i < basicBlocks.size(); i++) {
block->postDominatorIds.insert(i);
}
}
}
bool change = true;
while (change) {
change = false;
for (int h = basicBlocks.size() - 2; h >= 0; --h) {
size_t num_postdominators =
basicBlocks[h]->postDominatorIds.size();
for (int s : basicBlocks[h]->successorIds) {
intersect(basicBlocks[h]->postDominatorIds,
basicBlocks[s]->postDominatorIds);
}
basicBlocks[h]->postDominatorIds.insert(h);
change |= (num_postdominators
!= basicBlocks[h]->postDominatorIds.size());
}
}
}
// In-place set difference
static void
setDifference(std::set<uint32_t>&a,
const std::set<uint32_t>& b, uint32_t exception)
{
for (uint32_t b_elem : b) {
if (b_elem != exception) {
a.erase(b_elem);
}
}
}
void
ControlFlowInfo::findImmediatePostDominators()
{
assert(basicBlocks.size() > 1); // Entry and exit blocks must be present
findPostDominators();
for (auto& basicBlock : basicBlocks) {
if (basicBlock->isExit()) {
continue;
}
std::set<uint32_t> candidates = basicBlock->postDominatorIds;
candidates.erase(basicBlock->id);
for (uint32_t postDominatorId : basicBlock->postDominatorIds) {
if (postDominatorId != basicBlock->id) {
setDifference(candidates,
basicBlocks[postDominatorId]->postDominatorIds,
postDominatorId);
}
}
assert(candidates.size() == 1);
GPUStaticInst* last_instruction = lastInstruction(basicBlock.get());
BasicBlock* ipd_block = basicBlocks[*(candidates.begin())].get();
if (!ipd_block->isExit()) {
GPUStaticInst* ipd_first_inst = ipd_block->firstInstruction;
last_instruction->ipdInstNum(ipd_first_inst->instNum());
} else {
last_instruction->ipdInstNum(last_instruction->instNum() + 1);
}
}
}
void
ControlFlowInfo::printPostDominators() const
{
for (auto& block : basicBlocks) {
std::cout << "PD(" << block->id << ") = {";
std::copy(block->postDominatorIds.begin(),
block->postDominatorIds.end(),
std::ostream_iterator<uint32_t>(std::cout, ", "));
std::cout << "}" << std::endl;
}
}
void
ControlFlowInfo::printImmediatePostDominators() const
{
for (const auto& block : basicBlocks) {
if (block->isExit()) {
continue;
}
std::cout << "IPD(" << block->id << ") = ";
std::cout << postDominator(block.get())->id << ", ";
}
std::cout << std::endl;
}
void
ControlFlowInfo::printBasicBlocks() const
{
for (GPUStaticInst* inst : instructions) {
int inst_num = inst->instNum();
std::cout << inst_num << " [" << basicBlock(inst_num)->id
<< "]: " << inst->disassemble();
if (inst->o_type == Enums::OT_BRANCH) {
std::cout << ", PC = " << inst->getTargetPc();
}
std::cout << std::endl;
}
}
void
ControlFlowInfo::printBasicBlockDot() const
{
printf("digraph {\n");
for (const auto& basic_block : basicBlocks) {
printf("\t");
for (uint32_t successorId : basic_block->successorIds) {
printf("%d -> %d; ", basic_block->id, successorId);
}
printf("\n");
}
printf("}\n");
}
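
findPostDominators() above is the standard backward dataflow fixpoint: every non-exit block starts with the full set of block ids, then repeatedly intersects that set with each successor's set and re-adds itself until nothing changes; findImmediatePostDominators() then isolates the nearest strict post-dominator, the one that all the others post-dominate. A self-contained sketch of the same fixpoint on a four-block diamond (block 0 branches to 1 and 2, which both fall through to the exit block 3), using plain std::set instead of the BasicBlock machinery:

    #include <cstdio>
    #include <set>
    #include <vector>

    int
    main()
    {
        // diamond CFG: 0 -> {1,2}, 1 -> {3}, 2 -> {3}, 3 is the exit block
        std::vector<std::set<unsigned>> succ = { {1, 2}, {3}, {3}, {} };
        const unsigned n = succ.size();

        std::vector<std::set<unsigned>> pdom(n);
        pdom[n - 1] = { n - 1 };              // the exit post-dominates only itself
        for (unsigned b = 0; b + 1 < n; ++b)  // all other blocks start "full"
            for (unsigned i = 0; i < n; ++i)
                pdom[b].insert(i);

        bool change = true;
        while (change) {
            change = false;
            for (int b = n - 2; b >= 0; --b) {
                std::set<unsigned> next;      // intersection of successors' sets
                bool first = true;
                for (unsigned s : succ[b]) {
                    if (first) { next = pdom[s]; first = false; continue; }
                    std::set<unsigned> tmp;
                    for (unsigned x : next)
                        if (pdom[s].count(x))
                            tmp.insert(x);
                    next = tmp;
                }
                next.insert(b);               // every block post-dominates itself
                if (next != pdom[b]) { pdom[b] = next; change = true; }
            }
        }

        // converges to PD(0)={0,3}, PD(1)={1,3}, PD(2)={2,3}, PD(3)={3}, so the
        // immediate post-dominator of blocks 0, 1 and 2 is the exit block 3
        for (unsigned b = 0; b < n; ++b) {
            std::printf("PD(%u) = {", b);
            for (unsigned x : pdom[b])
                std::printf(" %u", x);
            std::printf(" }\n");
        }
        return 0;
    }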

View file

@ -0,0 +1,133 @@
/*
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Steve Reinhardt
*/
#ifndef __KERNEL_CFG_HH__
#define __KERNEL_CFG_HH__
#include <cstddef>
#include <cstdint>
#include <memory>
#include <set>
#include <vector>
class GPUStaticInst;
class HsailCode;
struct BasicBlock
{
BasicBlock(uint32_t num, GPUStaticInst* begin) :
id(num), size(0), firstInstruction(begin)
{
}
bool
isEntry() const
{
return !id;
}
bool
isExit() const
{
return !size;
}
/**
* Unique identifier for the block within a given kernel.
*/
const uint32_t id;
/**
* Number of instructions contained in the block
*/
size_t size;
/**
* Pointer to first instruction of the block.
*/
GPUStaticInst* firstInstruction;
/**
* Identifiers of the blocks that follow (are reachable from) this block.
*/
std::set<uint32_t> successorIds;
/**
     * Identifiers of the blocks that post-dominate this block, i.e., blocks
     * that appear on every path from this block to the exit.
*/
std::set<uint32_t> postDominatorIds;
};
class ControlFlowInfo
{
public:
/**
* Compute immediate post-dominator instruction for kernel instructions.
*/
static void assignImmediatePostDominators(
const std::vector<GPUStaticInst*>& instructions);
private:
ControlFlowInfo(const std::vector<GPUStaticInst*>& instructions);
GPUStaticInst* lastInstruction(const BasicBlock* block) const;
BasicBlock* basicBlock(int inst_num) const;
BasicBlock* postDominator(const BasicBlock* block) const;
void createBasicBlocks();
void connectBasicBlocks();
void findPostDominators();
void findImmediatePostDominators();
void printBasicBlocks() const;
void printBasicBlockDot() const;
void printPostDominators() const;
void printImmediatePostDominators() const;
std::vector<std::unique_ptr<BasicBlock>> basicBlocks;
std::vector<GPUStaticInst*> instructions;
};
#endif // __KERNEL_CFG_HH__

View file

@ -0,0 +1,341 @@
/*
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: John Kalamatianos, Joe Gross
*/
#include "gpu-compute/lds_state.hh"
#include <array>
#include <cstdio>
#include <cstdlib>
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
/**
* the default constructor that works with SWIG
*/
LdsState::LdsState(const Params *params) :
MemObject(params),
tickEvent(this),
cuPort(name() + ".port", this),
maximumSize(params->size),
range(params->range),
bankConflictPenalty(params->bankConflictPenalty),
banks(params->banks)
{
fatal_if(params->banks <= 0,
"Number of LDS banks should be positive number");
fatal_if((params->banks & (params->banks - 1)) != 0,
"Number of LDS banks should be a power of 2");
fatal_if(params->size <= 0,
"cannot allocate an LDS with a size less than 1");
fatal_if(params->size % 2,
"the LDS should be an even number");
}
/**
* Needed by the SWIG compiler
*/
LdsState *
LdsStateParams::create()
{
return new LdsState(this);
}
/**
* set the parent and name based on the parent
*/
void
LdsState::setParent(ComputeUnit *x_parent)
{
// check that this gets assigned to the same thing each time
fatal_if(!x_parent, "x_parent should not be nullptr");
fatal_if(x_parent == parent,
"should not be setting the parent twice");
parent = x_parent;
_name = x_parent->name() + ".LdsState";
}
/**
* derive the gpu mem packet from the packet and then count the bank conflicts
*/
unsigned
LdsState::countBankConflicts(PacketPtr packet, unsigned *bankAccesses)
{
Packet::SenderState *baseSenderState = packet->senderState;
while (baseSenderState->predecessor) {
baseSenderState = baseSenderState->predecessor;
}
const ComputeUnit::LDSPort::SenderState *senderState =
dynamic_cast<ComputeUnit::LDSPort::SenderState *>(baseSenderState);
fatal_if(!senderState,
"did not get the right sort of sender state");
GPUDynInstPtr gpuDynInst = senderState->getMemInst();
return countBankConflicts(gpuDynInst, bankAccesses);
}
// Count the total number of bank conflicts for the local memory packet
unsigned
LdsState::countBankConflicts(GPUDynInstPtr gpuDynInst,
unsigned *numBankAccesses)
{
int bank_conflicts = 0;
std::vector<int> bank;
// the number of LDS banks being touched by the memory instruction
int numBanks = std::min(parent->wfSize(), banks);
// if the wavefront size is larger than the number of LDS banks, we
// need to iterate over all work items to calculate the total
// number of bank conflicts
int groups = (parent->wfSize() > numBanks) ?
(parent->wfSize() / numBanks) : 1;
for (int i = 0; i < groups; i++) {
// Address Array holding all the work item addresses of an instruction
std::vector<Addr> addr_array;
addr_array.resize(numBanks, 0);
bank.clear();
bank.resize(banks, 0);
int max_bank = 0;
// populate the address array for all active work items
for (int j = 0; j < numBanks; j++) {
if (gpuDynInst->exec_mask[(i*numBanks)+j]) {
addr_array[j] = gpuDynInst->addr[(i*numBanks)+j];
} else {
addr_array[j] = std::numeric_limits<Addr>::max();
}
}
if (gpuDynInst->m_op == Enums::MO_LD ||
gpuDynInst->m_op == Enums::MO_ST) {
// mask identical addresses
for (int j = 0; j < numBanks; ++j) {
for (int j0 = 0; j0 < j; j0++) {
if (addr_array[j] != std::numeric_limits<Addr>::max()
&& addr_array[j] == addr_array[j0]) {
addr_array[j] = std::numeric_limits<Addr>::max();
}
}
}
}
// calculate bank conflicts
for (int j = 0; j < numBanks; ++j) {
if (addr_array[j] != std::numeric_limits<Addr>::max()) {
int bankId = addr_array[j] % banks;
bank[bankId]++;
max_bank = std::max(max_bank, bank[bankId]);
// Count the number of LDS banks accessed.
// Since we have masked identical addresses all remaining
// accesses will need to be serialized if they access
// the same bank (bank conflict).
(*numBankAccesses)++;
}
}
bank_conflicts += max_bank;
}
panic_if(bank_conflicts > parent->wfSize(),
"Max bank conflicts should match num of work items per instr");
return bank_conflicts;
}
/**
* receive the packet from the CU
*/
bool
LdsState::CuSidePort::recvTimingReq(PacketPtr packet)
{
return ownerLds->processPacket(packet);
}
GPUDynInstPtr
LdsState::getDynInstr(PacketPtr packet)
{
ComputeUnit::LDSPort::SenderState *ss =
dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
packet->senderState);
return ss->getMemInst();
}
/**
* process an incoming packet, add it to the return queue
*/
bool
LdsState::processPacket(PacketPtr packet)
{
unsigned bankAccesses = 0;
// the number of conflicts this packet will have when accessing the LDS
unsigned bankConflicts = countBankConflicts(packet, &bankAccesses);
// count the total number of physical LDS bank accessed
parent->ldsBankAccesses += bankAccesses;
    // count the LDS bank conflicts. A value of 1 means at most one access
    // per bank, i.e., no bank conflicts occurred
parent->ldsBankConflictDist.sample(bankConflicts-1);
GPUDynInstPtr dynInst = getDynInstr(packet);
// account for the LDS bank conflict overhead
int busLength = (dynInst->m_op == Enums::MO_LD) ? parent->loadBusLength() :
(dynInst->m_op == Enums::MO_ST) ? parent->storeBusLength() :
parent->loadBusLength();
// delay for accessing the LDS
Tick processingTime =
parent->shader->ticks(bankConflicts * bankConflictPenalty) +
parent->shader->ticks(busLength);
// choose (delay + last packet in queue) or (now + delay) as the time to
// return this
Tick doneAt = earliestReturnTime() + processingTime;
// then store it for processing
return returnQueuePush(std::make_pair(doneAt, packet));
}
/**
* add this to the queue of packets to be returned
*/
bool
LdsState::returnQueuePush(std::pair<Tick, PacketPtr> thePair)
{
// TODO add time limits (e.g. one packet per cycle) and queue size limits
// and implement flow control
returnQueue.push(thePair);
// if there is no set wakeup time, look through the queue
if (!tickEvent.scheduled()) {
process();
}
return true;
}
/**
* receive a packet in functional mode
*/
void
LdsState::CuSidePort::recvFunctional(PacketPtr pkt)
{
fatal("not implemented");
}
/**
* receive a retry for a response
*/
void
LdsState::CuSidePort::recvRespRetry()
{
// TODO verify that this is the right way to do this
assert(ownerLds->isRetryResp());
ownerLds->setRetryResp(false);
ownerLds->process();
}
/**
* receive a retry
*/
void
LdsState::CuSidePort::recvRetry()
{
fatal("not implemented");
}
/**
* look for packets to return at this time
*/
bool
LdsState::process()
{
Tick now = clockEdge();
// send back completed packets
while (!returnQueue.empty() && returnQueue.front().first <= now) {
PacketPtr packet = returnQueue.front().second;
ComputeUnit::LDSPort::SenderState *ss =
dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
packet->senderState);
GPUDynInstPtr gpuDynInst = ss->getMemInst();
gpuDynInst->initiateAcc(gpuDynInst);
packet->makeTimingResponse();
returnQueue.pop();
bool success = cuPort.sendTimingResp(packet);
if (!success) {
retryResp = true;
panic("have not handled timing responses being NACK'd when sent"
"back");
}
}
// determine the next wakeup time
if (!returnQueue.empty()) {
Tick next = returnQueue.front().first;
if (tickEvent.scheduled()) {
if (next < tickEvent.when()) {
tickEvent.deschedule();
tickEvent.schedule(next);
}
} else {
tickEvent.schedule(next);
}
}
return true;
}
/**
* wake up at this time and perform specified actions
*/
void
LdsState::TickEvent::process()
{
ldsState->process();
}
/**
 * register per-LDS statistics (none are defined yet)
*/
void
LdsState::regStats()
{
}
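
To make countBankConflicts() above concrete: with 4 banks, lane addresses {0, 4, 8, 12} all map to bank 0, so max_bank is 4 and the group costs four serialized accesses, while {0, 1, 2, 3} spread across all four banks and max_bank is 1 (conflict-free). A small sketch of that per-group core, assuming every lane is active and identical addresses have already been masked out:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Illustrative only: the per-group core of countBankConflicts().
    static int
    maxAccessesToOneBank(const std::vector<unsigned long> &addrs, int banks)
    {
        std::vector<int> bank(banks, 0);
        int max_bank = 0;
        for (unsigned long a : addrs)
            max_bank = std::max(max_bank, ++bank[a % banks]);
        return max_bank;  // 1 means conflict-free, N means N serialized accesses
    }

    int
    main()
    {
        std::printf("%d\n", maxAccessesToOneBank({0, 4, 8, 12}, 4)); // prints 4
        std::printf("%d\n", maxAccessesToOneBank({0, 1, 2, 3}, 4));  // prints 1
        return 0;
    }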

View file

@ -0,0 +1,512 @@
/*
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: John Kalamatianos, Joe Gross
*/
#ifndef __LDS_STATE_HH__
#define __LDS_STATE_HH__
#include <array>
#include <queue>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "enums/MemOpType.hh"
#include "enums/MemType.hh"
#include "gpu-compute/misc.hh"
#include "mem/mem_object.hh"
#include "mem/port.hh"
#include "params/LdsState.hh"
class ComputeUnit;
/**
* this represents a slice of the overall LDS, intended to be associated with an
* individual workgroup
*/
class LdsChunk
{
public:
LdsChunk(const uint32_t x_size):
chunk(x_size)
{
}
LdsChunk() {}
/**
* a read operation
*/
template<class T>
T
read(const uint32_t index)
{
fatal_if(!chunk.size(), "cannot read from an LDS chunk of size 0");
fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk");
T *p0 = (T *) (&(chunk.at(index)));
return *p0;
}
/**
* a write operation
*/
template<class T>
void
write(const uint32_t index, const T value)
{
fatal_if(!chunk.size(), "cannot write to an LDS chunk of size 0");
fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk");
T *p0 = (T *) (&(chunk.at(index)));
*p0 = value;
}
/**
* get the size of this chunk
*/
std::vector<uint8_t>::size_type
size() const
{
return chunk.size();
}
protected:
// the actual data store for this slice of the LDS
std::vector<uint8_t> chunk;
};
// Local Data Share (LDS) State per Wavefront (contents of the LDS region
// allocated to the WorkGroup of this Wavefront)
class LdsState: public MemObject
{
protected:
/**
* an event to allow event-driven execution
*/
class TickEvent: public Event
{
protected:
LdsState *ldsState = nullptr;
Tick nextTick = 0;
public:
TickEvent(LdsState *_ldsState) :
ldsState(_ldsState)
{
}
virtual void
process();
void
schedule(Tick when)
{
mainEventQueue[0]->schedule(this, when);
}
void
deschedule()
{
mainEventQueue[0]->deschedule(this);
}
};
/**
* CuSidePort is the LDS Port closer to the CU side
*/
class CuSidePort: public SlavePort
{
public:
CuSidePort(const std::string &_name, LdsState *_ownerLds) :
SlavePort(_name, _ownerLds), ownerLds(_ownerLds)
{
}
protected:
LdsState *ownerLds;
virtual bool
recvTimingReq(PacketPtr pkt);
virtual Tick
recvAtomic(PacketPtr pkt)
{
return 0;
}
virtual void
recvFunctional(PacketPtr pkt);
virtual void
recvRangeChange()
{
}
virtual void
recvRetry();
virtual void
recvRespRetry();
virtual AddrRangeList
getAddrRanges() const
{
AddrRangeList ranges;
ranges.push_back(ownerLds->getAddrRange());
return ranges;
}
template<typename T>
void
loadData(PacketPtr packet);
template<typename T>
void
storeData(PacketPtr packet);
template<typename T>
void
atomicOperation(PacketPtr packet);
};
protected:
// the lds reference counter
// The key is the workgroup ID and dispatch ID
    // The value is the number of wavefronts that reference this LDS. As
    // wavefronts are launched, the counter for their workgroup goes up, and it
    // goes down as they return. Once it reaches 0, this chunk of the LDS is
    // returned to the available pool. However, it is deallocated on the 1->0
    // transition, not whenever the counter is 0, since the counter always
    // starts at 0 when the workgroup asks for space
std::unordered_map<uint32_t,
std::unordered_map<uint32_t, int32_t>> refCounter;
// the map that allows workgroups to access their own chunk of the LDS
std::unordered_map<uint32_t,
std::unordered_map<uint32_t, LdsChunk>> chunkMap;
// an event to allow the LDS to wake up at a specified time
TickEvent tickEvent;
// the queue of packets that are going back to the CU after a
// read/write/atomic op
// TODO need to make this have a maximum size to create flow control
std::queue<std::pair<Tick, PacketPtr>> returnQueue;
// whether or not there are pending responses
bool retryResp = false;
bool
process();
GPUDynInstPtr
getDynInstr(PacketPtr packet);
bool
processPacket(PacketPtr packet);
unsigned
countBankConflicts(PacketPtr packet, unsigned *bankAccesses);
unsigned
countBankConflicts(GPUDynInstPtr gpuDynInst,
unsigned *numBankAccesses);
public:
typedef LdsStateParams Params;
LdsState(const Params *params);
// prevent copy construction
LdsState(const LdsState&) = delete;
~LdsState()
{
parent = nullptr;
}
const Params *
params() const
{
return dynamic_cast<const Params *>(_params);
}
bool
isRetryResp() const
{
return retryResp;
}
void
setRetryResp(const bool value)
{
retryResp = value;
}
// prevent assignment
LdsState &
operator=(const LdsState &) = delete;
/**
* use the dynamic wave id to create or just increase the reference count
*/
int
increaseRefCounter(const uint32_t dispatchId, const uint32_t wgId)
{
int refCount = getRefCounter(dispatchId, wgId);
fatal_if(refCount < 0,
"reference count should not be below zero");
return ++refCounter[dispatchId][wgId];
}
/**
* decrease the reference count after making sure it is in the list
* give back this chunk if the ref counter has reached 0
*/
int
decreaseRefCounter(const uint32_t dispatchId, const uint32_t wgId)
{
int refCount = getRefCounter(dispatchId, wgId);
fatal_if(refCount <= 0,
"reference count should not be below zero or at zero to"
"decrement");
refCounter[dispatchId][wgId]--;
if (refCounter[dispatchId][wgId] == 0) {
releaseSpace(dispatchId, wgId);
return 0;
} else {
return refCounter[dispatchId][wgId];
}
}
/**
* return the current reference count for this workgroup id
*/
int
getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
{
auto dispatchIter = chunkMap.find(dispatchId);
fatal_if(dispatchIter == chunkMap.end(),
"could not locate this dispatch id [%d]", dispatchId);
auto workgroup = dispatchIter->second.find(wgId);
fatal_if(workgroup == dispatchIter->second.end(),
"could not find this workgroup id within this dispatch id"
" did[%d] wgid[%d]", dispatchId, wgId);
auto refCountIter = refCounter.find(dispatchId);
if (refCountIter == refCounter.end()) {
fatal("could not locate this dispatch id [%d]", dispatchId);
} else {
auto workgroup = refCountIter->second.find(wgId);
if (workgroup == refCountIter->second.end()) {
fatal("could not find this workgroup id within this dispatch id"
" did[%d] wgid[%d]", dispatchId, wgId);
} else {
return refCounter.at(dispatchId).at(wgId);
}
}
fatal("should not reach this point");
return 0;
}
/**
* assign a parent and request this amount of space be set aside
* for this wgid
*/
LdsChunk *
reserveSpace(const uint32_t dispatchId, const uint32_t wgId,
const uint32_t size)
{
if (chunkMap.find(dispatchId) != chunkMap.end()) {
fatal_if(
chunkMap[dispatchId].find(wgId) != chunkMap[dispatchId].end(),
"duplicate workgroup ID asking for space in the LDS "
"did[%d] wgid[%d]", dispatchId, wgId);
}
fatal_if(bytesAllocated + size > maximumSize,
"request would ask for more space than is available");
bytesAllocated += size;
chunkMap[dispatchId].emplace(wgId, LdsChunk(size));
// make an entry for this workgroup
refCounter[dispatchId][wgId] = 0;
return &chunkMap[dispatchId][wgId];
}
bool
returnQueuePush(std::pair<Tick, PacketPtr> thePair);
Tick
earliestReturnTime() const
{
// TODO set to max(lastCommand+1, curTick())
return returnQueue.empty() ? curTick() : returnQueue.back().first;
}
void
setParent(ComputeUnit *x_parent);
void
regStats();
// accessors
ComputeUnit *
getParent() const
{
return parent;
}
std::string
getName()
{
return _name;
}
int
getBanks() const
{
return banks;
}
ComputeUnit *
getComputeUnit() const
{
return parent;
}
int
getBankConflictPenalty() const
{
return bankConflictPenalty;
}
/**
* get the allocated size for this workgroup
*/
std::size_t
ldsSize(const uint32_t x_wgId)
{
return chunkMap[x_wgId].size();
}
AddrRange
getAddrRange() const
{
return range;
}
virtual BaseSlavePort &
getSlavePort(const std::string& if_name, PortID idx)
{
if (if_name == "cuPort") {
// TODO need to set name dynamically at this point?
return cuPort;
} else {
fatal("cannot resolve the port name " + if_name);
}
}
/**
* can this much space be reserved for a workgroup?
*/
bool
canReserve(uint32_t x_size) const
{
return bytesAllocated + x_size <= maximumSize;
}
private:
/**
* give back the space
*/
bool
releaseSpace(const uint32_t x_dispatchId, const uint32_t x_wgId)
{
auto dispatchIter = chunkMap.find(x_dispatchId);
if (dispatchIter == chunkMap.end()) {
fatal("dispatch id not found [%d]", x_dispatchId);
} else {
auto workgroupIter = dispatchIter->second.find(x_wgId);
if (workgroupIter == dispatchIter->second.end()) {
fatal("workgroup id [%d] not found in dispatch id [%d]",
x_wgId, x_dispatchId);
}
}
fatal_if(bytesAllocated < chunkMap[x_dispatchId][x_wgId].size(),
"releasing more space than was allocated");
bytesAllocated -= chunkMap[x_dispatchId][x_wgId].size();
chunkMap[x_dispatchId].erase(chunkMap[x_dispatchId].find(x_wgId));
return true;
}
// the port that connects this LDS to its owner CU
CuSidePort cuPort;
ComputeUnit* parent = nullptr;
std::string _name;
// the number of bytes currently reserved by all workgroups
int bytesAllocated = 0;
// the size of the LDS, the most bytes available
int maximumSize;
// Address range of this memory
AddrRange range;
// the penalty, in cycles, for each LDS bank conflict
int bankConflictPenalty = 0;
// the number of banks in the LDS underlying data store
int banks = 0;
};
#endif // __LDS_STATE_HH__
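
Putting the LdsState interface together, a dispatcher would first check canReserve(), carve out an LdsChunk with reserveSpace() keyed by (dispatch id, workgroup id), bump the reference count once per launched wavefront, and let the final decreaseRefCounter() call release the chunk on the 1->0 transition. A hedged sketch of that lifecycle follows; launchWorkgroup and its arguments are hypothetical, and error handling plus the parent compute-unit wiring are omitted.

    // Illustrative only: the allocate/use/release protocol a dispatcher follows.
    void
    launchWorkgroup(LdsState *lds, uint32_t dispatchId, uint32_t wgId,
                    uint32_t ldsBytes, int numWavefronts)
    {
        if (!lds->canReserve(ldsBytes))
            return;                             // not enough LDS left, try later

        LdsChunk *chunk = lds->reserveSpace(dispatchId, wgId, ldsBytes);
        chunk->write<uint32_t>(0, 0xdeadbeef);  // workgroup-private scratch

        for (int wf = 0; wf < numWavefronts; ++wf)
            lds->increaseRefCounter(dispatchId, wgId);

        // ... wavefronts execute; as each one completes it gives its reference
        // back, and the chunk is freed on the final 1->0 transition ...
        for (int wf = 0; wf < numWavefronts; ++wf)
            lds->decreaseRefCounter(dispatchId, wgId);
    }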

View file

@ -0,0 +1,200 @@
/*
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Sooraj Puthoor
*/
#include "gpu-compute/local_memory_pipeline.hh"
#include "debug/GPUPort.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
LocalMemPipeline::LocalMemPipeline(const ComputeUnitParams* p) :
computeUnit(nullptr), lmQueueSize(p->local_mem_queue_size)
{
}
void
LocalMemPipeline::init(ComputeUnit *cu)
{
computeUnit = cu;
_name = computeUnit->name() + ".LocalMemPipeline";
}
void
LocalMemPipeline::exec()
{
// apply any returned shared (LDS) memory operations
GPUDynInstPtr m = !lmReturnedRequests.empty() ?
lmReturnedRequests.front() : nullptr;
bool accessVrf = true;
if ((m) && (m->m_op==Enums::MO_LD || MO_A(m->m_op))) {
Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
accessVrf =
w->computeUnit->vrf[m->simdId]->
vrfOperandAccessReady(m->seqNum(), w, m,
VrfAccessType::WRITE);
}
if (!lmReturnedRequests.empty() && m->latency.rdy() && accessVrf &&
computeUnit->locMemToVrfBus.rdy() && (computeUnit->shader->coissue_return
|| computeUnit->wfWait.at(m->pipeId).rdy())) {
if (m->v_type == VT_32 && m->m_type == Enums::M_U8)
doSmReturn<uint32_t, uint8_t>(m);
else if (m->v_type == VT_32 && m->m_type == Enums::M_U16)
doSmReturn<uint32_t, uint16_t>(m);
else if (m->v_type == VT_32 && m->m_type == Enums::M_U32)
doSmReturn<uint32_t, uint32_t>(m);
else if (m->v_type == VT_32 && m->m_type == Enums::M_S8)
doSmReturn<int32_t, int8_t>(m);
else if (m->v_type == VT_32 && m->m_type == Enums::M_S16)
doSmReturn<int32_t, int16_t>(m);
else if (m->v_type == VT_32 && m->m_type == Enums::M_S32)
doSmReturn<int32_t, int32_t>(m);
else if (m->v_type == VT_32 && m->m_type == Enums::M_F16)
doSmReturn<float, Float16>(m);
else if (m->v_type == VT_32 && m->m_type == Enums::M_F32)
doSmReturn<float, float>(m);
else if (m->v_type == VT_64 && m->m_type == Enums::M_U8)
doSmReturn<uint64_t, uint8_t>(m);
else if (m->v_type == VT_64 && m->m_type == Enums::M_U16)
doSmReturn<uint64_t, uint16_t>(m);
else if (m->v_type == VT_64 && m->m_type == Enums::M_U32)
doSmReturn<uint64_t, uint32_t>(m);
else if (m->v_type == VT_64 && m->m_type == Enums::M_U64)
doSmReturn<uint64_t, uint64_t>(m);
else if (m->v_type == VT_64 && m->m_type == Enums::M_S8)
doSmReturn<int64_t, int8_t>(m);
else if (m->v_type == VT_64 && m->m_type == Enums::M_S16)
doSmReturn<int64_t, int16_t>(m);
else if (m->v_type == VT_64 && m->m_type == Enums::M_S32)
doSmReturn<int64_t, int32_t>(m);
else if (m->v_type == VT_64 && m->m_type == Enums::M_S64)
doSmReturn<int64_t, int64_t>(m);
else if (m->v_type == VT_64 && m->m_type == Enums::M_F16)
doSmReturn<double, Float16>(m);
else if (m->v_type == VT_64 && m->m_type == Enums::M_F32)
doSmReturn<double, float>(m);
else if (m->v_type == VT_64 && m->m_type == Enums::M_F64)
doSmReturn<double, double>(m);
}
// If the pipeline has executed a local memory instruction,
// issue the corresponding packet to the LDS
if (!lmIssuedRequests.empty() && lmReturnedRequests.size() < lmQueueSize) {
GPUDynInstPtr m = lmIssuedRequests.front();
bool returnVal = computeUnit->sendToLds(m);
if (!returnVal) {
DPRINTF(GPUPort, "packet was nack'd and put in retry queue");
}
lmIssuedRequests.pop();
}
}
template<typename c0, typename c1>
void
LocalMemPipeline::doSmReturn(GPUDynInstPtr m)
{
lmReturnedRequests.pop();
Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
// Return data to registers
if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) {
std::vector<uint32_t> regVec;
for (int k = 0; k < m->n_reg; ++k) {
int dst = m->dst_reg+k;
if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
dst = m->dst_reg_vec[k];
// virtual->physical VGPR mapping
int physVgpr = w->remap(dst,sizeof(c0),1);
// save the physical VGPR index
regVec.push_back(physVgpr);
c1 *p1 = &((c1*)m->d_data)[k * VSZ];
for (int i = 0; i < VSZ; ++i) {
if (m->exec_mask[i]) {
// write the value into the physical VGPR. This is a purely
// functional operation. No timing is modeled.
w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
*p1, i);
}
++p1;
}
}
// Schedule the write operation of the load data on the VRF. This simply
// models the timing aspect of the VRF write operation. It does not
// modify the physical VGPR.
loadVrfBankConflictCycles +=
w->computeUnit->vrf[w->simdId]->exec(m->seqNum(), w,
regVec, sizeof(c0), m->time);
}
// Decrement outstanding request count
computeUnit->shader->ScheduleAdd(&w->outstanding_reqs, m->time, -1);
if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op)
|| MO_H(m->m_op)) {
computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_wr_lm,
m->time, -1);
}
if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) {
computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_rd_lm,
m->time, -1);
}
// Mark write bus busy for appropriate amount of time
computeUnit->locMemToVrfBus.set(m->time);
if (computeUnit->shader->coissue_return == 0)
w->computeUnit->wfWait.at(m->pipeId).set(m->time);
}
void
LocalMemPipeline::regStats()
{
loadVrfBankConflictCycles
.name(name() + ".load_vrf_bank_conflict_cycles")
.desc("total number of cycles LDS data are delayed before updating "
"the VRF")
;
}

View file

@ -0,0 +1,98 @@
/*
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Sooraj Puthoor
*/
#ifndef __LOCAL_MEMORY_PIPELINE_HH__
#define __LOCAL_MEMORY_PIPELINE_HH__
#include <queue>
#include <string>
#include "gpu-compute/misc.hh"
#include "params/ComputeUnit.hh"
#include "sim/stats.hh"
/*
* @file local_memory_pipeline.hh
*
* The local memory pipeline issues newly created local memory packets
* from the pipeline to the LDS. This stage also retires previously issued
* loads and stores that have returned from the LDS.
*/
class ComputeUnit;
class Wavefront;
class LocalMemPipeline
{
public:
LocalMemPipeline(const ComputeUnitParams *params);
void init(ComputeUnit *cu);
void exec();
template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr m);
std::queue<GPUDynInstPtr> &getLMReqFIFO() { return lmIssuedRequests; }
std::queue<GPUDynInstPtr> &getLMRespFIFO() { return lmReturnedRequests; }
bool
isLMRespFIFOWrRdy() const
{
return lmReturnedRequests.size() < lmQueueSize;
}
bool
isLMReqFIFOWrRdy(uint32_t pendReqs=0) const
{
return (lmIssuedRequests.size() + pendReqs) < lmQueueSize;
}
const std::string& name() const { return _name; }
void regStats();
private:
ComputeUnit *computeUnit;
std::string _name;
int lmQueueSize;
Stats::Scalar loadVrfBankConflictCycles;
// Local Memory Request Fifo: all shared memory requests
// are issued to this FIFO from the memory pipelines
std::queue<GPUDynInstPtr> lmIssuedRequests;
// Local Memory Response Fifo: all responses of shared memory
// requests are sent to this FIFO from LDS
std::queue<GPUDynInstPtr> lmReturnedRequests;
};
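/*
 * Illustrative sketch (not part of this class): a producing stage would
 * typically guard its enqueue with the FIFO-ready check before handing a
 * local memory instruction to this pipeline. Names below are hypothetical.
 *
 *     if (localMemPipe.isLMReqFIFOWrRdy()) {
 *         localMemPipe.getLMReqFIFO().push(gpuDynInst);
 *     }
 */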
#endif // __LOCAL_MEMORY_PIPELINE_HH__

162
src/gpu-compute/misc.hh Normal file
View file

@ -0,0 +1,162 @@
/*
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Steve Reinhardt
*/
#ifndef __MISC_HH__
#define __MISC_HH__
#include <bitset>
#include <memory>
#include "base/misc.hh"
class GPUDynInst;
// wavefront size of the machine
static const int VSZ = 64;
/*
This check is necessary because std::bitset only provides conversion to
unsigned long or unsigned long long via to_ulong() or to_ullong(). There are
a few places in the code where to_ullong() is used; however, if VSZ is larger
than the host can support, bitset will throw a runtime exception. We should
remove all uses of to_ulong() and to_ullong() so we can have VSZ greater
than 64b, but until that is done this assert is required.
*/
static_assert(VSZ <= sizeof(unsigned long long) * 8,
"VSZ is larger than the host can support");
typedef std::bitset<VSZ> VectorMask;
typedef std::shared_ptr<GPUDynInst> GPUDynInstPtr;
class WaitClass
{
public:
WaitClass() : nxtAvail(0), lookAheadAvail(0), tcnt(0) { }
void init(uint64_t *_tcnt, uint32_t _numStages=0)
{
tcnt = _tcnt;
numStages = _numStages;
}
void set(uint32_t i)
{
fatal_if(nxtAvail > *tcnt,
"Can't allocate resource because it is busy!!!");
nxtAvail = *tcnt + i;
}
void preset(uint32_t delay)
{
lookAheadAvail = std::max(lookAheadAvail, delay + (*tcnt) - numStages);
}
bool rdy() const { return *tcnt >= nxtAvail; }
bool prerdy() const { return *tcnt >= lookAheadAvail; }
private:
// timestamp indicating when resource will be available
uint64_t nxtAvail;
// timestamp indicating when resource will be available including
// pending uses of the resource (when there is a cycle gap between
// rdy() and set())
uint64_t lookAheadAvail;
// current timestamp
uint64_t *tcnt;
// number of stages between checking if a resource is ready and
// setting the resource's utilization
uint32_t numStages;
};
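/*
 * Rough usage sketch for WaitClass (names hypothetical): the resource is
 * initialized with a pointer to a shared tick counter, polled with rdy(),
 * and then charged with a latency via set() once it is used.
 *
 *     WaitClass bus;
 *     bus.init(&shader->tick_cnt);   // share the global cycle counter
 *     if (bus.rdy()) {
 *         bus.set(busLatency);       // busy for busLatency ticks from now
 *     }
 */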
class Float16
{
public:
uint16_t val;
Float16() { val = 0; }
Float16(const Float16 &x) : val(x.val) { }
Float16(float x)
{
uint32_t ai = *(uint32_t *)&x;
uint32_t s = (ai >> 31) & 0x1;
uint32_t exp = (ai >> 23) & 0xff;
uint32_t mant = (ai >> 0) & 0x7fffff;
if (exp == 0 || exp <= 0x70) {
exp = 0;
mant = 0;
} else if (exp == 0xff) {
exp = 0x1f;
} else if (exp >= 0x8f) {
exp = 0x1f;
mant = 0;
} else {
exp = exp - 0x7f + 0x0f;
}
mant = mant >> 13;
val = 0;
val |= (s << 15);
val |= (exp << 10);
val |= (mant << 0);
}
operator float() const
{
uint32_t s = (val >> 15) & 0x1;
uint32_t exp = (val >> 10) & 0x1f;
uint32_t mant = (val >> 0) & 0x3ff;
if (!exp) {
exp = 0;
mant = 0;
} else if (exp == 0x1f) {
exp = 0xff;
} else {
exp = exp - 0x0f + 0x7f;
}
uint32_t val1 = 0;
val1 |= (s << 31);
val1 |= (exp << 23);
val1 |= (mant << 13);
return *(float*)&val1;
}
};
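/*
 * Rough usage sketch: Float16 holds an IEEE-754 binary16 bit pattern and
 * the conversions simply truncate the mantissa, so a round trip loses
 * precision and values below the half-precision range flush to zero.
 *
 *     Float16 h(3.14159f);
 *     float f = h;   // ~3.1406, mantissa truncated to 10 bits
 */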
#endif // __MISC_HH__

View file

@ -0,0 +1,70 @@
/*
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Steve Reinhardt
*/
#ifndef __NDRANGE_HH__
#define __NDRANGE_HH__
#include "base/types.hh"
#include "gpu-compute/qstruct.hh"
struct NDRange
{
// copy of the queue entry provided at dispatch
HsaQueueEntry q;
// The current workgroup id (3 dimensions)
int wgId[3];
// The number of workgroups in each dimension
int numWg[3];
// The total number of workgroups
int numWgTotal;
// The number of completed work groups
int numWgCompleted;
// The global workgroup ID
uint32_t globalWgId;
// flag indicating whether all work groups have been launched
bool wg_disp_rem;
// kernel complete
bool execDone;
bool userDoorBellSet;
volatile bool *addrToNotify;
volatile uint32_t *numDispLeft;
int dispatchId;
int curTid; // Current thread id
};
#endif // __NDRANGE_HH__

View file

@ -0,0 +1,76 @@
/*
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Sooraj Puthoor
*/
#include "gpu-compute/of_scheduling_policy.hh"
#include "gpu-compute/wavefront.hh"
Wavefront*
OFSchedulingPolicy::chooseWave()
{
// Set when the policy chooses a wave to schedule
bool waveChosen = false;
Wavefront *selectedWave = nullptr;
int selectedWaveID = -1;
uint32_t selectedPosition = 0;
for (int position = 0; position < scheduleList->size(); ++position) {
Wavefront *curWave = scheduleList->at(position);
uint32_t curWaveID = curWave->wfDynId;
// Choose the wave with the lowest wave ID
if (selectedWaveID == -1 || curWaveID < selectedWaveID) {
waveChosen = true;
selectedWaveID = curWaveID;
selectedWave = curWave;
selectedPosition = position;
}
}
// Check to make sure the ready list had at least one schedulable wave
if (waveChosen) {
scheduleList->erase(scheduleList->begin() + selectedPosition);
} else {
panic("Empty ready list");
}
return selectedWave;
}
void
OFSchedulingPolicy::bindList(std::vector<Wavefront*> *list)
{
scheduleList = list;
}

View file

@ -0,0 +1,61 @@
/*
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Sooraj Puthoor
*/
#ifndef __OF_SCHEDULING_POLICY_HH__
#define __OF_SCHEDULING_POLICY_HH__
#include <cstddef>
#include <vector>
#include "base/misc.hh"
class Wavefront;
// Oldest First where age is marked by the wave id
class OFSchedulingPolicy
{
public:
OFSchedulingPolicy() : scheduleList(nullptr) { }
Wavefront* chooseWave();
void bindList(std::vector<Wavefront*> *list);
private:
// List of waves which are participating in scheduling.
// This scheduler selects the oldest wave from this list
std::vector<Wavefront*> *scheduleList;
};
#endif // __OF_SCHEDULING_POLICY_HH__

View file

@ -0,0 +1,42 @@
/*
* Copyright (c) 2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: John Kalamatianos
*/
#include "gpu-compute/pool_manager.hh"
PoolManager::PoolManager(uint32_t minAlloc, uint32_t poolSize)
: _minAllocation(minAlloc), _poolSize(poolSize)
{
assert(poolSize > 0);
}

View file

@ -0,0 +1,66 @@
/*
* Copyright (c) 2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: John Kalamatianos
*/
#ifndef __POOL_MANAGER_HH__
#define __POOL_MANAGER_HH__
#include <cassert>
#include <cstdint>
#include <string>
// Pool Manager Logic
class PoolManager
{
public:
PoolManager(uint32_t minAlloc, uint32_t poolSize);
uint32_t minAllocation() { return _minAllocation; }
virtual std::string printRegion() = 0;
virtual uint32_t regionSize(std::pair<uint32_t,uint32_t> &region) = 0;
virtual bool canAllocate(uint32_t numRegions, uint32_t size) = 0;
virtual uint32_t allocateRegion(const uint32_t size,
uint32_t *reserved) = 0;
virtual void freeRegion(uint32_t firstIdx, uint32_t lastIdx) = 0;
uint32_t poolSize() { return _poolSize; }
private:
// minimum size that can be reserved per allocation
uint32_t _minAllocation;
// pool size in number of elements
uint32_t _poolSize;
};
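/*
 * PoolManager only defines the allocation interface; a concrete manager
 * supplies the policy. A minimal hypothetical subclass could look like the
 * sketch below (only one override shown, the rest elided).
 *
 *     class GreedyPoolManager : public PoolManager
 *     {
 *       public:
 *         GreedyPoolManager(uint32_t minAlloc, uint32_t poolSize)
 *             : PoolManager(minAlloc, poolSize) { }
 *         bool canAllocate(uint32_t numRegions, uint32_t size) override
 *         { return numRegions * size <= poolSize(); }
 *         // printRegion(), regionSize(), allocateRegion(), freeRegion() ...
 *     };
 */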
#endif // __POOL_MANAGER_HH__

201
src/gpu-compute/qstruct.hh Normal file
View file

@ -0,0 +1,201 @@
/*
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Brad Beckmann, Marc Orr
*/
#ifndef __Q_STRUCT_HH__
#define __Q_STRUCT_HH__
#include <bitset>
#include <cstdint>
// Maximum number of arguments
static const int KER_NUM_ARGS = 32;
// Kernel argument buffer size
static const int KER_ARGS_LENGTH = 512;
class LdsChunk;
struct NDRange;
// Be very careful of alignment in this structure. The structure
// must compile to the same layout in both 32-bit and 64-bit mode.
struct HsaQueueEntry
{
// Base pointer for array of instruction pointers
uint64_t code_ptr;
// Grid Size (3 dimensions)
uint32_t gdSize[3];
// Workgroup Size (3 dimensions)
uint32_t wgSize[3];
uint16_t sRegCount;
uint16_t dRegCount;
uint16_t cRegCount;
uint64_t privMemStart;
uint32_t privMemPerItem;
uint32_t privMemTotal;
uint64_t spillMemStart;
uint32_t spillMemPerItem;
uint32_t spillMemTotal;
uint64_t roMemStart;
uint32_t roMemTotal;
// Size (in bytes) of LDS
uint32_t ldsSize;
// Virtual Memory Id (unused right now)
uint32_t vmId;
// Pointer to dependency chain (unused now)
uint64_t depends;
// pointer to bool
uint64_t addrToNotify;
// pointer to uint32_t
uint64_t numDispLeft;
// variables to pass arguments when running in standalone mode,
// will be removed when run.py and sh.cpp have been updated to
// use args and offset arrays
uint64_t arg1;
uint64_t arg2;
uint64_t arg3;
uint64_t arg4;
// variables to pass arguments when running in cpu+gpu mode
uint8_t args[KER_ARGS_LENGTH];
uint16_t offsets[KER_NUM_ARGS];
uint16_t num_args;
};
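/*
 * Because this struct must compile to the same layout in 32-bit and 64-bit
 * mode, keeping every uint64_t member at an 8-byte-aligned offset is what
 * makes the two layouts agree. A compile-time guard is one way to catch
 * accidental changes; the sketch below only checks the trivially known
 * first member, and a real check would pin the full agreed-upon layout.
 *
 *     static_assert(offsetof(HsaQueueEntry, code_ptr) == 0,
 *                   "HsaQueueEntry layout changed");
 */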
// State used to start (or restart) a WF
struct WFContext
{
// 32 bit values
// barrier state
int bar_cnt[VSZ];
// id (which WF in the WG)
int cnt;
// more barrier state
int max_bar_cnt;
int old_barrier_cnt;
int barrier_cnt;
// More Program Counter Stuff
uint32_t pc;
// Program counter of the immediate post-dominator instruction
uint32_t rpc;
// WG wide state (I don't see how to avoid redundancy here)
int cu_id;
uint32_t wg_id;
uint32_t barrier_id;
// 64 bit values (these values depend on the wavefront size)
// masks
uint64_t init_mask;
uint64_t exec_mask;
// private memory;
Addr privBase;
Addr spillBase;
LdsChunk *ldsChunk;
/*
* Kernel wide state
* This is a hack. This state should be moved through simulated memory
* during a yield. Not much is being used here, though, so it's probably
* not a big deal.
*
* Just to add to this comment... The ndr is derived from simulated
* memory when the cl-runtime allocates an HsaQueueEntry and populates it
* for a kernel launch. So in theory the runtime should be able to keep
* that state around. Then a WF can reference it upon restart to derive
* kernel wide state. The runtime can deallocate the state when the
* kernel completes.
*/
NDRange *ndr;
};
// State that needs to be passed between the simulation and simulated app, a
// pointer to this struct can be passed through the depends field in the
// HsaQueueEntry struct
struct HostState
{
// cl_event* has original HsaQueueEntry for init
uint64_t event;
};
// Total number of HSA queues
static const int HSAQ_NQUEUES = 8;
// These values will eventually live in memory mapped registers
// and be settable by the kernel mode driver.
// Number of entries in each HSA queue
static const int HSAQ_SIZE = 64;
// Address of first HSA queue index
static const int HSAQ_INDX_BASE = 0x10000ll;
// Address of first HSA queue
static const int HSAQ_BASE = 0x11000ll;
// Suggested start of HSA code
static const int HSA_CODE_BASE = 0x18000ll;
// These are shortcuts for deriving the address of a specific
// HSA queue or queue index
#define HSAQ(n) (HSAQ_BASE + HSAQ_SIZE * sizeof(struct fsaQueue) * n)
#define HSAQE(n,i) (HSAQ_BASE + (HSAQ_SIZE * n + i) * sizeof(struct fsaQueue))
#define HSAQ_RI(n) (HSAQ_INDX_BASE + sizeof(int) * (n * 3 + 0))
#define HSAQ_WI(n) (HSAQ_INDX_BASE + sizeof(int) * (n * 3 + 1))
#define HSAQ_CI(n) (HSAQ_INDX_BASE + sizeof(int) * (n * 3 + 2))
/*
* Example code for writing to a queue
*
* void
* ToQueue(int n,struct fsaQueue *val)
* {
* int wi = *(int*)HSAQ_WI(n);
* int ri = *(int*)HSAQ_RI(n);
* int ci = *(int*)HSAQ_CI(n);
*
* if (ci - ri < HSAQ_SIZE) {
* (*(int*)HSAQ_CI(n))++;
* *(HsaQueueEntry*)(HSAQE(n, (wi % HSAQ_SIZE))) = *val;
* (*(int*)HSAQ_WI(n))++;
* }
* }
*/
#endif // __Q_STRUCT_HH__

View file

@ -0,0 +1,67 @@
/*
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Sooraj Puthoor
*/
#include "gpu-compute/rr_scheduling_policy.hh"
#include "gpu-compute/wavefront.hh"
Wavefront*
RRSchedulingPolicy::chooseWave()
{
Wavefront *selectedWave = nullptr;
// Check to make sure the ready list had at least one schedulable wave
if (scheduleList->size()) {
// For RR policy, select the wave which is at the
// front of the list. The selected wave is popped
// out from the schedule list immediately after selection
// to avoid starvation. It is the responsibility of the
// module invoking the RR scheduler to make sure scheduling
// eligible waves are added to the back of the schedule
// list
selectedWave = scheduleList->front();
scheduleList->erase(scheduleList->begin() + 0);
} else {
panic("Empty ready list");
}
return selectedWave;
}
void
RRSchedulingPolicy::bindList(std::vector<Wavefront*> *list)
{
scheduleList = list;
}

View file

@ -0,0 +1,65 @@
/*
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Sooraj Puthoor
*/
#ifndef __RR_SCHEDULING_POLICY_HH__
#define __RR_SCHEDULING_POLICY_HH__
#include <inttypes.h>
#include <cstddef>
#include <utility>
#include <vector>
#include "base/misc.hh"
class Wavefront;
// Round-Robin pick among the list of ready waves
class RRSchedulingPolicy
{
public:
RRSchedulingPolicy() : scheduleList(nullptr) { }
Wavefront* chooseWave();
void bindList(std::vector<Wavefront*> *list);
private:
// List of waves which are participating in scheduling.
// This scheduler selects one wave from this list based on
// round robin policy
std::vector<Wavefront*> *scheduleList;
};
#endif // __RR_SCHEDULING_POLICY_HH__

View file

@ -0,0 +1,151 @@
/*
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Sooraj Puthoor
*/
#include "gpu-compute/schedule_stage.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
ScheduleStage::ScheduleStage(const ComputeUnitParams *p)
: numSIMDs(p->num_SIMDs),
numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes)
{
for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
Scheduler newScheduler(p);
scheduler.push_back(newScheduler);
}
}
ScheduleStage::~ScheduleStage()
{
scheduler.clear();
waveStatusList.clear();
}
void
ScheduleStage::init(ComputeUnit *cu)
{
computeUnit = cu;
_name = computeUnit->name() + ".ScheduleStage";
for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
scheduler[j].bindList(&computeUnit->readyList[j]);
}
for (int j = 0; j < numSIMDs; ++j) {
waveStatusList.push_back(&computeUnit->waveStatusList[j]);
}
dispatchList = &computeUnit->dispatchList;
}
void
ScheduleStage::arbitrate()
{
// iterate over all Memory pipelines
for (int j = numSIMDs; j < numSIMDs + numMemUnits; ++j) {
if (dispatchList->at(j).first) {
Wavefront *waveToMemPipe = dispatchList->at(j).first;
// iterate over all execution pipelines
for (int i = 0; i < numSIMDs + numMemUnits; ++i) {
if ((i != j) && (dispatchList->at(i).first)) {
Wavefront *waveToExePipe = dispatchList->at(i).first;
// if the two selected wavefronts are mapped to the same
// SIMD unit then they share the VRF
if (waveToMemPipe->simdId == waveToExePipe->simdId) {
int simdId = waveToMemPipe->simdId;
// Read VRF port arbitration:
// If there are read VRF port conflicts between a
// memory instruction and another instruction, we drop the other
// instruction. We don't need to check for write VRF
// port conflicts because the memory instruction either
// does not need to write to the VRF (store) or will
// write to the VRF when the data comes back (load) in
// which case the arbiter of the memory pipes will
// resolve any conflicts
if (computeUnit->vrf[simdId]->
isReadConflict(waveToMemPipe->wfSlotId,
waveToExePipe->wfSlotId)) {
// FIXME: The "second" member variable is never
// used in the model. I am setting it to READY
// simply to follow the protocol of setting it
// when the WF has an instruction ready to issue
waveStatusList[simdId]->at(waveToExePipe->wfSlotId)
.second = READY;
dispatchList->at(i).first = nullptr;
dispatchList->at(i).second = EMPTY;
break;
}
}
}
}
}
}
}
void
ScheduleStage::exec()
{
for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
uint32_t readyListSize = computeUnit->readyList[j].size();
// If no wave is ready to be scheduled on the execution resource
// then skip scheduling for this execution resource
if (!readyListSize) {
continue;
}
Wavefront *waveToBeDispatched = scheduler[j].chooseWave();
dispatchList->at(j).first = waveToBeDispatched;
waveToBeDispatched->updateResources();
dispatchList->at(j).second = FILLED;
waveStatusList[waveToBeDispatched->simdId]->at(
waveToBeDispatched->wfSlotId).second = BLOCKED;
assert(computeUnit->readyList[j].size() == readyListSize - 1);
}
// arbitrate over all shared resources among instructions being issued
// simultaneously
arbitrate();
}
void
ScheduleStage::regStats()
{
}

View file

@ -0,0 +1,95 @@
/*
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Sooraj Puthoor
*/
#ifndef __SCHEDULE_STAGE_HH__
#define __SCHEDULE_STAGE_HH__
#include <utility>
#include <vector>
#include "gpu-compute/exec_stage.hh"
#include "gpu-compute/scheduler.hh"
#include "gpu-compute/scoreboard_check_stage.hh"
// Schedule or execution arbitration stage.
// From the pool of ready waves in the ready list,
// one wave is selected for each execution resource.
// The selection is made based on a scheduling policy
class ComputeUnit;
class Wavefront;
struct ComputeUnitParams;
class ScheduleStage
{
public:
ScheduleStage(const ComputeUnitParams *params);
~ScheduleStage();
void init(ComputeUnit *cu);
void exec();
void arbitrate();
// Stats related variables and methods
std::string name() { return _name; }
void regStats();
private:
ComputeUnit *computeUnit;
uint32_t numSIMDs;
uint32_t numMemUnits;
// Each execution resource will have its own
// scheduler and a dispatch list
std::vector<Scheduler> scheduler;
// Stores the status of waves. A READY implies the
// wave is ready to be scheduled this cycle and
// is already present in the readyList
std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>*>
waveStatusList;
// List of waves which will be dispatched to
// each execution resource. A FILLED implies
// dispatch list is non-empty and
// execution unit has something to execute
// this cycle. Currently, the dispatch list of
// an execution resource can hold only one wave because
// an execution resource can execute only one wave in a cycle.
std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList;
std::string _name;
};
#endif // __SCHEDULE_STAGE_HH__

View file

@ -0,0 +1,71 @@
/*
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Sooraj Puthoor
*/
#include "gpu-compute/scheduler.hh"
Scheduler::Scheduler(const ComputeUnitParams *p)
{
if (p->execPolicy == "OLDEST-FIRST") {
schedPolicy = SCHED_POLICY::OF_POLICY;
} else if (p->execPolicy == "ROUND-ROBIN") {
schedPolicy = SCHED_POLICY::RR_POLICY;
} else {
fatal("Unimplemented scheduling policy");
}
}
Wavefront*
Scheduler::chooseWave()
{
if (schedPolicy == SCHED_POLICY::OF_POLICY) {
return OFSchedPolicy.chooseWave();
} else if (schedPolicy == SCHED_POLICY::RR_POLICY) {
return RRSchedPolicy.chooseWave();
} else {
fatal("Unimplemented scheduling policy");
}
}
void
Scheduler::bindList(std::vector<Wavefront*> *list)
{
if (schedPolicy == SCHED_POLICY::OF_POLICY) {
OFSchedPolicy.bindList(list);
} else if (schedPolicy == SCHED_POLICY::RR_POLICY) {
RRSchedPolicy.bindList(list);
} else {
fatal("Unimplemented scheduling policy");
}
}

View file

@ -0,0 +1,63 @@
/*
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Sooraj Puthoor
*/
#ifndef __SCHEDULER_HH__
#define __SCHEDULER_HH__
#include "gpu-compute/of_scheduling_policy.hh"
#include "gpu-compute/rr_scheduling_policy.hh"
#include "gpu-compute/scheduling_policy.hh"
#include "params/ComputeUnit.hh"
enum SCHED_POLICY
{
OF_POLICY = 0,
RR_POLICY
};
class Scheduler
{
public:
Scheduler(const ComputeUnitParams *params);
Wavefront *chooseWave();
void bindList(std::vector<Wavefront*> *list);
private:
SCHED_POLICY schedPolicy;
SchedulingPolicy<RRSchedulingPolicy> RRSchedPolicy;
SchedulingPolicy<OFSchedulingPolicy> OFSchedPolicy;
};
#endif // __SCHEDULER_HH__

View file

@ -0,0 +1,57 @@
/*
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Sooraj Puthoor
*/
#ifndef __SCHEDULING_POLICY_HH__
#define __SCHEDULING_POLICY_HH__
#include <vector>
template<typename Impl>
class SchedulingPolicy
{
public:
Wavefront* chooseWave() { return policyImpl.chooseWave(); }
void
bindList(std::vector<Wavefront*> *list)
{
return policyImpl.bindList(list);
}
private:
Impl policyImpl;
};
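/*
 * SchedulingPolicy is a thin compile-time wrapper: the concrete policy is
 * selected by the template argument, so no virtual dispatch is needed.
 * Rough usage sketch (names hypothetical):
 *
 *     std::vector<Wavefront*> readyWaves;
 *     SchedulingPolicy<RRSchedulingPolicy> policy;
 *     policy.bindList(&readyWaves);
 *     Wavefront *next = policy.chooseWave();  // panics if the list is empty
 *
 * The Scheduler class selects between the round-robin and oldest-first
 * instantiations at run time based on the execPolicy parameter.
 */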
#endif // __SCHEDULING_POLICY_HH__

View file

@ -0,0 +1,173 @@
/*
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Sooraj Puthoor
*/
#include "gpu-compute/scoreboard_check_stage.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
#include "params/ComputeUnit.hh"
ScoreboardCheckStage::ScoreboardCheckStage(const ComputeUnitParams *p)
: numSIMDs(p->num_SIMDs),
numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes),
numGlbMemPipes(p->num_global_mem_pipes),
numShrMemPipes(p->num_shared_mem_pipes),
vectorAluInstAvail(nullptr),
lastGlbMemSimd(-1),
lastShrMemSimd(-1), glbMemInstAvail(nullptr),
shrMemInstAvail(nullptr)
{
}
ScoreboardCheckStage::~ScoreboardCheckStage()
{
readyList.clear();
waveStatusList.clear();
shrMemInstAvail = nullptr;
glbMemInstAvail = nullptr;
}
void
ScoreboardCheckStage::init(ComputeUnit *cu)
{
computeUnit = cu;
_name = computeUnit->name() + ".ScoreboardCheckStage";
for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) {
readyList.push_back(&computeUnit->readyList[unitId]);
}
for (int unitId = 0; unitId < numSIMDs; ++unitId) {
waveStatusList.push_back(&computeUnit->waveStatusList[unitId]);
}
vectorAluInstAvail = &computeUnit->vectorAluInstAvail;
glbMemInstAvail = &computeUnit->glbMemInstAvail;
shrMemInstAvail = &computeUnit->shrMemInstAvail;
}
void
ScoreboardCheckStage::initStatistics()
{
lastGlbMemSimd = -1;
lastShrMemSimd = -1;
*glbMemInstAvail = 0;
*shrMemInstAvail = 0;
for (int unitId = 0; unitId < numSIMDs; ++unitId)
vectorAluInstAvail->at(unitId) = false;
}
void
ScoreboardCheckStage::collectStatistics(Wavefront *curWave, int unitId)
{
if (curWave->instructionBuffer.empty())
return;
// track which vector SIMD unit has at least one WV with a vector
// ALU as the oldest instruction in its Instruction buffer
vectorAluInstAvail->at(unitId) = vectorAluInstAvail->at(unitId) ||
curWave->isOldestInstALU();
// track how many vector SIMD units have at least one WV with a
// vector Global memory instruction as the oldest instruction
// in its Instruction buffer
if ((curWave->isOldestInstGMem() || curWave->isOldestInstPrivMem() ||
curWave->isOldestInstFlatMem()) && lastGlbMemSimd != unitId &&
*glbMemInstAvail <= 1) {
(*glbMemInstAvail)++;
lastGlbMemSimd = unitId;
}
// track how many vector SIMD units have at least one WV with a
// vector shared memory (LDS) instruction as the oldest instruction
// in its Instruction buffer
// TODO: parametrize the limit of the LDS units
if (curWave->isOldestInstLMem() && (*shrMemInstAvail <= numShrMemPipes) &&
lastShrMemSimd != unitId) {
(*shrMemInstAvail)++;
lastShrMemSimd = unitId;
}
}
void
ScoreboardCheckStage::exec()
{
initStatistics();
// reset the ready list for all execution units; it will be
// constructed every cycle since resource availability may change
for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) {
readyList[unitId]->clear();
}
// iterate over the Wavefronts of all SIMD units
for (int unitId = 0; unitId < numSIMDs; ++unitId) {
for (int wvId = 0; wvId < computeUnit->shader->n_wf; ++wvId) {
// reset the ready status of each wavefront
waveStatusList[unitId]->at(wvId).second = BLOCKED;
Wavefront *curWave = waveStatusList[unitId]->at(wvId).first;
collectStatistics(curWave, unitId);
if (curWave->ready(Wavefront::I_ALU)) {
readyList[unitId]->push_back(curWave);
waveStatusList[unitId]->at(wvId).second = READY;
} else if (curWave->ready(Wavefront::I_GLOBAL)) {
if (computeUnit->cedeSIMD(unitId, wvId)) {
continue;
}
readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
waveStatusList[unitId]->at(wvId).second = READY;
} else if (curWave->ready(Wavefront::I_SHARED)) {
readyList[computeUnit->ShrMemUnitId()]->push_back(curWave);
waveStatusList[unitId]->at(wvId).second = READY;
} else if (curWave->ready(Wavefront::I_FLAT)) {
readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
waveStatusList[unitId]->at(wvId).second = READY;
} else if (curWave->ready(Wavefront::I_PRIVATE)) {
readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
waveStatusList[unitId]->at(wvId).second = READY;
}
}
}
}
void
ScoreboardCheckStage::regStats()
{
}

View file

@ -0,0 +1,106 @@
/*
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Sooraj Puthoor
*/
#ifndef __SCOREBOARD_CHECK_STAGE_HH__
#define __SCOREBOARD_CHECK_STAGE_HH__
#include <cstdint>
#include <string>
#include <utility>
#include <vector>
class ComputeUnit;
class Wavefront;
struct ComputeUnitParams;
enum WAVE_STATUS
{
BLOCKED = 0,
READY
};
/*
* Scoreboard check stage.
* All wavefronts are analyzed to see if they are ready
* to be executed this cycle. Both structural and data
* hazards are considered while marking a wave "ready"
* for execution. After analysis, the ready waves are
* added to readyList.
*/
class ScoreboardCheckStage
{
public:
ScoreboardCheckStage(const ComputeUnitParams* params);
~ScoreboardCheckStage();
void init(ComputeUnit *cu);
void exec();
// Stats related variables and methods
const std::string& name() const { return _name; }
void regStats();
private:
void collectStatistics(Wavefront *curWave, int unitId);
void initStatistics();
ComputeUnit *computeUnit;
uint32_t numSIMDs;
uint32_t numMemUnits;
uint32_t numGlbMemPipes;
uint32_t numShrMemPipes;
// flag per vector SIMD unit that is set when there is at least one
// WF that has a vector ALU instruction as the oldest in its
// Instruction Buffer
std::vector<bool> *vectorAluInstAvail;
int lastGlbMemSimd;
int lastShrMemSimd;
int *glbMemInstAvail;
int *shrMemInstAvail;
// List of waves which are ready to be scheduled.
// Each execution resource has a ready list
std::vector<std::vector<Wavefront*>*> readyList;
// Stores the status of waves. A READY implies the
// wave is ready to be scheduled this cycle and
// is already present in the readyList
std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>*>
waveStatusList;
std::string _name;
};
#endif // __SCOREBOARD_CHECK_STAGE_HH__

412
src/gpu-compute/shader.cc Normal file
View file

@ -0,0 +1,412 @@
/*
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Steve Reinhardt
*/
#include "gpu-compute/shader.hh"
#include <limits>
#include "arch/x86/linux/linux.hh"
#include "base/chunk_generator.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUMem.hh"
#include "debug/HSAIL.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/qstruct.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/packet.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "sim/sim_exit.hh"
Shader::Shader(const Params *p) : SimObject(p),
clock(p->clk_domain->clockPeriod()), cpuThread(nullptr), gpuTc(nullptr),
cpuPointer(p->cpu_pointer), tickEvent(this), timingSim(p->timing),
hsail_mode(SIMT), impl_kern_boundary_sync(p->impl_kern_boundary_sync),
separate_acquire_release(p->separate_acquire_release), coissue_return(1),
trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf),
globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0),
box_tick_cnt(0), start_tick_cnt(0)
{
cuList.resize(n_cu);
for (int i = 0; i < n_cu; ++i) {
cuList[i] = p->CUs[i];
assert(i == cuList[i]->cu_id);
cuList[i]->shader = this;
}
}
Addr
Shader::mmap(int length)
{
Addr start;
// round up length to the next page
length = roundUp(length, TheISA::PageBytes);
if (X86Linux64::mmapGrowsDown()) {
DPRINTF(HSAIL, "GROWS DOWN");
start = gpuTc->getProcessPtr()->mmap_end - length;
gpuTc->getProcessPtr()->mmap_end = start;
} else {
DPRINTF(HSAIL, "GROWS UP");
start = gpuTc->getProcessPtr()->mmap_end;
gpuTc->getProcessPtr()->mmap_end += length;
// assertion to make sure we don't overwrite the stack (it grows down)
assert(gpuTc->getProcessPtr()->mmap_end <
gpuTc->getProcessPtr()->stack_base -
gpuTc->getProcessPtr()->max_stack_size);
}
DPRINTF(HSAIL, "Shader::mmap start= %#x, %#x\n", start, length);
gpuTc->getProcessPtr()->allocateMem(start, length);
return start;
}
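As a rough illustration of the address arithmetic in Shader::mmap above (not the gem5 implementation), assuming 4 KiB pages and a made-up mmap_end value: the request is rounded up to a whole number of pages and, in the grows-down case, the region is carved out just below the previous mmap_end.

#include <cstdio>

static unsigned long long
roundUpTo(unsigned long long n, unsigned long long align)
{
    return (n + align - 1) / align * align;
}

int
main()
{
    const unsigned long long pageBytes = 0x1000;     // assumed 4 KiB pages
    unsigned long long mmapEnd = 0x7fff00000000ULL;  // assumed current mmap end
    unsigned long long length = roundUpTo(5000, pageBytes);  // -> 0x2000

    // grows-down case: the new region sits just below the old mmap end
    unsigned long long start = mmapEnd - length;
    mmapEnd = start;
    std::printf("start=%#llx length=%#llx new mmap_end=%#llx\n",
                start, length, mmapEnd);
    return 0;
}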
void
Shader::init()
{
// grab the threadContext of the thread running on the CPU
assert(cpuPointer);
gpuTc = cpuPointer->getContext(0);
assert(gpuTc);
}
Shader::~Shader()
{
for (int j = 0; j < n_cu; ++j)
delete cuList[j];
}
void
Shader::updateThreadContext(int tid) {
// thread context of the thread which dispatched work
assert(cpuPointer);
gpuTc = cpuPointer->getContext(tid);
assert(gpuTc);
}
void
Shader::hostWakeUp(BaseCPU *cpu) {
if (cpuPointer == cpu) {
if (gpuTc->status() == ThreadContext::Suspended)
cpu->activateContext(gpuTc->threadId());
} else {
// Make sure both the dispatcher and the shader are trying to
// wake up the same host. Hack here to enable kernel launches
// from multiple CPUs.
panic("Dispatcher wants to wakeup a different host");
}
}
Shader*
ShaderParams::create()
{
return new Shader(this);
}
void
Shader::exec()
{
tick_cnt = curTick();
box_tick_cnt = curTick() - start_tick_cnt;
// apply any scheduled adds
for (int i = 0; i < sa_n; ++i) {
if (sa_when[i] <= tick_cnt) {
*sa_val[i] += sa_x[i];
sa_val.erase(sa_val.begin() + i);
sa_x.erase(sa_x.begin() + i);
sa_when.erase(sa_when.begin() + i);
--sa_n;
--i;
}
}
// clock all of the cu's
for (int i = 0; i < n_cu; ++i)
cuList[i]->exec();
}
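The "scheduled adds" applied at the top of exec() pair with ScheduleAdd() further down: three parallel vectors record a pointer, a due tick, and an increment, and an entry is applied and erased once the current tick reaches its due time. A self-contained sketch of that idea, with hypothetical names:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

int
main()
{
    std::vector<uint32_t*> vals;    // mirrors sa_val
    std::vector<uint64_t> whens;    // mirrors sa_when
    std::vector<int32_t> amounts;   // mirrors sa_x

    uint32_t counter = 0;
    // record "+1 at tick 100" (what a ScheduleAdd call would push)
    vals.push_back(&counter);
    whens.push_back(100);
    amounts.push_back(1);

    uint64_t now = 150;  // pretend current tick
    for (std::size_t i = 0; i < vals.size();) {
        if (whens[i] <= now) {
            // apply the deferred increment and drop the entry
            *vals[i] += amounts[i];
            vals.erase(vals.begin() + i);
            whens.erase(whens.begin() + i);
            amounts.erase(amounts.begin() + i);
        } else {
            ++i;
        }
    }
    std::printf("counter=%u pending=%zu\n", counter, vals.size());
    return 0;
}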
bool
Shader::dispatch_workgroups(NDRange *ndr)
{
bool scheduledSomething = false;
int cuCount = 0;
int curCu = nextSchedCu;
while (cuCount < n_cu) {
// Every time we try a CU, update nextSchedCu
nextSchedCu = (nextSchedCu + 1) % n_cu;
// dispatch a workgroup iff the following two conditions are met:
// (a) wg_disp_rem is true - there are unassigned workgroups in the grid
// (b) there are enough free slots in cuList[curCu] for this workgroup
if (ndr->wg_disp_rem && cuList[curCu]->ReadyWorkgroup(ndr)) {
scheduledSomething = true;
DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d\n", curCu);
// ticks() member function translates cycles to simulation ticks.
if (!tickEvent.scheduled()) {
schedule(tickEvent, curTick() + this->ticks(1));
}
cuList[curCu]->StartWorkgroup(ndr);
ndr->wgId[0]++;
ndr->globalWgId++;
if (ndr->wgId[0] * ndr->q.wgSize[0] >= ndr->q.gdSize[0]) {
ndr->wgId[0] = 0;
ndr->wgId[1]++;
if (ndr->wgId[1] * ndr->q.wgSize[1] >= ndr->q.gdSize[1]) {
ndr->wgId[1] = 0;
ndr->wgId[2]++;
if (ndr->wgId[2] * ndr->q.wgSize[2] >= ndr->q.gdSize[2]) {
ndr->wg_disp_rem = false;
break;
}
}
}
}
++cuCount;
curCu = nextSchedCu;
}
return scheduledSomething;
}
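The nested wgId increments above advance the 3-D workgroup ID like a three-digit counter, where digit i wraps once wgId[i] * wgSize[i] reaches gdSize[i]. A standalone sketch (assumed 8x8x1 grid of 4x4x1 workgroups, not taken from the patch) that walks the same carry logic and counts the dispatched workgroups:

#include <cstdio>

int
main()
{
    int wgSize[3] = {4, 4, 1};    // assumed work-group dimensions
    int gdSize[3] = {8, 8, 1};    // assumed grid dimensions
    int wgId[3] = {0, 0, 0};
    bool dispRem = true;

    int dispatched = 0;
    while (dispRem) {
        ++dispatched;             // "dispatch" the current workgroup
        wgId[0]++;
        if (wgId[0] * wgSize[0] >= gdSize[0]) {
            wgId[0] = 0;
            wgId[1]++;
            if (wgId[1] * wgSize[1] >= gdSize[1]) {
                wgId[1] = 0;
                wgId[2]++;
                if (wgId[2] * wgSize[2] >= gdSize[2])
                    dispRem = false;   // grid exhausted
            }
        }
    }
    std::printf("dispatched %d workgroups (expect 4)\n", dispatched);
    return 0;
}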
void
Shader::handshake(GpuDispatcher *_dispatcher)
{
dispatcher = _dispatcher;
}
void
Shader::doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data,
bool suppress_func_errors, int cu_id)
{
unsigned block_size = RubySystem::getBlockSizeBytes();
unsigned size = req->getSize();
Addr tmp_addr;
BaseTLB::Mode trans_mode;
if (cmd == MemCmd::ReadReq) {
trans_mode = BaseTLB::Read;
} else if (cmd == MemCmd::WriteReq) {
trans_mode = BaseTLB::Write;
} else {
fatal("unexpected MemCmd\n");
}
tmp_addr = req->getVaddr();
Addr split_addr = roundDown(tmp_addr + size - 1, block_size);
assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);
// Misaligned access
if (split_addr > tmp_addr) {
RequestPtr req1, req2;
req->splitOnVaddr(split_addr, req1, req2);
PacketPtr pkt1 = new Packet(req2, cmd);
PacketPtr pkt2 = new Packet(req1, cmd);
functionalTLBAccess(pkt1, cu_id, trans_mode);
functionalTLBAccess(pkt2, cu_id, trans_mode);
PacketPtr new_pkt1 = new Packet(pkt1->req, cmd);
PacketPtr new_pkt2 = new Packet(pkt2->req, cmd);
new_pkt1->dataStatic(data);
new_pkt2->dataStatic((uint8_t*)data + req1->getSize());
if (suppress_func_errors) {
new_pkt1->setSuppressFuncError();
new_pkt2->setSuppressFuncError();
}
// fixme: this should be cuList[cu_id] if cu_id != n_cu
// The latter requires a memPort in the dispatcher
cuList[0]->memPort[0]->sendFunctional(new_pkt1);
cuList[0]->memPort[0]->sendFunctional(new_pkt2);
delete new_pkt1;
delete new_pkt2;
delete pkt1;
delete pkt2;
} else {
PacketPtr pkt = new Packet(req, cmd);
functionalTLBAccess(pkt, cu_id, trans_mode);
PacketPtr new_pkt = new Packet(pkt->req, cmd);
new_pkt->dataStatic(data);
if (suppress_func_errors) {
new_pkt->setSuppressFuncError();
}
// fixme: this should be cuList[cu_id] if cu_id != n_cu
// The latter requires a memPort in the dispatcher
cuList[0]->memPort[0]->sendFunctional(new_pkt);
delete new_pkt;
delete pkt;
}
}
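The split test in doFunctionalAccess() flags an access as misaligned when the address of its last byte, rounded down to a block boundary, lies above the starting address, i.e. the access straddles two blocks. A minimal sketch of that check, assuming a 64-byte block rather than RubySystem::getBlockSizeBytes():

#include <cstdio>

static unsigned long long
roundDownTo(unsigned long long a, unsigned long long align)
{
    return a / align * align;
}

int
main()
{
    const unsigned long long blockSize = 64;  // assumed cache block size
    struct Req { unsigned long long vaddr; unsigned long long size; };
    Req reqs[] = { {0x1000, 8},    // fits within one block
                   {0x103c, 8} };  // crosses into the next block

    for (const Req &r : reqs) {
        unsigned long long splitAddr =
            roundDownTo(r.vaddr + r.size - 1, blockSize);
        bool misaligned = splitAddr > r.vaddr;
        std::printf("vaddr=%#llx size=%llu -> %s\n", r.vaddr, r.size,
                    misaligned ? "split into two packets" : "one packet");
    }
    return 0;
}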
bool
Shader::busy()
{
for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
if (!cuList[i_cu]->isDone()) {
return true;
}
}
return false;
}
void
Shader::ScheduleAdd(uint32_t *val, Tick when, int x)
{
sa_val.push_back(val);
sa_when.push_back(tick_cnt + when);
sa_x.push_back(x);
++sa_n;
}
Shader::TickEvent::TickEvent(Shader *_shader)
: Event(CPU_Tick_Pri), shader(_shader)
{
}
void
Shader::TickEvent::process()
{
if (shader->busy()) {
shader->exec();
shader->schedule(this, curTick() + shader->ticks(1));
}
}
const char*
Shader::TickEvent::description() const
{
return "Shader tick";
}
void
Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
MemCmd cmd, bool suppress_func_errors)
{
uint8_t *data_buf = (uint8_t*)ptr;
for (ChunkGenerator gen(address, size, RubySystem::getBlockSizeBytes());
!gen.done(); gen.next()) {
Request *req = new Request(0, gen.addr(), gen.size(), 0,
cuList[0]->masterId(), 0, 0, 0);
doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id);
data_buf += gen.size();
delete req;
}
}
void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
{
AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false);
}
void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
bool suppress_func_errors)
{
AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, suppress_func_errors);
}
void
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
{
AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false);
}
void
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
bool suppress_func_errors)
{
AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq,
suppress_func_errors);
}
/*
* Send a packet through the appropriate TLB functional port.
* If cu_id=n_cu, then this is the dispatcher's TLB.
* Otherwise it's the TLB of the cu_id compute unit.
*/
void
Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
{
// update senderState. Need to know the gpuTc and the TLB mode
pkt->senderState =
new TheISA::GpuTLB::TranslationState(mode, gpuTc, false);
if (cu_id == n_cu) {
dispatcher->tlbPort->sendFunctional(pkt);
} else {
// even when the perLaneTLB flag is turned on
// it's ok to send all accesses through lane 0
// since the lane # is not known here.
// This isn't important since these are functional accesses.
cuList[cu_id]->tlbPort[0]->sendFunctional(pkt);
}
/* safe_cast the senderState */
TheISA::GpuTLB::TranslationState *sender_state =
safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
delete sender_state->tlbEntry;
delete pkt->senderState;
}

212
src/gpu-compute/shader.hh Normal file
View file

@ -0,0 +1,212 @@
/*
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Steve Reinhardt
*/
#ifndef __SHADER_HH__
#define __SHADER_HH__
#include <functional>
#include <string>
#include "arch/isa.hh"
#include "arch/isa_traits.hh"
#include "base/types.hh"
#include "cpu/simple/atomic.hh"
#include "cpu/simple/timing.hh"
#include "cpu/simple_thread.hh"
#include "cpu/thread_context.hh"
#include "cpu/thread_state.hh"
#include "enums/MemOpType.hh"
#include "enums/MemType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_tlb.hh"
#include "gpu-compute/lds_state.hh"
#include "gpu-compute/qstruct.hh"
#include "mem/page_table.hh"
#include "mem/port.hh"
#include "mem/request.hh"
#include "params/Shader.hh"
#include "sim/faults.hh"
#include "sim/process.hh"
#include "sim/sim_object.hh"
class BaseTLB;
class GpuDispatcher;
namespace TheISA
{
class GpuTLB;
}
static const int LDS_SIZE = 65536;
// Class Shader: This describes a single shader instance. Most
// configurations will only have a single shader.
class Shader : public SimObject
{
protected:
// Shader's clock period in terms of the number of ticks of the
// global simulation clock (curTick())
Tick clock;
public:
typedef ShaderParams Params;
enum hsail_mode_e {SIMT,VECTOR_SCALAR};
// clock related functions ; maps to-and-from
// Simulation ticks and shader clocks.
Tick frequency() const { return SimClock::Frequency / clock; }
Tick ticks(int numCycles) const { return (Tick)clock * numCycles; }
Tick getClock() const { return clock; }
Tick curCycle() const { return curTick() / clock; }
Tick tickToCycles(Tick val) const { return val / clock;}
SimpleThread *cpuThread;
ThreadContext *gpuTc;
BaseCPU *cpuPointer;
class TickEvent : public Event
{
private:
Shader *shader;
public:
TickEvent(Shader*);
void process();
const char* description() const;
};
TickEvent tickEvent;
// is the memory system simulated in timing mode?
bool timingSim;
hsail_mode_e hsail_mode;
// If set, issue an acquire packet at kernel launch
int impl_kern_boundary_sync;
// If set, generate a separate packet for acquire/release on
// ld_acquire/st_release/atomic operations
int separate_acquire_release;
// If set, fetch returns may be coissued with instructions
int coissue_return;
// If set, always dump all 64 gprs to trace
int trace_vgpr_all;
// Number of cu units in the shader
int n_cu;
// Number of wavefront slots per cu
int n_wf;
// The size of global memory
int globalMemSize;
/*
* Bytes/work-item for call instruction
* The number of arguments for an hsail function will
* vary. We simply determine the maximum # of arguments
* required by any hsail function up front before the
* simulation (during parsing of the Brig) and record
* that number here.
*/
int funcargs_size;
// Tracks the CU that the round-robin dispatcher should try to schedule next
int nextSchedCu;
// Size of scheduled add queue
uint32_t sa_n;
// Pointers to the values to be incremented
std::vector<uint32_t*> sa_val;
// When to do the increment
std::vector<uint64_t> sa_when;
// Amount to increment by
std::vector<int32_t> sa_x;
// List of Compute Units (CU's)
std::vector<ComputeUnit*> cuList;
uint64_t tick_cnt;
uint64_t box_tick_cnt;
uint64_t start_tick_cnt;
GpuDispatcher *dispatcher;
Shader(const Params *p);
~Shader();
virtual void init();
// Run shader
void exec();
// Check to see if shader is busy
bool busy();
// Schedule a 32-bit value to be incremented some time in the future
void ScheduleAdd(uint32_t *val, Tick when, int x);
bool processTimingPacket(PacketPtr pkt);
void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
MemCmd cmd, bool suppress_func_errors);
void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id);
void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id,
bool suppress_func_errors);
void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id);
void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id,
bool suppress_func_errors);
void doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data,
bool suppress_func_errors, int cu_id);
void
registerCU(int cu_id, ComputeUnit *compute_unit)
{
cuList[cu_id] = compute_unit;
}
void handshake(GpuDispatcher *dispatcher);
bool dispatch_workgroups(NDRange *ndr);
Addr mmap(int length);
void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode);
void updateThreadContext(int tid);
void hostWakeUp(BaseCPU *cpu);
};
#endif // __SHADER_HH__
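A small standalone example (not gem5 code) of the tick/cycle conversions expressed by Shader's clock helpers above, assuming a 1 THz global tick rate and a 1000-tick clock period (i.e. a 1 GHz shader clock); the constants are placeholders for SimClock::Frequency and clk_domain->clockPeriod().

#include <cstdio>

int
main()
{
    const unsigned long long ticksPerSecond = 1000000000000ULL; // assumed tick rate
    const unsigned long long clockPeriod = 1000;                // ticks per shader cycle

    unsigned long long frequency = ticksPerSecond / clockPeriod; // frequency()
    unsigned long long ticksFor5 = clockPeriod * 5;              // ticks(5)
    unsigned long long curTick = 123456;
    unsigned long long curCycle = curTick / clockPeriod;         // tickToCycles(curTick)

    std::printf("freq=%llu Hz ticks(5)=%llu cycle=%llu\n",
                frequency, ticksFor5, curCycle);
    return 0;
}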

108
src/gpu-compute/simple_pool_manager.cc Normal file
View file

@ -0,0 +1,108 @@
/*
* Copyright (c) 2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: John Kalamatianos
*/
#include "gpu-compute/simple_pool_manager.hh"
#include "base/misc.hh"
// return the min number of elements that the manager can reserve given
// a request for "size" elements
uint32_t
SimplePoolManager::minAllocatedElements(uint32_t size)
{
fatal_if(size <= 0 || size > poolSize(), "Illegal VGPR region size=%d\n",
size);
return size % minAllocation() > 0 ?
(minAllocation() - (size % minAllocation())) + size : size;
}
std::string
SimplePoolManager::printRegion()
{
std::string _cout;
if (_reservedGroups == 0)
_cout = "VRF is empty\n";
else if (_reservedGroups > 0) {
uint32_t reservedEntries = _reservedGroups * _regionSize;
_cout = "VRF reserves " + std::to_string(reservedEntries) + " VGPRs\n";
}
return _cout;
}
bool
SimplePoolManager::canAllocate(uint32_t numRegions, uint32_t size)
{
assert(numRegions * minAllocatedElements(size) <= poolSize());
return _reservedGroups == 0;
}
void
SimplePoolManager::freeRegion(uint32_t firstIdx, uint32_t lastIdx)
{
assert(_reservedGroups > 0);
--_reservedGroups;
if (!_reservedGroups)
_nxtFreeIdx = 0;
}
uint32_t
SimplePoolManager::allocateRegion(const uint32_t size,
uint32_t *reservedPoolSize)
{
uint32_t actualSize = minAllocatedElements(size);
uint32_t startIdx = _nxtFreeIdx;
_nxtFreeIdx += actualSize;
_regionSize = actualSize;
assert(_nxtFreeIdx < poolSize());
*reservedPoolSize = actualSize;
++_reservedGroups;
return startIdx;
}
uint32_t
SimplePoolManager::regionSize(std::pair<uint32_t, uint32_t> &region)
{
bool wrapAround = (region.first > region.second);
if (!wrapAround) {
return region.second - region.first + 1;
} else {
return region.second + poolSize() - region.first + 1;
}
}
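Two of the calculations above are easy to check by hand: minAllocatedElements() rounds a request up to the next multiple of the minimum allocation, and regionSize() adds poolSize() when the region wraps around the end of the pool. A standalone sketch with assumed values (minAllocation = 4, poolSize = 256):

#include <cstdint>
#include <cstdio>
#include <utility>

static uint32_t
minAllocatedElements(uint32_t size, uint32_t minAlloc)
{
    // round a request up to the next multiple of the minimum allocation
    return size % minAlloc ? size + (minAlloc - size % minAlloc) : size;
}

static uint32_t
regionSize(std::pair<uint32_t, uint32_t> region, uint32_t poolSize)
{
    // a region that wraps around the end of the pool spans the tail
    // of the pool plus the head up to region.second
    bool wrapAround = region.first > region.second;
    return wrapAround ? region.second + poolSize - region.first + 1
                      : region.second - region.first + 1;
}

int
main()
{
    const uint32_t minAlloc = 4, poolSize = 256;
    std::printf("request 10 -> reserve %u\n",
                minAllocatedElements(10, minAlloc));   // 12
    std::printf("region [250, 5] spans %u entries\n",
                regionSize({250, 5}, poolSize));       // 12
    return 0;
}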

72
src/gpu-compute/simple_pool_manager.hh Normal file
View file

@ -0,0 +1,72 @@
/*
* Copyright (c) 2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: John Kalamatianos
*/
#ifndef __SIMPLE_POOL_MANAGER_HH__
#define __SIMPLE_POOL_MANAGER_HH__
#include <cassert>
#include <cstdint>
#include "gpu-compute/pool_manager.hh"
// Simple Pool Manager: allows one region per pool. No region merging is
// supported.
class SimplePoolManager : public PoolManager
{
public:
SimplePoolManager(uint32_t minAlloc, uint32_t poolSize)
: PoolManager(minAlloc, poolSize), _regionSize(0), _nxtFreeIdx(0),
_reservedGroups(0)
{
}
uint32_t minAllocatedElements(uint32_t size);
std::string printRegion();
bool canAllocate(uint32_t numRegions, uint32_t size);
uint32_t allocateRegion(const uint32_t size, uint32_t *reservedPoolSize);
void freeRegion(uint32_t firstIdx, uint32_t lastIdx);
uint32_t regionSize(std::pair<uint32_t,uint32_t> &region);
private:
// actual size of a region (normalized to the minimum size that can
// be reserved)
uint32_t _regionSize;
// next index to allocate a region
uint8_t _nxtFreeIdx;
// number of groups that reserve a region
uint32_t _reservedGroups;
};
#endif // __SIMPLE_POOL_MANAGER_HH__

Some files were not shown because too many files have changed in this diff.