gpu-compute: AMD's baseline GPU model

parent 28e353e040
commit 1a7d3f9fcb

191 changed files with 95286 additions and 92 deletions
SConstruct (41 lines changed)

@@ -1065,7 +1065,9 @@ main = conf.Finish()

# Define the universe of supported ISAs
all_isa_list = [ ]
all_gpu_isa_list = [ ]
Export('all_isa_list')
Export('all_gpu_isa_list')

class CpuModel(object):
    '''The CpuModel class encapsulates everything the ISA parser needs to

@@ -1121,9 +1123,11 @@ for bdir in [ base_dir ] + extras_dir_list:
            SConscript(joinpath(root, 'SConsopts'))

all_isa_list.sort()
all_gpu_isa_list.sort()

sticky_vars.AddVariables(
    EnumVariable('TARGET_ISA', 'Target ISA', 'alpha', all_isa_list),
    EnumVariable('TARGET_GPU_ISA', 'Target GPU ISA', 'hsail', all_gpu_isa_list),
    ListVariable('CPU_MODELS', 'CPU models',
                 sorted(n for n,m in CpuModel.dict.iteritems() if m.default),
                 sorted(CpuModel.dict.keys())),

@@ -1139,6 +1143,7 @@ sticky_vars.AddVariables(
    BoolVariable('USE_FENV', 'Use <fenv.h> IEEE mode control', have_fenv),
    BoolVariable('CP_ANNOTATE', 'Enable critical path annotation capability', False),
    BoolVariable('USE_KVM', 'Enable hardware virtualized (KVM) CPU models', have_kvm),
    BoolVariable('BUILD_GPU', 'Build the compute-GPU model', False),
    EnumVariable('PROTOCOL', 'Coherence protocol for Ruby', 'None',
                 all_protocols),
    EnumVariable('BACKTRACE_IMPL', 'Post-mortem dump implementation',

@@ -1146,9 +1151,9 @@ sticky_vars.AddVariables(
    )

# These variables get exported to #defines in config/*.hh (see src/SConscript).
export_vars += ['USE_FENV', 'SS_COMPATIBLE_FP', 'TARGET_ISA', 'CP_ANNOTATE',
                'USE_POSIX_CLOCK', 'USE_KVM', 'PROTOCOL', 'HAVE_PROTOBUF',
                'HAVE_PERF_ATTR_EXCLUDE_HOST']
export_vars += ['USE_FENV', 'SS_COMPATIBLE_FP', 'TARGET_ISA', 'TARGET_GPU_ISA',
                'CP_ANNOTATE', 'USE_POSIX_CLOCK', 'USE_KVM', 'PROTOCOL',
                'HAVE_PROTOBUF', 'HAVE_PERF_ATTR_EXCLUDE_HOST']

###################################################
#

@@ -1226,6 +1231,7 @@ main.SConscript('ext/nomali/SConscript',
###################################################

main['ALL_ISA_LIST'] = all_isa_list
main['ALL_GPU_ISA_LIST'] = all_gpu_isa_list
all_isa_deps = {}
def make_switching_dir(dname, switch_headers, env):
    # Generate the header. target[0] is the full path of the output
@@ -1258,6 +1264,35 @@ def make_switching_dir(dname, switch_headers, env):

Export('make_switching_dir')

def make_gpu_switching_dir(dname, switch_headers, env):
    # Generate the header. target[0] is the full path of the output
    # header to generate. 'source' is a dummy variable, since we get the
    # list of ISAs from env['ALL_GPU_ISA_LIST'].
    def gen_switch_hdr(target, source, env):
        fname = str(target[0])

        isa = env['TARGET_GPU_ISA'].lower()

        try:
            f = open(fname, 'w')
            print >>f, '#include "%s/%s/%s"' % (dname, isa, basename(fname))
            f.close()
        except IOError:
            print "Failed to create %s" % fname
            raise

    # Build SCons Action object. 'varlist' specifies env vars that this
    # action depends on; when env['ALL_GPU_ISA_LIST'] changes these actions
    # should get re-executed.
    switch_hdr_action = MakeAction(gen_switch_hdr,
                          Transform("GENERATE"), varlist=['ALL_GPU_ISA_LIST'])

    # Instantiate actions for each header
    for hdr in switch_headers:
        env.Command(hdr, [], switch_hdr_action)

Export('make_gpu_switching_dir')

# all-isas -> all-deps -> all-environs -> all_targets
main.Alias('#all-isas', [])
main.Alias('#all-deps', '#all-isas')
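
make_gpu_switching_dir mirrors the existing make_switching_dir: for every switch header it emits a one-line wrapper that redirects to the copy belonging to the GPU ISA selected by TARGET_GPU_ISA. As a rough illustration (not part of the commit; the function name and the 'arch/gpu_isa.hh' path below are made up), the generated wrapper for the default 'hsail' ISA would contain a single include line. With the new BUILD_GPU and TARGET_GPU_ISA sticky variables, a GPU-enabled binary can then be built from the build_opts file introduced below (e.g. scons build/HSAIL_X86/gem5.opt), assuming the usual gem5 build flow where the build_opts file name pre-seeds the sticky variables.

    # Illustrative sketch only -- mimics gen_switch_hdr above with made-up paths.
    from os.path import basename

    def preview_gpu_switch_hdr(dname, fname, target_gpu_isa='hsail'):
        # The real SCons action writes this line into the build tree so that
        # an '#include "arch/gpu_isa.hh"' resolves to the chosen GPU ISA.
        return '#include "%s/%s/%s"' % (dname, target_gpu_isa.lower(), basename(fname))

    print preview_gpu_switch_hdr('arch', 'gpu_isa.hh')   # -> #include "arch/hsail/gpu_isa.hh"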
build_opts/HSAIL_X86 (new file, 5 lines)

@@ -0,0 +1,5 @@
PROTOCOL = 'GPU_RfO'
TARGET_ISA = 'x86'
TARGET_GPU_ISA = 'hsail'
BUILD_GPU = True
CPU_MODELS = 'AtomicSimpleCPU,O3CPU,TimingSimpleCPU'
build_opts/X86_MOESI_AMD_Base (new file, 3 lines)

@@ -0,0 +1,3 @@
PROTOCOL = 'MOESI_AMD_Base'
TARGET_ISA = 'x86'
CPU_MODELS = 'AtomicSimpleCPU,O3CPU,TimingSimpleCPU'
configs/common/GPUTLBConfig.py (new file, 203 lines)

@@ -0,0 +1,203 @@
#
# Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Lisa Hsu
#

# Configure the TLB hierarchy.
# Places which would probably need to be modified if you want a
# different hierarchy are marked with a <Modify here ...> comment.
import sys
import m5
from m5.objects import *

def TLB_constructor(level):

    constructor_call = "X86GPUTLB(size = options.L%(level)dTLBentries, \
        assoc = options.L%(level)dTLBassoc, \
        hitLatency = options.L%(level)dAccessLatency,\
        missLatency2 = options.L%(level)dMissLatency,\
        maxOutstandingReqs = options.L%(level)dMaxOutstandingReqs,\
        accessDistance = options.L%(level)dAccessDistanceStat,\
        clk_domain = SrcClockDomain(\
            clock = options.GPUClock,\
            voltage_domain = VoltageDomain(\
                voltage = options.gpu_voltage)))" % locals()
    return constructor_call

def Coalescer_constructor(level):

    constructor_call = "TLBCoalescer(probesPerCycle = \
        options.L%(level)dProbesPerCycle, \
        coalescingWindow = options.L%(level)dCoalescingWindow,\
        disableCoalescing = options.L%(level)dDisableCoalescing,\
        clk_domain = SrcClockDomain(\
            clock = options.GPUClock,\
            voltage_domain = VoltageDomain(\
                voltage = options.gpu_voltage)))" % locals()
    return constructor_call

def create_TLB_Coalescer(options, my_level, my_index, TLB_name, Coalescer_name):
    # arguments: options, TLB level, number of private structures for this
    # level, TLB name and Coalescer name
    for i in xrange(my_index):
        TLB_name.append(eval(TLB_constructor(my_level)))
        Coalescer_name.append(eval(Coalescer_constructor(my_level)))

def config_tlb_hierarchy(options, system, shader_idx):
    n_cu = options.num_compute_units
    # Make this configurable now, instead of a hard-coded value. The
    # dispatcher is always the last item in the system.cpu list.
    dispatcher_idx = len(system.cpu) - 1

    if options.TLB_config == "perLane":
        num_TLBs = 64 * n_cu
    elif options.TLB_config == "mono":
        num_TLBs = 1
    elif options.TLB_config == "perCU":
        num_TLBs = n_cu
    elif options.TLB_config == "2CU":
        num_TLBs = n_cu >> 1
    else:
        print "Bad option for TLB Configuration."
        sys.exit(1)

    #--------------------------------------------------------------------------
    # A visual representation of the TLB hierarchy, for ease of configuration.
    # <Modify here the width and the number of levels if you want a different
    # configuration>
    # width is the number of TLBs of the given type (i.e. D-TLB, I-TLB etc.)
    # for this level
    L1 = [{'name': 'sqc', 'width': options.num_sqc, 'TLBarray': [], 'CoalescerArray': []},
          {'name': 'dispatcher', 'width': 1, 'TLBarray': [], 'CoalescerArray': []},
          {'name': 'l1', 'width': num_TLBs, 'TLBarray': [], 'CoalescerArray': []}]

    L2 = [{'name': 'l2', 'width': 1, 'TLBarray': [], 'CoalescerArray': []}]
    L3 = [{'name': 'l3', 'width': 1, 'TLBarray': [], 'CoalescerArray': []}]

    TLB_hierarchy = [L1, L2, L3]

    #--------------------------------------------------------------------------
    # Create the hierarchy:
    # call the appropriate constructors and add objects to the system

    for i in xrange(len(TLB_hierarchy)):
        hierarchy_level = TLB_hierarchy[i]
        level = i + 1
        for TLB_type in hierarchy_level:
            TLB_index = TLB_type['width']
            TLB_array = TLB_type['TLBarray']
            Coalescer_array = TLB_type['CoalescerArray']
            # If the sim calls for a fixed L1 TLB size across CUs,
            # override the TLB entries option
            if options.tot_L1TLB_size:
                options.L1TLBentries = options.tot_L1TLB_size / num_TLBs
                if options.L1TLBassoc > options.L1TLBentries:
                    options.L1TLBassoc = options.L1TLBentries
            # call the constructors for the TLB and the Coalescer
            create_TLB_Coalescer(options, level, TLB_index,\
                                 TLB_array, Coalescer_array)

            system_TLB_name = TLB_type['name'] + '_tlb'
            system_Coalescer_name = TLB_type['name'] + '_coalescer'

            # add the different TLB levels to the system
            # Modify here if you want to make the TLB hierarchy a child of
            # the shader.
            exec('system.%s = TLB_array' % system_TLB_name)
            exec('system.%s = Coalescer_array' % system_Coalescer_name)

    #===========================================================
    # Specify the TLB hierarchy (i.e., port connections)
    # All TLBs but the last-level TLB need to have a memSidePort (master)
    #===========================================================

    # Each TLB is connected with its Coalescer through a single port.
    # There is a one-to-one mapping of TLBs to Coalescers at a given level.
    # This won't be modified no matter what the hierarchy looks like.
    for i in xrange(len(TLB_hierarchy)):
        hierarchy_level = TLB_hierarchy[i]
        level = i + 1
        for TLB_type in hierarchy_level:
            name = TLB_type['name']
            for index in range(TLB_type['width']):
                exec('system.%s_coalescer[%d].master[0] = \
                        system.%s_tlb[%d].slave[0]' % \
                        (name, index, name, index))

    # Connect the cpuSidePort (slave) of all the coalescers in level 1
    # <Modify here if you want a different configuration>
    for TLB_type in L1:
        name = TLB_type['name']
        num_TLBs = TLB_type['width']
        if name == 'l1':           # L1 D-TLBs
            tlb_per_cu = num_TLBs / n_cu
            for cu_idx in range(n_cu):
                if tlb_per_cu:
                    for tlb in range(tlb_per_cu):
                        exec('system.cpu[%d].CUs[%d].translation_port[%d] = \
                                system.l1_coalescer[%d].slave[%d]' % \
                                (shader_idx, cu_idx, tlb, cu_idx*tlb_per_cu+tlb, 0))
                else:
                    exec('system.cpu[%d].CUs[%d].translation_port[%d] = \
                            system.l1_coalescer[%d].slave[%d]' % \
                            (shader_idx, cu_idx, tlb_per_cu, cu_idx / (n_cu / num_TLBs), cu_idx % (n_cu / num_TLBs)))

        elif name == 'dispatcher': # Dispatcher TLB
            for index in range(TLB_type['width']):
                exec('system.cpu[%d].translation_port = \
                        system.dispatcher_coalescer[%d].slave[0]' % \
                        (dispatcher_idx, index))
        elif name == 'sqc':        # I-TLB
            for index in range(n_cu):
                sqc_tlb_index = index / options.cu_per_sqc
                sqc_tlb_port_id = index % options.cu_per_sqc
                exec('system.cpu[%d].CUs[%d].sqc_tlb_port = \
                        system.sqc_coalescer[%d].slave[%d]' % \
                        (shader_idx, index, sqc_tlb_index, sqc_tlb_port_id))

    # Connect the memSidePorts (masters) of all the TLBs with the
    # cpuSidePorts (slaves) of the Coalescers of the next level
    # <Modify here if you want a different configuration>
    # L1 <-> L2
    l2_coalescer_index = 0
    for TLB_type in L1:
        name = TLB_type['name']
        for index in range(TLB_type['width']):
            exec('system.%s_tlb[%d].master[0] = \
                    system.l2_coalescer[0].slave[%d]' % \
                    (name, index, l2_coalescer_index))
            l2_coalescer_index += 1
    # L2 <-> L3
    system.l2_tlb[0].master[0] = system.l3_coalescer[0].slave[0]

    return system
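
The per-level option names are generated by string substitution (L1TLBentries, L2TLBentries, and so on), and the L1 data-TLB width follows the --TLB-config choice. A small worked example of the sizing logic above (not part of the commit; the numbers are illustrative):

    # Illustrative only: reproduces the num_TLBs / tot_L1TLB_size arithmetic above.
    n_cu = 4                     # --num-compute-units
    tlb_config = "perCU"         # --TLB-config
    tot_l1_size = 64             # --tot-L1TLB-size (0 means "use --L1TLBentries as-is")

    num_TLBs = {"perLane": 64 * n_cu, "mono": 1,
                "perCU": n_cu, "2CU": n_cu >> 1}[tlb_config]

    if tot_l1_size:
        l1_entries = tot_l1_size / num_TLBs   # 64 / 4 = 16 entries per private L1 TLB
        l1_assoc = min(32, l1_entries)        # assoc is clamped to the entry count
    print num_TLBs, l1_entries, l1_assoc      # -> 4 16 16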

configs/common/GPUTLBOptions.py (new file, 109 lines)

@@ -0,0 +1,109 @@
#
# Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# (BSD-3 redistribution conditions and disclaimer, identical to the
# GPUTLBConfig.py header above.)
#
# Author: Myrto Papadopoulou
#

def tlb_options(parser):

    #===================================================================
    # TLB Configuration
    #===================================================================

    parser.add_option("--TLB-config", type="string", default="perCU",
            help="Options are: perCU (default), mono, 2CU, or perLane")

    #===================================================================
    # L1 TLB Options (D-TLB, I-TLB, Dispatcher-TLB)
    #===================================================================

    parser.add_option("--L1TLBentries", type='int', default="32")
    parser.add_option("--L1TLBassoc", type='int', default="32")
    parser.add_option("--L1AccessLatency", type='int', default="1",
                      help="latency in gpu cycles")
    parser.add_option("--L1MissLatency", type='int', default="750",
                      help="latency (in gpu cycles) of a page walk, "
                      "if this is a last level TLB")
    parser.add_option("--L1MaxOutstandingReqs", type='int', default="64")
    parser.add_option("--L1AccessDistanceStat", action="store_true")
    parser.add_option("--tot-L1TLB-size", type="int", default="0")

    #===================================================================
    # L2 TLB Options
    #===================================================================

    parser.add_option("--L2TLBentries", type='int', default="4096")
    parser.add_option("--L2TLBassoc", type='int', default="32")
    parser.add_option("--L2AccessLatency", type='int', default="69",
                      help="latency in gpu cycles")
    parser.add_option("--L2MissLatency", type='int', default="750",
                      help="latency (in gpu cycles) of a page walk, "
                      "if this is a last level TLB")
    parser.add_option("--L2MaxOutstandingReqs", type='int', default="64")
    parser.add_option("--L2AccessDistanceStat", action="store_true")

    #===================================================================
    # L3 TLB Options
    #===================================================================

    parser.add_option("--L3TLBentries", type='int', default="8192")
    parser.add_option("--L3TLBassoc", type='int', default="32")
    parser.add_option("--L3AccessLatency", type='int', default="150",
                      help="latency in gpu cycles")
    parser.add_option("--L3MissLatency", type='int', default="750",
                      help="latency (in gpu cycles) of a page walk")
    parser.add_option("--L3MaxOutstandingReqs", type='int', default="64")
    parser.add_option("--L3AccessDistanceStat", action="store_true")

    #===================================================================
    # L1 TLBCoalescer Options
    #===================================================================

    parser.add_option("--L1ProbesPerCycle", type='int', default="2")
    parser.add_option("--L1CoalescingWindow", type='int', default="1")
    parser.add_option("--L1DisableCoalescing", action="store_true")

    #===================================================================
    # L2 TLBCoalescer Options
    #===================================================================

    parser.add_option("--L2ProbesPerCycle", type='int', default="2")
    parser.add_option("--L2CoalescingWindow", type='int', default="1")
    parser.add_option("--L2DisableCoalescing", action="store_true")

    #===================================================================
    # L3 TLBCoalescer Options
    #===================================================================

    parser.add_option("--L3ProbesPerCycle", type='int', default="2")
    parser.add_option("--L3CoalescingWindow", type='int', default="1")
    parser.add_option("--L3DisableCoalescing", action="store_true")
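
tlb_options() only registers knobs; a script is expected to build its own optparse parser, pass it through here (and through Ruby.define_options), and then hand the parsed values to GPUTLBConfig.config_tlb_hierarchy, as apu_se.py below does. A minimal, hypothetical stand-alone usage sketch (not part of the commit):

    # Hypothetical sketch; apu_se.py below is the real consumer of these options.
    import optparse
    import GPUTLBOptions

    parser = optparse.OptionParser()
    GPUTLBOptions.tlb_options(parser)
    (options, args) = parser.parse_args(["--TLB-config", "perCU",
                                         "--tot-L1TLB-size", "64"])
    print options.TLB_config, options.tot_L1TLB_size   # -> perCU 64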

configs/example/apu_se.py (new file, 499 lines)

@@ -0,0 +1,499 @@
#
# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# (BSD-3 redistribution conditions and disclaimer, identical to the
# GPUTLBConfig.py header above.)
#
# Author: Sooraj Puthoor
#

import optparse, os, re, sys
import math
import glob
import inspect

import m5
from m5.objects import *
from m5.defines import buildEnv
from m5.util import addToPath, fatal

addToPath('../ruby')
addToPath('../common')
addToPath('../topologies')

import Options
import Ruby
import Simulation
import GPUTLBOptions, GPUTLBConfig

########################## Script Options ########################
def setOption(parser, opt_str, value = 1):
    # check to make sure the option actually exists
    if not parser.has_option(opt_str):
        raise Exception("cannot find %s in list of possible options" % opt_str)

    opt = parser.get_option(opt_str)
    # set the value
    exec("parser.values.%s = %s" % (opt.dest, value))

def getOption(parser, opt_str):
    # check to make sure the option actually exists
    if not parser.has_option(opt_str):
        raise Exception("cannot find %s in list of possible options" % opt_str)

    opt = parser.get_option(opt_str)
    # get the value
    exec("return_value = parser.values.%s" % opt.dest)
    return return_value

# Adding script options
parser = optparse.OptionParser()
Options.addCommonOptions(parser)
Options.addSEOptions(parser)

parser.add_option("--cpu-only-mode", action="store_true", default=False,
                  help="APU mode. Used to take care of problems in "\
                       "Ruby.py while running APU protocols")
parser.add_option("-k", "--kernel-files",
                  help="file(s) containing GPU kernel code (colon separated)")
parser.add_option("-u", "--num-compute-units", type="int", default=1,
                  help="number of GPU compute units")
parser.add_option("--num-cp", type="int", default=0,
                  help="Number of GPU Command Processors (CP)")
parser.add_option("--benchmark-root", help="Root of benchmark directory tree")

# not super important now, but to avoid putting the number 4 everywhere, make
# it an option/knob
parser.add_option("--cu-per-sqc", type="int", default=4, help="number of CUs " \
                  "sharing an SQC (icache, and thus icache TLB)")
parser.add_option("--simds-per-cu", type="int", default=4, help="SIMD units " \
                  "per CU")
parser.add_option("--wf-size", type="int", default=64,
                  help="Wavefront size (in work items)")
parser.add_option("--sp-bypass-path-length", type="int", default=4, \
                  help="Number of stages of bypass path in vector ALU for Single Precision ops")
parser.add_option("--dp-bypass-path-length", type="int", default=4, \
                  help="Number of stages of bypass path in vector ALU for Double Precision ops")
# issue period per SIMD unit: number of cycles before issuing another vector
parser.add_option("--issue-period", type="int", default=4, \
                  help="Number of cycles per vector instruction issue period")
parser.add_option("--glbmem-wr-bus-width", type="int", default=32, \
                  help="VGPR to Coalescer (Global Memory) data bus width in bytes")
parser.add_option("--glbmem-rd-bus-width", type="int", default=32, \
                  help="Coalescer to VGPR (Global Memory) data bus width in bytes")
# Currently we only support 1 local memory pipe
parser.add_option("--shr-mem-pipes-per-cu", type="int", default=1, \
                  help="Number of Shared Memory pipelines per CU")
# Currently we only support 1 global memory pipe
parser.add_option("--glb-mem-pipes-per-cu", type="int", default=1, \
                  help="Number of Global Memory pipelines per CU")
parser.add_option("--wfs-per-simd", type="int", default=10, help="Number of " \
                  "WF slots per SIMD")

parser.add_option("--vreg-file-size", type="int", default=2048,
                  help="number of physical vector registers per SIMD")
parser.add_option("--bw-scalor", type="int", default=0,
                  help="bandwidth scalor for scalability analysis")
parser.add_option("--CPUClock", type="string", default="2GHz",
                  help="CPU clock")
parser.add_option("--GPUClock", type="string", default="1GHz",
                  help="GPU clock")
parser.add_option("--cpu-voltage", action="store", type="string",
                  default='1.0V',
                  help = """CPU voltage domain""")
parser.add_option("--gpu-voltage", action="store", type="string",
                  default='1.0V',
                  help = """GPU voltage domain""")
parser.add_option("--CUExecPolicy", type="string", default="OLDEST-FIRST",
                  help="WF exec policy (OLDEST-FIRST, ROUND-ROBIN)")
parser.add_option("--xact-cas-mode", action="store_true",
                  help="enable load_compare mode (transactional CAS)")
parser.add_option("--SegFaultDebug", action="store_true",
                  help="checks for GPU seg fault before TLB access")
parser.add_option("--FunctionalTLB", action="store_true",
                  help="Assumes TLB has no latency")
parser.add_option("--LocalMemBarrier", action="store_true",
                  help="Barrier does not wait for writethroughs to complete")
parser.add_option("--countPages", action="store_true",
                  help="Count Page Accesses and output in per-CU output files")
parser.add_option("--TLB-prefetch", type="int", help="prefetch depth for "\
                  "TLBs")
parser.add_option("--pf-type", type="string", help="type of prefetch: "\
                  "PF_CU, PF_WF, PF_PHASE, PF_STRIDE")
parser.add_option("--pf-stride", type="int", help="set prefetch stride")
parser.add_option("--numLdsBanks", type="int", default=32,
                  help="number of physical banks per LDS module")
parser.add_option("--ldsBankConflictPenalty", type="int", default=1,
                  help="number of cycles per LDS bank conflict")


Ruby.define_options(parser)

# add TLB options to the parser
GPUTLBOptions.tlb_options(parser)

(options, args) = parser.parse_args()

# The GPU cache coherence protocols only work with the backing store
setOption(parser, "--access-backing-store")

# if benchmark root is specified explicitly, that overrides the search path
if options.benchmark_root:
    benchmark_path = [options.benchmark_root]
else:
    # Set default benchmark search path to current dir
    benchmark_path = ['.']

########################## Sanity Check ########################

# Currently the gpu model requires ruby
if buildEnv['PROTOCOL'] == 'None':
    fatal("GPU model requires ruby")

# Currently the gpu model requires only timing or detailed CPU
if not (options.cpu_type == "timing" or
        options.cpu_type == "detailed"):
    fatal("GPU model requires timing or detailed CPU")

# This file can support multiple compute units
assert(options.num_compute_units >= 1)

# Currently, the sqc (I-Cache of GPU) is shared by multiple
# compute units (CUs). The protocol works just fine even if the
# sqc is not shared. Overriding this option here so that the user
# need not set it explicitly (assuming sharing the sqc is the
# common usage).
n_cu = options.num_compute_units
num_sqc = int(math.ceil(float(n_cu) / options.cu_per_sqc))
options.num_sqc = num_sqc # pass this to Ruby

########################## Creating the GPU system ########################
# shader is the GPU
shader = Shader(n_wf = options.wfs_per_simd,
                clk_domain = SrcClockDomain(
                    clock = options.GPUClock,
                    voltage_domain = VoltageDomain(
                        voltage = options.gpu_voltage)))

# GPU_RfO (Read For Ownership) implements an SC/TSO memory model.
# Other GPU protocols implement release consistency on the GPU side.
# So, all GPU protocols other than GPU_RfO should make their writes
# visible to global memory and should read from global memory at
# kernel boundaries. The pipeline initiates (or does not initiate)
# the acquire/release operation depending on this
# impl_kern_boundary_sync flag; when the flag is true, the pipeline
# initiates an acquire/release operation at kernel boundaries.
if buildEnv['PROTOCOL'] == 'GPU_RfO':
    shader.impl_kern_boundary_sync = False
else:
    shader.impl_kern_boundary_sync = True

# Switching off per-lane TLB by default
per_lane = False
if options.TLB_config == "perLane":
    per_lane = True

# List of compute units; one GPU can have multiple compute units
compute_units = []
for i in xrange(n_cu):
    compute_units.append(ComputeUnit(cu_id = i, perLaneTLB = per_lane,
                                     num_SIMDs = options.simds_per_cu,
                                     wfSize = options.wf_size,
                                     spbypass_pipe_length = options.sp_bypass_path_length,
                                     dpbypass_pipe_length = options.dp_bypass_path_length,
                                     issue_period = options.issue_period,
                                     coalescer_to_vrf_bus_width = \
                                         options.glbmem_rd_bus_width,
                                     vrf_to_coalescer_bus_width = \
                                         options.glbmem_wr_bus_width,
                                     num_global_mem_pipes = \
                                         options.glb_mem_pipes_per_cu,
                                     num_shared_mem_pipes = \
                                         options.shr_mem_pipes_per_cu,
                                     n_wf = options.wfs_per_simd,
                                     execPolicy = options.CUExecPolicy,
                                     xactCasMode = options.xact_cas_mode,
                                     debugSegFault = options.SegFaultDebug,
                                     functionalTLB = options.FunctionalTLB,
                                     localMemBarrier = options.LocalMemBarrier,
                                     countPages = options.countPages,
                                     localDataStore = \
                                         LdsState(banks = options.numLdsBanks,
                                                  bankConflictPenalty = \
                                                      options.ldsBankConflictPenalty)))
    wavefronts = []
    vrfs = []
    for j in xrange(options.simds_per_cu):
        for k in xrange(shader.n_wf):
            wavefronts.append(Wavefront(simdId = j, wf_slot_id = k))
        vrfs.append(VectorRegisterFile(simd_id=j,
                    num_regs_per_simd=options.vreg_file_size))
    compute_units[-1].wavefronts = wavefronts
    compute_units[-1].vector_register_file = vrfs
    if options.TLB_prefetch:
        compute_units[-1].prefetch_depth = options.TLB_prefetch
        compute_units[-1].prefetch_prev_type = options.pf_type

    # attach the LDS and the CU to the bus (actually a Bridge)
    compute_units[-1].ldsPort = compute_units[-1].ldsBus.slave
    compute_units[-1].ldsBus.master = compute_units[-1].localDataStore.cuPort

# Attach compute units to GPU
shader.CUs = compute_units

########################## Creating the CPU system ########################
options.num_cpus = options.num_cpus

# The shader core will be whatever is after the CPU cores are accounted for
shader_idx = options.num_cpus

# The command processor will be whatever is after the shader is accounted for
cp_idx = shader_idx + 1
cp_list = []

# List of CPUs
cpu_list = []

# We only support timing mode for shader and memory
shader.timing = True
mem_mode = 'timing'

# create the cpus
for i in range(options.num_cpus):
    cpu = None
    if options.cpu_type == "detailed":
        cpu = DerivO3CPU(cpu_id=i,
                         clk_domain = SrcClockDomain(
                             clock = options.CPUClock,
                             voltage_domain = VoltageDomain(
                                 voltage = options.cpu_voltage)))
    elif options.cpu_type == "timing":
        cpu = TimingSimpleCPU(cpu_id=i,
                              clk_domain = SrcClockDomain(
                                  clock = options.CPUClock,
                                  voltage_domain = VoltageDomain(
                                      voltage = options.cpu_voltage)))
    else:
        fatal("Atomic CPU not supported/tested")
    cpu_list.append(cpu)

# create the command processors
for i in xrange(options.num_cp):
    cp = None
    if options.cpu_type == "detailed":
        cp = DerivO3CPU(cpu_id = options.num_cpus + i,
                        clk_domain = SrcClockDomain(
                            clock = options.CPUClock,
                            voltage_domain = VoltageDomain(
                                voltage = options.cpu_voltage)))
    elif options.cpu_type == 'timing':
        cp = TimingSimpleCPU(cpu_id=options.num_cpus + i,
                             clk_domain = SrcClockDomain(
                                 clock = options.CPUClock,
                                 voltage_domain = VoltageDomain(
                                     voltage = options.cpu_voltage)))
    else:
        fatal("Atomic CPU not supported/tested")
    cp_list = cp_list + [cp]

########################## Creating the GPU dispatcher ########################
# Dispatcher dispatches work from host CPU to GPU
host_cpu = cpu_list[0]
dispatcher = GpuDispatcher()

########################## Create and assign the workload ########################
# Check for rel_path in elements of base_list using test, returning
# the first full path that satisfies test
def find_path(base_list, rel_path, test):
    for base in base_list:
        if not base:
            # base could be None if environment var not set
            continue
        full_path = os.path.join(base, rel_path)
        if test(full_path):
            return full_path
    fatal("%s not found in %s" % (rel_path, base_list))

def find_file(base_list, rel_path):
    return find_path(base_list, rel_path, os.path.isfile)

executable = find_path(benchmark_path, options.cmd, os.path.exists)
# it's common for a benchmark to be in a directory with the same
# name as the executable, so we handle that automatically
if os.path.isdir(executable):
    benchmark_path = [executable]
    executable = find_file(benchmark_path, options.cmd)
if options.kernel_files:
    kernel_files = [find_file(benchmark_path, f)
                    for f in options.kernel_files.split(':')]
else:
    # if kernel_files is not set, see if there's a unique .asm file
    # in the same directory as the executable
    kernel_path = os.path.dirname(executable)
    kernel_files = glob.glob(os.path.join(kernel_path, '*.asm'))
    if kernel_files:
        print "Using GPU kernel code file(s)", ",".join(kernel_files)
    else:
        fatal("Can't locate kernel code (.asm) in " + kernel_path)

# OpenCL driver
driver = ClDriver(filename="hsa", codefile=kernel_files)
for cpu in cpu_list:
    cpu.workload = LiveProcess(executable = executable,
                               cmd = [options.cmd] + options.options.split(),
                               drivers = [driver])
for cp in cp_list:
    cp.workload = host_cpu.workload

########################## Create the overall system ########################
# Full list of processing cores in the system. Note that
# dispatcher is also added to cpu_list although it is
# not a processing element
cpu_list = cpu_list + [shader] + cp_list + [dispatcher]

# creating the overall system
# notice the cpu list is explicitly added as a parameter to System
system = System(cpu = cpu_list,
                mem_ranges = [AddrRange(options.mem_size)],
                cache_line_size = options.cacheline_size,
                mem_mode = mem_mode)
system.voltage_domain = VoltageDomain(voltage = options.sys_voltage)
system.clk_domain = SrcClockDomain(clock = options.sys_clock,
                                   voltage_domain = system.voltage_domain)

# configure the TLB hierarchy
GPUTLBConfig.config_tlb_hierarchy(options, system, shader_idx)

# create Ruby system
system.piobus = IOXBar(width=32, response_latency=0,
                       frontend_latency=0, forward_latency=0)
Ruby.create_system(options, None, system)
system.ruby.clk_domain = SrcClockDomain(clock = options.ruby_clock,
                                        voltage_domain = system.voltage_domain)

# attach the CPU ports to Ruby
for i in range(options.num_cpus):
    ruby_port = system.ruby._cpu_ports[i]

    # Create interrupt controller
    system.cpu[i].createInterruptController()

    # Connect cache ports to Ruby
    system.cpu[i].icache_port = ruby_port.slave
    system.cpu[i].dcache_port = ruby_port.slave

    ruby_port.mem_master_port = system.piobus.slave
    if buildEnv['TARGET_ISA'] == "x86":
        system.cpu[i].interrupts[0].pio = system.piobus.master
        system.cpu[i].interrupts[0].int_master = system.piobus.slave
        system.cpu[i].interrupts[0].int_slave = system.piobus.master

# attach CU ports to Ruby
# Because of the peculiarities of the CP core, you may have 1 CPU but 2
# sequencers and thus 2 _cpu_ports created. Your GPUs shouldn't be
# hooked up until after the CP. To make this script generic, figure out
# the index as below, but note that this assumes there is one sequencer
# per compute unit and one sequencer per SQC for the math to work out
# correctly.
gpu_port_idx = len(system.ruby._cpu_ports) \
               - options.num_compute_units - options.num_sqc
gpu_port_idx = gpu_port_idx - options.num_cp * 2

wavefront_size = options.wf_size
for i in xrange(n_cu):
    # The pipeline issues wavefront_size number of uncoalesced requests
    # in one GPU issue cycle. Hence wavefront_size mem ports.
    for j in xrange(wavefront_size):
        system.cpu[shader_idx].CUs[i].memory_port[j] = \
            system.ruby._cpu_ports[gpu_port_idx].slave[j]
    gpu_port_idx += 1

for i in xrange(n_cu):
    if i > 0 and not i % options.cu_per_sqc:
        print "incrementing idx on ", i
        gpu_port_idx += 1
    system.cpu[shader_idx].CUs[i].sqc_port = \
        system.ruby._cpu_ports[gpu_port_idx].slave
gpu_port_idx = gpu_port_idx + 1

# attach CP ports to Ruby
for i in xrange(options.num_cp):
    system.cpu[cp_idx].createInterruptController()
    system.cpu[cp_idx].dcache_port = \
        system.ruby._cpu_ports[gpu_port_idx + i * 2].slave
    system.cpu[cp_idx].icache_port = \
        system.ruby._cpu_ports[gpu_port_idx + i * 2 + 1].slave
    system.cpu[cp_idx].interrupts[0].pio = system.piobus.master
    system.cpu[cp_idx].interrupts[0].int_master = system.piobus.slave
    system.cpu[cp_idx].interrupts[0].int_slave = system.piobus.master
    cp_idx = cp_idx + 1

# connect dispatcher to the system.piobus
dispatcher.pio = system.piobus.master
dispatcher.dma = system.piobus.slave

################# Connect the CPU and GPU via GPU Dispatcher ###################
# The CPU rings the GPU doorbell to notify it of a pending task using
# this interface, and the GPU uses the same interface to notify the CPU
# of task completion. The communication happens through the emulated
# driver.

# Note this implicit setting of the cpu_pointer, shader_pointer and tlb array
# parameters must be after the explicit setting of the System cpu list
shader.cpu_pointer = host_cpu
dispatcher.cpu = host_cpu
dispatcher.shader_pointer = shader
dispatcher.cl_driver = driver

########################## Start simulation ########################

root = Root(system=system, full_system=False)
m5.ticks.setGlobalFrequency('1THz')
if options.abs_max_tick:
    maxtick = options.abs_max_tick
else:
    maxtick = m5.MaxTick

# Benchmarks support work item annotations
Simulation.setWorkCountOptions(system, options)

# Checkpointing is not supported by APU model
if (options.checkpoint_dir != None or
    options.checkpoint_restore != None):
    fatal("Checkpointing not supported by apu model")

checkpoint_dir = None
m5.instantiate(checkpoint_dir)

# Map workload to this address space
host_cpu.workload[0].map(0x10000000, 0x200000000, 4096)

exit_event = m5.simulate(maxtick)
print "Ticks:", m5.curTick()
print 'Exiting because ', exit_event.getCause()
sys.exit(exit_event.getCode())
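
The Ruby sequencer-port bookkeeping above is easy to get wrong, so here is a worked example with illustrative numbers (not part of the commit). Suppose Ruby ends up with 7 _cpu_ports: 2 for the CPU cores, followed by 4 CU sequencers and 1 SQC sequencer, with no command processors:

    # Illustrative arithmetic for the port-index bookkeeping above.
    num_cpus, n_cu, num_sqc, num_cp = 2, 4, 1, 0
    total_ports = num_cpus + n_cu + num_sqc + 2 * num_cp   # 7 ports in this scenario

    gpu_port_idx = total_ports - n_cu - num_sqc            # 2: first CU sequencer
    gpu_port_idx = gpu_port_idx - num_cp * 2               # still 2, no CPs here

    cu_ports = range(gpu_port_idx, gpu_port_idx + n_cu)    # [2, 3, 4, 5]
    sqc_port = gpu_port_idx + n_cu                         # 6, shared by all 4 CUs
    print cu_ports, sqc_port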

configs/example/ruby_gpu_random_test.py (new file, 187 lines)

@@ -0,0 +1,187 @@
#
# Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# (BSD-3 redistribution conditions and disclaimer, identical to the
# GPUTLBConfig.py header above.)
#
# Author: Brad Beckmann
#

import m5
from m5.objects import *
from m5.defines import buildEnv
from m5.util import addToPath
import os, optparse, sys
addToPath('../common')
addToPath('../ruby')
addToPath('../topologies')

import Options
import Ruby

# Get paths we might need.
config_path = os.path.dirname(os.path.abspath(__file__))
config_root = os.path.dirname(config_path)
m5_root = os.path.dirname(config_root)

parser = optparse.OptionParser()
Options.addCommonOptions(parser)

parser.add_option("--maxloads", metavar="N", default=100,
                  help="Stop after N loads")
parser.add_option("-f", "--wakeup_freq", metavar="N", default=10,
                  help="Wakeup every N cycles")
parser.add_option("-u", "--num-compute-units", type="int", default=1,
                  help="number of compute units in the GPU")
parser.add_option("--numCPs", type="int", default=0,
                  help="Number of GPU Command Processors (CP)")
# not super important now, but to avoid putting the number 4 everywhere, make
# it an option/knob
parser.add_option("--cu-per-sqc", type="int", default=4, help="number of CUs " \
                  "sharing an SQC (icache, and thus icache TLB)")
parser.add_option("--simds-per-cu", type="int", default=4, help="SIMD units " \
                  "per CU")
parser.add_option("--wf-size", type="int", default=64,
                  help="Wavefront size (in work items)")
parser.add_option("--wfs-per-simd", type="int", default=10, help="Number of " \
                  "WF slots per SIMD")

#
# Add the ruby specific and protocol specific options
#
Ruby.define_options(parser)

execfile(os.path.join(config_root, "common", "Options.py"))

(options, args) = parser.parse_args()

#
# Set the default cache size and associativity to be very small to encourage
# races between requests and writebacks.
#
options.l1d_size="256B"
options.l1i_size="256B"
options.l2_size="512B"
options.l3_size="1kB"
options.l1d_assoc=2
options.l1i_assoc=2
options.l2_assoc=2
options.l3_assoc=2

# This file can support multiple compute units
assert(options.num_compute_units >= 1)
n_cu = options.num_compute_units

options.num_sqc = int((n_cu + options.cu_per_sqc - 1) / options.cu_per_sqc)

if args:
    print "Error: script doesn't take any positional arguments"
    sys.exit(1)

#
# Create the ruby random tester
#

# Check for the GPU_RfO protocol. Other GPU protocols are non-SC and will
# not work with the Ruby random tester.
assert(buildEnv['PROTOCOL'] == 'GPU_RfO')

# The GPU_RfO protocol does not support cache flushes
check_flush = False

tester = RubyTester(check_flush=check_flush,
                    checks_to_complete=options.maxloads,
                    wakeup_frequency=options.wakeup_freq,
                    deadlock_threshold=1000000)

#
# Create the M5 system. Note that the Memory Object isn't
# actually used by the rubytester, but is included to support the
# M5 memory size == Ruby memory size checks
#
system = System(cpu=tester, mem_ranges=[AddrRange(options.mem_size)])

# Create a top-level voltage domain and clock domain
system.voltage_domain = VoltageDomain(voltage=options.sys_voltage)

system.clk_domain = SrcClockDomain(clock=options.sys_clock,
                                   voltage_domain=system.voltage_domain)

Ruby.create_system(options, False, system)

# Create a separate clock domain for Ruby
system.ruby.clk_domain = SrcClockDomain(clock=options.ruby_clock,
                                        voltage_domain=system.voltage_domain)

tester.num_cpus = len(system.ruby._cpu_ports)

#
# The tester is most effective when randomization is turned on and
# artificial delay is randomly inserted on messages
#
system.ruby.randomization = True

for ruby_port in system.ruby._cpu_ports:

    #
    # Tie the ruby tester ports to the ruby cpu read and write ports
    #
    if ruby_port.support_data_reqs and ruby_port.support_inst_reqs:
        tester.cpuInstDataPort = ruby_port.slave
    elif ruby_port.support_data_reqs:
        tester.cpuDataPort = ruby_port.slave
    elif ruby_port.support_inst_reqs:
        tester.cpuInstPort = ruby_port.slave

    # Do not automatically retry stalled Ruby requests
    ruby_port.no_retry_on_stall = True

    #
    # Tell each sequencer this is the ruby tester so that it
    # copies the subblock back to the checker
    #
    ruby_port.using_ruby_tester = True

# -----------------------
# run simulation
# -----------------------

root = Root(full_system = False, system = system)
root.system.mem_mode = 'timing'

# Not much point in this being higher than the L1 latency
m5.ticks.setGlobalFrequency('1ns')

# instantiate configuration
m5.instantiate()

# simulate until program terminates
exit_event = m5.simulate(options.abs_max_tick)

print 'Exiting @ tick', m5.curTick(), 'because', exit_event.getCause()
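
The number of SQCs is the number of CUs divided by --cu-per-sqc, rounded up; the integer expression above is simply a ceiling division, and apu_se.py computes the same quantity with math.ceil. A quick check (illustrative values only):

    # Illustrative: ceiling division used for options.num_sqc above.
    import math
    n_cu, cu_per_sqc = 6, 4
    num_sqc = int((n_cu + cu_per_sqc - 1) / cu_per_sqc)            # 2
    assert num_sqc == int(math.ceil(float(n_cu) / cu_per_sqc))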

configs/ruby/AMD_Base_Constructor.py (new file, 134 lines)

@@ -0,0 +1,134 @@
#
# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# (BSD-3 redistribution conditions and disclaimer, identical to the
# GPUTLBConfig.py header above.)
#
# Author: Sooraj Puthoor, Lisa Hsu
#

import math
import m5
from m5.objects import *
from m5.defines import buildEnv
from m5.util import convert, panic
from CntrlBase import *
from Cluster import Cluster
from Ruby import send_evicts

#
# Note: the L1 Cache latency is only used by the sequencer on fast path hits
#
class L1Cache(RubyCache):
    latency = 1
    resourceStalls = False
    def create(self, size, assoc, options):
        self.size = MemorySize(size)
        self.assoc = assoc
        self.replacement_policy = PseudoLRUReplacementPolicy()

#
# Note: the L2 Cache latency is not currently used
#
class L2Cache(RubyCache):
    latency = 10
    resourceStalls = False
    def create(self, size, assoc, options):
        self.size = MemorySize(size)
        self.assoc = assoc
        self.replacement_policy = PseudoLRUReplacementPolicy()

class CPCntrl(AMD_Base_Controller, CntrlBase):

    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.cntrl_id = self.cntrlCount()

        self.L1Icache = L1Cache()
        self.L1Icache.create(options.l1i_size, options.l1i_assoc, options)
        self.L1D0cache = L1Cache()
        self.L1D0cache.create(options.l1d_size, options.l1d_assoc, options)
        self.L1D1cache = L1Cache()
        self.L1D1cache.create(options.l1d_size, options.l1d_assoc, options)
        self.L2cache = L2Cache()
        self.L2cache.create(options.l2_size, options.l2_assoc, options)

        self.sequencer = RubySequencer()
        self.sequencer.version = self.seqCount()
        self.sequencer.icache = self.L1Icache
        self.sequencer.dcache = self.L1D0cache
        self.sequencer.ruby_system = ruby_system
        self.sequencer.coreid = 0
        self.sequencer.is_cpu_sequencer = True

        self.sequencer1 = RubySequencer()
        self.sequencer1.version = self.seqCount()
        self.sequencer1.icache = self.L1Icache
        self.sequencer1.dcache = self.L1D1cache
        self.sequencer1.ruby_system = ruby_system
        self.sequencer1.coreid = 1
        self.sequencer1.is_cpu_sequencer = True

        self.issue_latency = options.cpu_to_dir_latency
        self.send_evictions = send_evicts(options)

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

def define_options(parser):
    parser.add_option("--cpu-to-dir-latency", type="int", default=15)

def construct(options, system, ruby_system):
    if (buildEnv['PROTOCOL'] != 'GPU_VIPER' and
        buildEnv['PROTOCOL'] != 'GPU_VIPER_Region' and
        buildEnv['PROTOCOL'] != 'GPU_VIPER_Baseline'):
        panic("This script requires VIPER based protocols \
                to be built.")
    cpu_sequencers = []
    cpuCluster = None
    cpuCluster = Cluster(name="CPU Cluster", extBW = 8, intBW=8) # 16 GB/s
    for i in xrange((options.num_cpus + 1) / 2):

        cp_cntrl = CPCntrl()
        cp_cntrl.create(options, ruby_system, system)

        # Connect the CP controllers to the ruby network
        cp_cntrl.requestFromCore = ruby_system.network.slave
        cp_cntrl.responseFromCore = ruby_system.network.slave
        cp_cntrl.unblockFromCore = ruby_system.network.slave
        cp_cntrl.probeToCore = ruby_system.network.master
        cp_cntrl.responseToCore = ruby_system.network.master

        exec("system.cp_cntrl%d = cp_cntrl" % i)
        #
        # Add controllers and sequencers to the appropriate lists
        #
        cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1])
        cpuCluster.add(cp_cntrl)
    return cpu_sequencers, cpuCluster
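
Each CPCntrl models a core pair: one controller fronts two cores, each with its own sequencer and split L1 data caches but a shared L1 I-cache and L2. That is why construct() iterates (num_cpus + 1) / 2 times and registers two sequencers per controller. A small illustration (not part of the commit):

    # Illustrative: controller/sequencer counts for the core-pair layout above.
    for num_cpus in (1, 2, 4, 8):
        num_cp_cntrls = (num_cpus + 1) / 2        # one CPCntrl per core pair
        num_sequencers = 2 * num_cp_cntrls        # sequencer + sequencer1 each
        print num_cpus, num_cp_cntrls, num_sequencers
    # -> 1 1 2 / 2 1 2 / 4 2 4 / 8 4 8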

configs/ruby/GPU_RfO.py (new file, 751 lines)

@@ -0,0 +1,751 @@
#
# Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# (BSD-3 redistribution conditions and disclaimer, identical to the
# GPUTLBConfig.py header above.)
#
|
||||
# Author: Lisa Hsu
|
||||
#
|
||||
|
||||
import math
|
||||
import m5
|
||||
from m5.objects import *
|
||||
from m5.defines import buildEnv
|
||||
from Ruby import create_topology
|
||||
from Ruby import send_evicts
|
||||
|
||||
from Cluster import Cluster
|
||||
from Crossbar import Crossbar
|
||||
|
||||
class CntrlBase:
|
||||
_seqs = 0
|
||||
@classmethod
|
||||
def seqCount(cls):
|
||||
# Use SeqCount not class since we need global count
|
||||
CntrlBase._seqs += 1
|
||||
return CntrlBase._seqs - 1
|
||||
|
||||
_cntrls = 0
|
||||
@classmethod
|
||||
def cntrlCount(cls):
|
||||
# Use CntlCount not class since we need global count
|
||||
CntrlBase._cntrls += 1
|
||||
return CntrlBase._cntrls - 1
|
||||
|
||||
_version = 0
|
||||
@classmethod
|
||||
def versionCount(cls):
|
||||
cls._version += 1 # Use count for this particular type
|
||||
return cls._version - 1
|
||||
|
||||
class TccDirCache(RubyCache):
|
||||
size = "512kB"
|
||||
assoc = 16
|
||||
resourceStalls = False
|
||||
def create(self, options):
|
||||
self.size = MemorySize(options.tcc_size)
|
||||
self.size.value += (options.num_compute_units *
|
||||
(MemorySize(options.tcp_size).value) *
|
||||
options.tcc_dir_factor) / long(options.num_tccs)
|
||||
self.start_index_bit = math.log(options.cacheline_size, 2) + \
|
||||
math.log(options.num_tccs, 2)
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy()
|
||||
|
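For illustration, the TccDirCache sizing above works out as follows under the option defaults given in define_options later in this file (256kB aggregate TCC, 16kB TCP, tcc_dir_factor 4, one TCC) together with an assumed, purely illustrative 4-CU GPU:

    # Illustrative sketch only; option values assumed, not part of this commit.
    tcc_size = 256 * 1024          # --tcc-size default, in bytes
    tcp_size = 16 * 1024           # --tcp-size default, in bytes
    tcc_dir_factor = 4             # --tcc-dir-factor default
    num_tccs = 1                   # --num-tccs default
    num_compute_units = 4          # assumed CU count for this example

    # The directory covers the TCC plus every TCP it fronts, scaled by
    # tcc_dir_factor and split across the TCC banks.
    dir_size = tcc_size + (num_compute_units * tcp_size * tcc_dir_factor) // num_tccs
    print(dir_size // 1024)        # 512 (kB), matching the 512kB class default above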
||||
class L1DCache(RubyCache):
|
||||
resourceStalls = False
|
||||
def create(self, options):
|
||||
self.size = MemorySize(options.l1d_size)
|
||||
self.assoc = options.l1d_assoc
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy()
|
||||
|
||||
class L1ICache(RubyCache):
|
||||
resourceStalls = False
|
||||
def create(self, options):
|
||||
self.size = MemorySize(options.l1i_size)
|
||||
self.assoc = options.l1i_assoc
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy()
|
||||
|
||||
class L2Cache(RubyCache):
|
||||
resourceStalls = False
|
||||
def create(self, options):
|
||||
self.size = MemorySize(options.l2_size)
|
||||
self.assoc = options.l2_assoc
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy()
|
||||
|
||||
|
||||
class CPCntrl(CorePair_Controller, CntrlBase):
|
||||
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
|
||||
self.L1Icache = L1ICache()
|
||||
self.L1Icache.create(options)
|
||||
self.L1D0cache = L1DCache()
|
||||
self.L1D0cache.create(options)
|
||||
self.L1D1cache = L1DCache()
|
||||
self.L1D1cache.create(options)
|
||||
self.L2cache = L2Cache()
|
||||
self.L2cache.create(options)
|
||||
|
||||
self.sequencer = RubySequencer()
|
||||
self.sequencer.icache_hit_latency = 2
|
||||
self.sequencer.dcache_hit_latency = 2
|
||||
self.sequencer.version = self.seqCount()
|
||||
self.sequencer.icache = self.L1Icache
|
||||
self.sequencer.dcache = self.L1D0cache
|
||||
self.sequencer.ruby_system = ruby_system
|
||||
self.sequencer.coreid = 0
|
||||
self.sequencer.is_cpu_sequencer = True
|
||||
|
||||
self.sequencer1 = RubySequencer()
|
||||
self.sequencer1.version = self.seqCount()
|
||||
self.sequencer1.icache = self.L1Icache
|
||||
self.sequencer1.dcache = self.L1D1cache
|
||||
self.sequencer1.icache_hit_latency = 2
|
||||
self.sequencer1.dcache_hit_latency = 2
|
||||
self.sequencer1.ruby_system = ruby_system
|
||||
self.sequencer1.coreid = 1
|
||||
self.sequencer1.is_cpu_sequencer = True
|
||||
|
||||
self.issue_latency = options.cpu_to_dir_latency
|
||||
self.send_evictions = send_evicts(options)
|
||||
|
||||
self.ruby_system = ruby_system
|
||||
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
class TCPCache(RubyCache):
|
||||
assoc = 8
|
||||
dataArrayBanks = 16
|
||||
tagArrayBanks = 4
|
||||
dataAccessLatency = 4
|
||||
tagAccessLatency = 1
|
||||
def create(self, options):
|
||||
self.size = MemorySize(options.tcp_size)
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy()
|
||||
|
||||
class TCPCntrl(TCP_Controller, CntrlBase):
|
||||
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
|
||||
self.L1cache = TCPCache(tagAccessLatency = options.TCP_latency)
|
||||
self.L1cache.resourceStalls = options.no_resource_stalls
|
||||
self.L1cache.create(options)
|
||||
|
||||
self.coalescer = RubyGPUCoalescer()
|
||||
self.coalescer.version = self.seqCount()
|
||||
self.coalescer.icache = self.L1cache
|
||||
self.coalescer.dcache = self.L1cache
|
||||
self.coalescer.ruby_system = ruby_system
|
||||
self.coalescer.support_inst_reqs = False
|
||||
self.coalescer.is_cpu_sequencer = False
|
||||
self.coalescer.max_outstanding_requests = options.simds_per_cu * \
|
||||
options.wfs_per_simd * \
|
||||
options.wf_size
|
||||
|
||||
self.sequencer = RubySequencer()
|
||||
self.sequencer.version = self.seqCount()
|
||||
self.sequencer.icache = self.L1cache
|
||||
self.sequencer.dcache = self.L1cache
|
||||
self.sequencer.ruby_system = ruby_system
|
||||
self.sequencer.is_cpu_sequencer = True
|
||||
|
||||
self.use_seq_not_coal = False
|
||||
|
||||
self.ruby_system = ruby_system
|
||||
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
def createCP(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
|
||||
self.L1cache = TCPCache(tagAccessLatency = options.TCP_latency)
|
||||
self.L1cache.resourceStalls = options.no_resource_stalls
|
||||
self.L1cache.create(options)
|
||||
|
||||
self.coalescer = RubyGPUCoalescer()
|
||||
self.coalescer.version = self.seqCount()
|
||||
self.coalescer.icache = self.L1cache
|
||||
self.coalescer.dcache = self.L1cache
|
||||
self.coalescer.ruby_system = ruby_system
|
||||
self.coalescer.support_inst_reqs = False
|
||||
self.coalescer.is_cpu_sequencer = False
|
||||
|
||||
self.sequencer = RubySequencer()
|
||||
self.sequencer.version = self.seqCount()
|
||||
self.sequencer.icache = self.L1cache
|
||||
self.sequencer.dcache = self.L1cache
|
||||
self.sequencer.ruby_system = ruby_system
|
||||
self.sequencer.is_cpu_sequencer = True
|
||||
|
||||
self.use_seq_not_coal = True
|
||||
|
||||
self.ruby_system = ruby_system
|
||||
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
class SQCCache(RubyCache):
|
||||
size = "32kB"
|
||||
assoc = 8
|
||||
dataArrayBanks = 16
|
||||
tagArrayBanks = 4
|
||||
dataAccessLatency = 4
|
||||
tagAccessLatency = 1
|
||||
def create(self, options):
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy()
|
||||
|
||||
class SQCCntrl(SQC_Controller, CntrlBase):
|
||||
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
|
||||
self.L1cache = SQCCache()
|
||||
self.L1cache.create(options)
|
||||
self.L1cache.resourceStalls = options.no_resource_stalls
|
||||
|
||||
self.sequencer = RubySequencer()
|
||||
|
||||
self.sequencer.version = self.seqCount()
|
||||
self.sequencer.icache = self.L1cache
|
||||
self.sequencer.dcache = self.L1cache
|
||||
self.sequencer.ruby_system = ruby_system
|
||||
self.sequencer.support_data_reqs = False
|
||||
self.sequencer.is_cpu_sequencer = False
|
||||
|
||||
self.ruby_system = ruby_system
|
||||
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
def createCP(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
|
||||
self.L1cache = SQCCache()
|
||||
self.L1cache.create(options)
|
||||
self.L1cache.resourceStalls = options.no_resource_stalls
|
||||
|
||||
self.sequencer = RubySequencer()
|
||||
|
||||
self.sequencer.version = self.seqCount()
|
||||
self.sequencer.icache = self.L1cache
|
||||
self.sequencer.dcache = self.L1cache
|
||||
self.sequencer.ruby_system = ruby_system
|
||||
self.sequencer.support_data_reqs = False
|
||||
|
||||
self.ruby_system = ruby_system
|
||||
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
|
||||
class TCC(RubyCache):
|
||||
assoc = 16
|
||||
dataAccessLatency = 8
|
||||
tagAccessLatency = 2
|
||||
resourceStalls = True
|
||||
def create(self, options):
|
||||
self.size = MemorySize(options.tcc_size)
|
||||
self.size = self.size / options.num_tccs
|
||||
self.dataArrayBanks = 256 / options.num_tccs #number of data banks
|
||||
self.tagArrayBanks = 256 / options.num_tccs #number of tag banks
|
||||
if ((self.size.value / long(self.assoc)) < 128):
|
||||
self.size.value = long(128 * self.assoc)
|
||||
self.start_index_bit = math.log(options.cacheline_size, 2) + \
|
||||
math.log(options.num_tccs, 2)
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy()
|
||||
|
||||
class TCCCntrl(TCC_Controller, CntrlBase):
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
self.L2cache = TCC()
|
||||
self.L2cache.create(options)
|
||||
self.l2_response_latency = options.TCC_latency
|
||||
|
||||
self.number_of_TBEs = 2048
|
||||
|
||||
self.ruby_system = ruby_system
|
||||
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
def connectWireBuffers(self, req_to_tccdir, resp_to_tccdir,
|
||||
tcc_unblock_to_tccdir, req_to_tcc,
|
||||
probe_to_tcc, resp_to_tcc):
|
||||
self.w_reqToTCCDir = req_to_tccdir
|
||||
self.w_respToTCCDir = resp_to_tccdir
|
||||
self.w_TCCUnblockToTCCDir = tcc_unblock_to_tccdir
|
||||
self.w_reqToTCC = req_to_tcc
|
||||
self.w_probeToTCC = probe_to_tcc
|
||||
self.w_respToTCC = resp_to_tcc
|
||||
|
||||
class TCCDirCntrl(TCCdir_Controller, CntrlBase):
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
|
||||
self.directory = TccDirCache()
|
||||
self.directory.create(options)
|
||||
|
||||
self.number_of_TBEs = 1024
|
||||
|
||||
self.ruby_system = ruby_system
|
||||
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
def connectWireBuffers(self, req_to_tccdir, resp_to_tccdir,
|
||||
tcc_unblock_to_tccdir, req_to_tcc,
|
||||
probe_to_tcc, resp_to_tcc):
|
||||
self.w_reqToTCCDir = req_to_tccdir
|
||||
self.w_respToTCCDir = resp_to_tccdir
|
||||
self.w_TCCUnblockToTCCDir = tcc_unblock_to_tccdir
|
||||
self.w_reqToTCC = req_to_tcc
|
||||
self.w_probeToTCC = probe_to_tcc
|
||||
self.w_respToTCC = resp_to_tcc
|
||||
|
||||
class L3Cache(RubyCache):
|
||||
assoc = 8
|
||||
dataArrayBanks = 256
|
||||
tagArrayBanks = 256
|
||||
|
||||
def create(self, options, ruby_system, system):
|
||||
self.size = MemorySize(options.l3_size)
|
||||
self.size.value /= options.num_dirs
|
||||
self.dataArrayBanks /= options.num_dirs
|
||||
self.tagArrayBanks /= options.num_dirs
|
||||
self.dataAccessLatency = options.l3_data_latency
|
||||
self.tagAccessLatency = options.l3_tag_latency
|
||||
self.resourceStalls = options.no_resource_stalls
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy()
|
||||
|
||||
class L3Cntrl(L3Cache_Controller, CntrlBase):
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
self.L3cache = L3Cache()
|
||||
self.L3cache.create(options, ruby_system, system)
|
||||
|
||||
self.l3_response_latency = max(self.L3cache.dataAccessLatency,
|
||||
self.L3cache.tagAccessLatency)
|
||||
self.ruby_system = ruby_system
|
||||
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
|
||||
req_to_l3, probe_to_l3, resp_to_l3):
|
||||
self.reqToDir = req_to_dir
|
||||
self.respToDir = resp_to_dir
|
||||
self.l3UnblockToDir = l3_unblock_to_dir
|
||||
self.reqToL3 = req_to_l3
|
||||
self.probeToL3 = probe_to_l3
|
||||
self.respToL3 = resp_to_l3
|
||||
|
||||
class DirMem(RubyDirectoryMemory, CntrlBase):
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
|
||||
phys_mem_size = AddrRange(options.mem_size).size()
|
||||
mem_module_size = phys_mem_size / options.num_dirs
|
||||
dir_size = MemorySize('0B')
|
||||
dir_size.value = mem_module_size
|
||||
self.size = dir_size
|
||||
|
||||
class DirCntrl(Directory_Controller, CntrlBase):
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
|
||||
self.response_latency = 30
|
||||
|
||||
self.directory = DirMem()
|
||||
self.directory.create(options, ruby_system, system)
|
||||
|
||||
self.L3CacheMemory = L3Cache()
|
||||
self.L3CacheMemory.create(options, ruby_system, system)
|
||||
|
||||
self.l3_hit_latency = max(self.L3CacheMemory.dataAccessLatency,
|
||||
self.L3CacheMemory.tagAccessLatency)
|
||||
|
||||
self.number_of_TBEs = options.num_tbes
|
||||
|
||||
self.ruby_system = ruby_system
|
||||
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
|
||||
req_to_l3, probe_to_l3, resp_to_l3):
|
||||
self.reqToDir = req_to_dir
|
||||
self.respToDir = resp_to_dir
|
||||
self.l3UnblockToDir = l3_unblock_to_dir
|
||||
self.reqToL3 = req_to_l3
|
||||
self.probeToL3 = probe_to_l3
|
||||
self.respToL3 = resp_to_l3
|
||||
|
||||
|
||||
|
||||
def define_options(parser):
|
||||
parser.add_option("--num-subcaches", type="int", default=4)
|
||||
parser.add_option("--l3-data-latency", type="int", default=20)
|
||||
parser.add_option("--l3-tag-latency", type="int", default=15)
|
||||
parser.add_option("--cpu-to-dir-latency", type="int", default=15)
|
||||
parser.add_option("--gpu-to-dir-latency", type="int", default=160)
|
||||
parser.add_option("--no-resource-stalls", action="store_false",
|
||||
default=True)
|
||||
parser.add_option("--num-tbes", type="int", default=256)
|
||||
parser.add_option("--l2-latency", type="int", default=50) # load to use
|
||||
parser.add_option("--num-tccs", type="int", default=1,
|
||||
help="number of TCC directories and banks in the GPU")
|
||||
parser.add_option("--TCP_latency", type="int", default=4,
|
||||
help="TCP latency")
|
||||
parser.add_option("--TCC_latency", type="int", default=16,
|
||||
help="TCC latency")
|
||||
parser.add_option("--tcc-size", type='string', default='256kB',
|
||||
help="agregate tcc size")
|
||||
parser.add_option("--tcp-size", type='string', default='16kB',
|
||||
help="tcp size")
|
||||
parser.add_option("--tcc-dir-factor", type='int', default=4,
|
||||
help="TCCdir size = factor *(TCPs + TCC)")
|
||||
|
||||
def create_system(options, full_system, system, dma_devices, ruby_system):
|
||||
if buildEnv['PROTOCOL'] != 'GPU_RfO':
|
||||
panic("This script requires the GPU_RfO protocol to be built.")
|
||||
|
||||
cpu_sequencers = []
|
||||
|
||||
#
|
||||
# The ruby network creation expects the list of nodes in the system to be
|
||||
# consistent with the NetDest list. Therefore the l1 controller nodes
|
||||
# must be listed before the directory nodes and directory nodes before
|
||||
# dma nodes, etc.
|
||||
#
|
||||
cp_cntrl_nodes = []
|
||||
tcp_cntrl_nodes = []
|
||||
sqc_cntrl_nodes = []
|
||||
tcc_cntrl_nodes = []
|
||||
tccdir_cntrl_nodes = []
|
||||
dir_cntrl_nodes = []
|
||||
l3_cntrl_nodes = []
|
||||
|
||||
#
|
||||
# Must create the individual controllers before the network to ensure the
|
||||
# controller constructors are called before the network constructor
|
||||
#
|
||||
|
||||
TCC_bits = int(math.log(options.num_tccs, 2))
|
||||
|
||||
# This is the base crossbar that connects the L3s, Dirs, and cpu/gpu
|
||||
# Clusters
|
||||
mainCluster = Cluster(extBW = 512, intBW = 512) # 1 TB/s
|
||||
for i in xrange(options.num_dirs):
|
||||
|
||||
dir_cntrl = DirCntrl(TCC_select_num_bits = TCC_bits)
|
||||
dir_cntrl.create(options, ruby_system, system)
|
||||
dir_cntrl.number_of_TBEs = 2560 * options.num_compute_units
|
||||
#Enough TBEs for all TCP TBEs
|
||||
|
||||
# Connect the Directory controller to the ruby network
|
||||
dir_cntrl.requestFromCores = MessageBuffer(ordered = True)
|
||||
dir_cntrl.requestFromCores.slave = ruby_system.network.master
|
||||
|
||||
dir_cntrl.responseFromCores = MessageBuffer()
|
||||
dir_cntrl.responseFromCores.slave = ruby_system.network.master
|
||||
|
||||
dir_cntrl.unblockFromCores = MessageBuffer()
|
||||
dir_cntrl.unblockFromCores.slave = ruby_system.network.master
|
||||
|
||||
dir_cntrl.probeToCore = MessageBuffer()
|
||||
dir_cntrl.probeToCore.master = ruby_system.network.slave
|
||||
|
||||
dir_cntrl.responseToCore = MessageBuffer()
|
||||
dir_cntrl.responseToCore.master = ruby_system.network.slave
|
||||
|
||||
dir_cntrl.triggerQueue = MessageBuffer(ordered = True)
|
||||
dir_cntrl.L3triggerQueue = MessageBuffer(ordered = True)
|
||||
dir_cntrl.responseFromMemory = MessageBuffer()
|
||||
|
||||
exec("system.dir_cntrl%d = dir_cntrl" % i)
|
||||
dir_cntrl_nodes.append(dir_cntrl)
|
||||
|
||||
mainCluster.add(dir_cntrl)
|
||||
|
||||
# For an odd number of CPUs, still create the right number of controllers
|
||||
cpuCluster = Cluster(extBW = 512, intBW = 512) # 1 TB/s
|
||||
for i in xrange((options.num_cpus + 1) / 2):
|
||||
|
||||
cp_cntrl = CPCntrl()
|
||||
cp_cntrl.create(options, ruby_system, system)
|
||||
|
||||
exec("system.cp_cntrl%d = cp_cntrl" % i)
|
||||
#
|
||||
# Add controllers and sequencers to the appropriate lists
|
||||
#
|
||||
cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1])
|
||||
|
||||
# Connect the CP controllers and the network
|
||||
cp_cntrl.requestFromCore = MessageBuffer()
|
||||
cp_cntrl.requestFromCore.master = ruby_system.network.slave
|
||||
|
||||
cp_cntrl.responseFromCore = MessageBuffer()
|
||||
cp_cntrl.responseFromCore.master = ruby_system.network.slave
|
||||
|
||||
cp_cntrl.unblockFromCore = MessageBuffer()
|
||||
cp_cntrl.unblockFromCore.master = ruby_system.network.slave
|
||||
|
||||
cp_cntrl.probeToCore = MessageBuffer()
|
||||
cp_cntrl.probeToCore.slave = ruby_system.network.master
|
||||
|
||||
cp_cntrl.responseToCore = MessageBuffer()
|
||||
cp_cntrl.responseToCore.slave = ruby_system.network.master
|
||||
|
||||
cp_cntrl.mandatoryQueue = MessageBuffer()
|
||||
cp_cntrl.triggerQueue = MessageBuffer(ordered = True)
|
||||
|
||||
cpuCluster.add(cp_cntrl)
|
||||
|
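Each CPCntrl models a core pair, so the loop above instantiates ceil(num_cpus / 2) controllers; with an odd CPU count the last pair presumably runs with one core slot unused. A minimal sketch of that count (Python 2 integer division, as in the script):

    # (num_cpus + 1) / 2 rounds up when dividing by two.
    for num_cpus in (1, 2, 3, 4, 5, 8):
        print(num_cpus, "CPUs ->", (num_cpus + 1) // 2, "CorePair controller(s)")
    # e.g. 5 CPUs -> 3 controllers, 8 CPUs -> 4 controllers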
||||
gpuCluster = Cluster(extBW = 512, intBW = 512) # 1 TB/s
|
||||
|
||||
for i in xrange(options.num_compute_units):
|
||||
|
||||
tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
|
||||
number_of_TBEs = 2560) # max outstanding requests
|
||||
tcp_cntrl.create(options, ruby_system, system)
|
||||
|
||||
exec("system.tcp_cntrl%d = tcp_cntrl" % i)
|
||||
#
|
||||
# Add controllers and sequencers to the appropriate lists
|
||||
#
|
||||
cpu_sequencers.append(tcp_cntrl.coalescer)
|
||||
tcp_cntrl_nodes.append(tcp_cntrl)
|
||||
|
||||
# Connect the TCP controller to the ruby network
|
||||
tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True)
|
||||
tcp_cntrl.requestFromTCP.master = ruby_system.network.slave
|
||||
|
||||
tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True)
|
||||
tcp_cntrl.responseFromTCP.master = ruby_system.network.slave
|
||||
|
||||
tcp_cntrl.unblockFromCore = MessageBuffer(ordered = True)
|
||||
tcp_cntrl.unblockFromCore.master = ruby_system.network.slave
|
||||
|
||||
tcp_cntrl.probeToTCP = MessageBuffer(ordered = True)
|
||||
tcp_cntrl.probeToTCP.slave = ruby_system.network.master
|
||||
|
||||
tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
|
||||
tcp_cntrl.responseToTCP.slave = ruby_system.network.master
|
||||
|
||||
tcp_cntrl.mandatoryQueue = MessageBuffer()
|
||||
|
||||
gpuCluster.add(tcp_cntrl)
|
||||
|
||||
for i in xrange(options.num_sqc):
|
||||
|
||||
sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
|
||||
sqc_cntrl.create(options, ruby_system, system)
|
||||
|
||||
exec("system.sqc_cntrl%d = sqc_cntrl" % i)
|
||||
#
|
||||
# Add controllers and sequencers to the appropriate lists
|
||||
#
|
||||
cpu_sequencers.append(sqc_cntrl.sequencer)
|
||||
|
||||
# Connect the SQC controller to the ruby network
|
||||
sqc_cntrl.requestFromSQC = MessageBuffer(ordered = True)
|
||||
sqc_cntrl.requestFromSQC.master = ruby_system.network.slave
|
||||
|
||||
sqc_cntrl.responseFromSQC = MessageBuffer(ordered = True)
|
||||
sqc_cntrl.responseFromSQC.master = ruby_system.network.slave
|
||||
|
||||
sqc_cntrl.unblockFromCore = MessageBuffer(ordered = True)
|
||||
sqc_cntrl.unblockFromCore.master = ruby_system.network.slave
|
||||
|
||||
sqc_cntrl.probeToSQC = MessageBuffer(ordered = True)
|
||||
sqc_cntrl.probeToSQC.slave = ruby_system.network.master
|
||||
|
||||
sqc_cntrl.responseToSQC = MessageBuffer(ordered = True)
|
||||
sqc_cntrl.responseToSQC.slave = ruby_system.network.master
|
||||
|
||||
sqc_cntrl.mandatoryQueue = MessageBuffer()
|
||||
|
||||
# SQC also in GPU cluster
|
||||
gpuCluster.add(sqc_cntrl)
|
||||
|
||||
for i in xrange(options.numCPs):
|
||||
|
||||
tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
|
||||
number_of_TBEs = 2560) # max outstanding requests
|
||||
tcp_cntrl.createCP(options, ruby_system, system)
|
||||
|
||||
exec("system.tcp_cntrl%d = tcp_cntrl" % (options.num_compute_units + i))
|
||||
#
|
||||
# Add controllers and sequencers to the appropriate lists
|
||||
#
|
||||
cpu_sequencers.append(tcp_cntrl.sequencer)
|
||||
tcp_cntrl_nodes.append(tcp_cntrl)
|
||||
|
||||
# Connect the TCP controller to the ruby network
|
||||
tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True)
|
||||
tcp_cntrl.requestFromTCP.master = ruby_system.network.slave
|
||||
|
||||
tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True)
|
||||
tcp_cntrl.responseFromTCP.master = ruby_system.network.slave
|
||||
|
||||
tcp_cntrl.unblockFromCore = MessageBuffer(ordered = True)
|
||||
tcp_cntrl.unblockFromCore.master = ruby_system.network.slave
|
||||
|
||||
tcp_cntrl.probeToTCP = MessageBuffer(ordered = True)
|
||||
tcp_cntrl.probeToTCP.slave = ruby_system.network.master
|
||||
|
||||
tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
|
||||
tcp_cntrl.responseToTCP.slave = ruby_system.network.master
|
||||
|
||||
tcp_cntrl.mandatoryQueue = MessageBuffer()
|
||||
|
||||
gpuCluster.add(tcp_cntrl)
|
||||
|
||||
sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
|
||||
sqc_cntrl.createCP(options, ruby_system, system)
|
||||
|
||||
exec("system.sqc_cntrl%d = sqc_cntrl" % (options.num_compute_units + i))
|
||||
#
|
||||
# Add controllers and sequencers to the appropriate lists
|
||||
#
|
||||
cpu_sequencers.append(sqc_cntrl.sequencer)
|
||||
|
||||
# Connect the SQC controller to the ruby network
|
||||
sqc_cntrl.requestFromSQC = MessageBuffer(ordered = True)
|
||||
sqc_cntrl.requestFromSQC.master = ruby_system.network.slave
|
||||
|
||||
sqc_cntrl.responseFromSQC = MessageBuffer(ordered = True)
|
||||
sqc_cntrl.responseFromSQC.master = ruby_system.network.slave
|
||||
|
||||
sqc_cntrl.unblockFromCore = MessageBuffer(ordered = True)
|
||||
sqc_cntrl.unblockFromCore.master = ruby_system.network.slave
|
||||
|
||||
sqc_cntrl.probeToSQC = MessageBuffer(ordered = True)
|
||||
sqc_cntrl.probeToSQC.slave = ruby_system.network.master
|
||||
|
||||
sqc_cntrl.responseToSQC = MessageBuffer(ordered = True)
|
||||
sqc_cntrl.responseToSQC.slave = ruby_system.network.master
|
||||
|
||||
sqc_cntrl.mandatoryQueue = MessageBuffer()
|
||||
|
||||
# SQC also in GPU cluster
|
||||
gpuCluster.add(sqc_cntrl)
|
||||
|
||||
for i in xrange(options.num_tccs):
|
||||
|
||||
tcc_cntrl = TCCCntrl(TCC_select_num_bits = TCC_bits,
|
||||
number_of_TBEs = options.num_compute_units * 2560)
|
||||
#Enough TBEs for all TCP TBEs
|
||||
tcc_cntrl.create(options, ruby_system, system)
|
||||
tcc_cntrl_nodes.append(tcc_cntrl)
|
||||
|
||||
tccdir_cntrl = TCCDirCntrl(TCC_select_num_bits = TCC_bits,
|
||||
number_of_TBEs = options.num_compute_units * 2560)
|
||||
#Enough TBEs for all TCP TBEs
|
||||
tccdir_cntrl.create(options, ruby_system, system)
|
||||
tccdir_cntrl_nodes.append(tccdir_cntrl)
|
||||
|
||||
exec("system.tcc_cntrl%d = tcc_cntrl" % i)
|
||||
exec("system.tccdir_cntrl%d = tccdir_cntrl" % i)
|
||||
|
||||
# connect all of the wire buffers between L3 and dirs up
|
||||
req_to_tccdir = RubyWireBuffer()
|
||||
resp_to_tccdir = RubyWireBuffer()
|
||||
tcc_unblock_to_tccdir = RubyWireBuffer()
|
||||
req_to_tcc = RubyWireBuffer()
|
||||
probe_to_tcc = RubyWireBuffer()
|
||||
resp_to_tcc = RubyWireBuffer()
|
||||
|
||||
tcc_cntrl.connectWireBuffers(req_to_tccdir, resp_to_tccdir,
|
||||
tcc_unblock_to_tccdir, req_to_tcc,
|
||||
probe_to_tcc, resp_to_tcc)
|
||||
tccdir_cntrl.connectWireBuffers(req_to_tccdir, resp_to_tccdir,
|
||||
tcc_unblock_to_tccdir, req_to_tcc,
|
||||
probe_to_tcc, resp_to_tcc)
|
||||
|
||||
# Connect the TCC controller to the ruby network
|
||||
tcc_cntrl.responseFromTCC = MessageBuffer(ordered = True)
|
||||
tcc_cntrl.responseFromTCC.master = ruby_system.network.slave
|
||||
|
||||
tcc_cntrl.responseToTCC = MessageBuffer(ordered = True)
|
||||
tcc_cntrl.responseToTCC.slave = ruby_system.network.master
|
||||
|
||||
# Connect the TCC Dir controller to the ruby network
|
||||
tccdir_cntrl.requestFromTCP = MessageBuffer(ordered = True)
|
||||
tccdir_cntrl.requestFromTCP.slave = ruby_system.network.master
|
||||
|
||||
tccdir_cntrl.responseFromTCP = MessageBuffer(ordered = True)
|
||||
tccdir_cntrl.responseFromTCP.slave = ruby_system.network.master
|
||||
|
||||
tccdir_cntrl.unblockFromTCP = MessageBuffer(ordered = True)
|
||||
tccdir_cntrl.unblockFromTCP.slave = ruby_system.network.master
|
||||
|
||||
tccdir_cntrl.probeToCore = MessageBuffer(ordered = True)
|
||||
tccdir_cntrl.probeToCore.master = ruby_system.network.slave
|
||||
|
||||
tccdir_cntrl.responseToCore = MessageBuffer(ordered = True)
|
||||
tccdir_cntrl.responseToCore.master = ruby_system.network.slave
|
||||
|
||||
tccdir_cntrl.probeFromNB = MessageBuffer()
|
||||
tccdir_cntrl.probeFromNB.slave = ruby_system.network.master
|
||||
|
||||
tccdir_cntrl.responseFromNB = MessageBuffer()
|
||||
tccdir_cntrl.responseFromNB.slave = ruby_system.network.master
|
||||
|
||||
tccdir_cntrl.requestToNB = MessageBuffer()
|
||||
tccdir_cntrl.requestToNB.master = ruby_system.network.slave
|
||||
|
||||
tccdir_cntrl.responseToNB = MessageBuffer()
|
||||
tccdir_cntrl.responseToNB.master = ruby_system.network.slave
|
||||
|
||||
tccdir_cntrl.unblockToNB = MessageBuffer()
|
||||
tccdir_cntrl.unblockToNB.master = ruby_system.network.slave
|
||||
|
||||
tccdir_cntrl.triggerQueue = MessageBuffer(ordered = True)
|
||||
|
||||
# TCC cntrls added to the GPU cluster
|
||||
gpuCluster.add(tcc_cntrl)
|
||||
gpuCluster.add(tccdir_cntrl)
|
||||
|
||||
# Assuming no DMA devices
|
||||
assert(len(dma_devices) == 0)
|
||||
|
||||
# Add cpu/gpu clusters to main cluster
|
||||
mainCluster.add(cpuCluster)
|
||||
mainCluster.add(gpuCluster)
|
||||
|
||||
ruby_system.network.number_of_virtual_networks = 10
|
||||
|
||||
return (cpu_sequencers, dir_cntrl_nodes, mainCluster)
|
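The 2560-entry TBE sizing used throughout this script (per TCP controller, and 2560 * num_compute_units at the TCC, TCC directory and directory) appears to mirror the coalescer's max_outstanding_requests of simds_per_cu * wfs_per_simd * wf_size set in TCPCntrl.create() above; with assumed GPU defaults of 4 SIMDs per CU, 10 wavefront slots per SIMD and 64-wide wavefronts that product is exactly 2560. A sketch of the arithmetic (the per-CU option values are assumptions, not taken from this file):

    # Assumed values for --simds-per-cu, --wfs-per-simd and --wf-size.
    simds_per_cu, wfs_per_simd, wf_size = 4, 10, 64
    per_cu_outstanding = simds_per_cu * wfs_per_simd * wf_size          # 2560
    num_compute_units = 4                                               # illustrative
    print(per_cu_outstanding, per_cu_outstanding * num_compute_units)   # 2560 10240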
674
configs/ruby/GPU_VIPER.py
Normal file
|
@@ -0,0 +1,674 @@
|
|||
#
|
||||
# Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# For use for simulation and test purposes only
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
# may be used to endorse or promote products derived from this software
|
||||
# without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# Author: Lisa Hsu
|
||||
#
|
||||
|
||||
import math
|
||||
import m5
|
||||
from m5.objects import *
|
||||
from m5.defines import buildEnv
|
||||
from Ruby import create_topology
|
||||
from Ruby import send_evicts
|
||||
|
||||
from Cluster import Cluster
|
||||
from Crossbar import Crossbar
|
||||
|
||||
class CntrlBase:
|
||||
_seqs = 0
|
||||
@classmethod
|
||||
def seqCount(cls):
|
||||
# Use SeqCount not class since we need global count
|
||||
CntrlBase._seqs += 1
|
||||
return CntrlBase._seqs - 1
|
||||
|
||||
_cntrls = 0
|
||||
@classmethod
|
||||
def cntrlCount(cls):
|
||||
# Use CntlCount not class since we need global count
|
||||
CntrlBase._cntrls += 1
|
||||
return CntrlBase._cntrls - 1
|
||||
|
||||
_version = 0
|
||||
@classmethod
|
||||
def versionCount(cls):
|
||||
cls._version += 1 # Use count for this particular type
|
||||
return cls._version - 1
|
||||
|
||||
class L1Cache(RubyCache):
|
||||
resourceStalls = False
|
||||
dataArrayBanks = 2
|
||||
tagArrayBanks = 2
|
||||
dataAccessLatency = 1
|
||||
tagAccessLatency = 1
|
||||
def create(self, size, assoc, options):
|
||||
self.size = MemorySize(size)
|
||||
self.assoc = assoc
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy()
|
||||
|
||||
class L2Cache(RubyCache):
|
||||
resourceStalls = False
|
||||
assoc = 16
|
||||
dataArrayBanks = 16
|
||||
tagArrayBanks = 16
|
||||
def create(self, size, assoc, options):
|
||||
self.size = MemorySize(size)
|
||||
self.assoc = assoc
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy()
|
||||
|
||||
class CPCntrl(CorePair_Controller, CntrlBase):
|
||||
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
|
||||
self.L1Icache = L1Cache()
|
||||
self.L1Icache.create(options.l1i_size, options.l1i_assoc, options)
|
||||
self.L1D0cache = L1Cache()
|
||||
self.L1D0cache.create(options.l1d_size, options.l1d_assoc, options)
|
||||
self.L1D1cache = L1Cache()
|
||||
self.L1D1cache.create(options.l1d_size, options.l1d_assoc, options)
|
||||
self.L2cache = L2Cache()
|
||||
self.L2cache.create(options.l2_size, options.l2_assoc, options)
|
||||
|
||||
self.sequencer = RubySequencer()
|
||||
self.sequencer.version = self.seqCount()
|
||||
self.sequencer.icache = self.L1Icache
|
||||
self.sequencer.dcache = self.L1D0cache
|
||||
self.sequencer.ruby_system = ruby_system
|
||||
self.sequencer.coreid = 0
|
||||
self.sequencer.is_cpu_sequencer = True
|
||||
|
||||
self.sequencer1 = RubySequencer()
|
||||
self.sequencer1.version = self.seqCount()
|
||||
self.sequencer1.icache = self.L1Icache
|
||||
self.sequencer1.dcache = self.L1D1cache
|
||||
self.sequencer1.ruby_system = ruby_system
|
||||
self.sequencer1.coreid = 1
|
||||
self.sequencer1.is_cpu_sequencer = True
|
||||
|
||||
self.issue_latency = options.cpu_to_dir_latency
|
||||
self.send_evictions = send_evicts(options)
|
||||
|
||||
self.ruby_system = ruby_system
|
||||
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
class TCPCache(RubyCache):
|
||||
size = "16kB"
|
||||
assoc = 16
|
||||
dataArrayBanks = 16 #number of data banks
|
||||
tagArrayBanks = 16 #number of tag banks
|
||||
dataAccessLatency = 4
|
||||
tagAccessLatency = 1
|
||||
def create(self, options):
|
||||
self.size = MemorySize(options.tcp_size)
|
||||
self.assoc = options.tcp_assoc
|
||||
self.resourceStalls = options.no_tcc_resource_stalls
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy()
|
||||
|
||||
class TCPCntrl(TCP_Controller, CntrlBase):
|
||||
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
|
||||
self.L1cache = TCPCache(tagAccessLatency = options.TCP_latency,
|
||||
dataAccessLatency = options.TCP_latency)
|
||||
self.L1cache.resourceStalls = options.no_resource_stalls
|
||||
self.L1cache.create(options)
|
||||
self.issue_latency = 1
|
||||
|
||||
self.coalescer = VIPERCoalescer()
|
||||
self.coalescer.version = self.seqCount()
|
||||
self.coalescer.icache = self.L1cache
|
||||
self.coalescer.dcache = self.L1cache
|
||||
self.coalescer.ruby_system = ruby_system
|
||||
self.coalescer.support_inst_reqs = False
|
||||
self.coalescer.is_cpu_sequencer = False
|
||||
|
||||
self.sequencer = RubySequencer()
|
||||
self.sequencer.version = self.seqCount()
|
||||
self.sequencer.icache = self.L1cache
|
||||
self.sequencer.dcache = self.L1cache
|
||||
self.sequencer.ruby_system = ruby_system
|
||||
self.sequencer.is_cpu_sequencer = True
|
||||
|
||||
self.use_seq_not_coal = False
|
||||
|
||||
self.ruby_system = ruby_system
|
||||
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
def createCP(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
|
||||
self.L1cache = TCPCache(tagAccessLatency = options.TCP_latency,
|
||||
dataAccessLatency = options.TCP_latency)
|
||||
self.L1cache.resourceStalls = options.no_resource_stalls
|
||||
self.L1cache.create(options)
|
||||
self.issue_latency = 1
|
||||
|
||||
self.coalescer = VIPERCoalescer()
|
||||
self.coalescer.version = self.seqCount()
|
||||
self.coalescer.icache = self.L1cache
|
||||
self.coalescer.dcache = self.L1cache
|
||||
self.coalescer.ruby_system = ruby_system
|
||||
self.coalescer.support_inst_reqs = False
|
||||
self.coalescer.is_cpu_sequencer = False
|
||||
|
||||
self.sequencer = RubySequencer()
|
||||
self.sequencer.version = self.seqCount()
|
||||
self.sequencer.icache = self.L1cache
|
||||
self.sequencer.dcache = self.L1cache
|
||||
self.sequencer.ruby_system = ruby_system
|
||||
self.sequencer.is_cpu_sequencer = True
|
||||
|
||||
self.use_seq_not_coal = True
|
||||
|
||||
self.ruby_system = ruby_system
|
||||
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
class SQCCache(RubyCache):
|
||||
dataArrayBanks = 8
|
||||
tagArrayBanks = 8
|
||||
dataAccessLatency = 1
|
||||
tagAccessLatency = 1
|
||||
|
||||
def create(self, options):
|
||||
self.size = MemorySize(options.sqc_size)
|
||||
self.assoc = options.sqc_assoc
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy()
|
||||
|
||||
class SQCCntrl(SQC_Controller, CntrlBase):
|
||||
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
|
||||
self.L1cache = SQCCache()
|
||||
self.L1cache.create(options)
|
||||
self.L1cache.resourceStalls = options.no_resource_stalls
|
||||
|
||||
self.sequencer = RubySequencer()
|
||||
|
||||
self.sequencer.version = self.seqCount()
|
||||
self.sequencer.icache = self.L1cache
|
||||
self.sequencer.dcache = self.L1cache
|
||||
self.sequencer.ruby_system = ruby_system
|
||||
self.sequencer.support_data_reqs = False
|
||||
self.sequencer.is_cpu_sequencer = False
|
||||
|
||||
self.ruby_system = ruby_system
|
||||
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
class TCC(RubyCache):
|
||||
size = MemorySize("256kB")
|
||||
assoc = 16
|
||||
dataAccessLatency = 8
|
||||
tagAccessLatency = 2
|
||||
resourceStalls = True
|
||||
def create(self, options):
|
||||
self.assoc = options.tcc_assoc
|
||||
if hasattr(options, 'bw_scalor') and options.bw_scalor > 0:
|
||||
s = options.num_compute_units
|
||||
tcc_size = s * 128
|
||||
tcc_size = str(tcc_size)+'kB'
|
||||
self.size = MemorySize(tcc_size)
|
||||
self.dataArrayBanks = 64
|
||||
self.tagArrayBanks = 64
|
||||
else:
|
||||
self.size = MemorySize(options.tcc_size)
|
||||
self.dataArrayBanks = 256 / options.num_tccs #number of data banks
|
||||
self.tagArrayBanks = 256 / options.num_tccs #number of tag banks
|
||||
self.size.value = self.size.value / options.num_tccs
|
||||
if ((self.size.value / long(self.assoc)) < 128):
|
||||
self.size.value = long(128 * self.assoc)
|
||||
self.start_index_bit = math.log(options.cacheline_size, 2) + \
|
||||
math.log(options.num_tccs, 2)
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy()
|
||||
|
||||
|
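For illustration, the two sizing paths in TCC.create() above behave as follows; the compute-unit count and bw_scalor here are assumed example values, while the 256kB size and single-TCC defaults come from define_options below:

    # Scaled path (bw_scalor > 0): the TCC grows with the CU count.
    num_compute_units = 8
    scaled_size_kB = num_compute_units * 128          # '1024kB', with 64 data/tag banks

    # Default path: a fixed aggregate size split across the TCC banks.
    tcc_size = 256 * 1024                             # --tcc-size default, in bytes
    num_tccs = 1                                      # --num-tccs default
    per_bank_bytes = tcc_size // num_tccs             # 256kB per TCC bank
    banks_per_tcc = 256 // num_tccs                   # data and tag banks each
    print(scaled_size_kB, per_bank_bytes // 1024, banks_per_tcc)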
||||
class TCCCntrl(TCC_Controller, CntrlBase):
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
self.L2cache = TCC()
|
||||
self.L2cache.create(options)
|
||||
self.L2cache.resourceStalls = options.no_tcc_resource_stalls
|
||||
|
||||
self.ruby_system = ruby_system
|
||||
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
class L3Cache(RubyCache):
|
||||
dataArrayBanks = 16
|
||||
tagArrayBanks = 16
|
||||
|
||||
def create(self, options, ruby_system, system):
|
||||
self.size = MemorySize(options.l3_size)
|
||||
self.size.value /= options.num_dirs
|
||||
self.assoc = options.l3_assoc
|
||||
self.dataArrayBanks /= options.num_dirs
|
||||
self.tagArrayBanks /= options.num_dirs
|
||||
self.dataAccessLatency = options.l3_data_latency
|
||||
self.tagAccessLatency = options.l3_tag_latency
|
||||
self.resourceStalls = False
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy()
|
||||
|
||||
class L3Cntrl(L3Cache_Controller, CntrlBase):
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
self.L3cache = L3Cache()
|
||||
self.L3cache.create(options, ruby_system, system)
|
||||
|
||||
self.l3_response_latency = max(self.L3cache.dataAccessLatency, self.L3cache.tagAccessLatency)
|
||||
self.ruby_system = ruby_system
|
||||
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
|
||||
req_to_l3, probe_to_l3, resp_to_l3):
|
||||
self.reqToDir = req_to_dir
|
||||
self.respToDir = resp_to_dir
|
||||
self.l3UnblockToDir = l3_unblock_to_dir
|
||||
self.reqToL3 = req_to_l3
|
||||
self.probeToL3 = probe_to_l3
|
||||
self.respToL3 = resp_to_l3
|
||||
|
||||
class DirMem(RubyDirectoryMemory, CntrlBase):
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
|
||||
phys_mem_size = AddrRange(options.mem_size).size()
|
||||
mem_module_size = phys_mem_size / options.num_dirs
|
||||
dir_size = MemorySize('0B')
|
||||
dir_size.value = mem_module_size
|
||||
self.size = dir_size
|
||||
|
||||
class DirCntrl(Directory_Controller, CntrlBase):
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
|
||||
self.response_latency = 30
|
||||
|
||||
self.directory = DirMem()
|
||||
self.directory.create(options, ruby_system, system)
|
||||
|
||||
self.L3CacheMemory = L3Cache()
|
||||
self.L3CacheMemory.create(options, ruby_system, system)
|
||||
|
||||
self.l3_hit_latency = max(self.L3CacheMemory.dataAccessLatency,
|
||||
self.L3CacheMemory.tagAccessLatency)
|
||||
|
||||
self.number_of_TBEs = options.num_tbes
|
||||
|
||||
self.ruby_system = ruby_system
|
||||
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
|
||||
req_to_l3, probe_to_l3, resp_to_l3):
|
||||
self.reqToDir = req_to_dir
|
||||
self.respToDir = resp_to_dir
|
||||
self.l3UnblockToDir = l3_unblock_to_dir
|
||||
self.reqToL3 = req_to_l3
|
||||
self.probeToL3 = probe_to_l3
|
||||
self.respToL3 = resp_to_l3
|
||||
|
||||
def define_options(parser):
|
||||
parser.add_option("--num-subcaches", type = "int", default = 4)
|
||||
parser.add_option("--l3-data-latency", type = "int", default = 20)
|
||||
parser.add_option("--l3-tag-latency", type = "int", default = 15)
|
||||
parser.add_option("--cpu-to-dir-latency", type = "int", default = 120)
|
||||
parser.add_option("--gpu-to-dir-latency", type = "int", default = 120)
|
||||
parser.add_option("--no-resource-stalls", action = "store_false",
|
||||
default = True)
|
||||
parser.add_option("--no-tcc-resource-stalls", action = "store_false",
|
||||
default = True)
|
||||
parser.add_option("--use-L3-on-WT", action = "store_true", default = False)
|
||||
parser.add_option("--num-tbes", type = "int", default = 256)
|
||||
parser.add_option("--l2-latency", type = "int", default = 50) # load to use
|
||||
parser.add_option("--num-tccs", type = "int", default = 1,
|
||||
help = "number of TCC banks in the GPU")
|
||||
parser.add_option("--sqc-size", type = 'string', default = '32kB',
|
||||
help = "SQC cache size")
|
||||
parser.add_option("--sqc-assoc", type = 'int', default = 8,
|
||||
help = "SQC cache assoc")
|
||||
parser.add_option("--WB_L1", action = "store_true", default = False,
|
||||
help = "writeback L1")
|
||||
parser.add_option("--WB_L2", action = "store_true", default = False,
|
||||
help = "writeback L2")
|
||||
parser.add_option("--TCP_latency", type = "int", default = 4,
|
||||
help = "TCP latency")
|
||||
parser.add_option("--TCC_latency", type = "int", default = 16,
|
||||
help = "TCC latency")
|
||||
parser.add_option("--tcc-size", type = 'string', default = '256kB',
|
||||
help = "agregate tcc size")
|
||||
parser.add_option("--tcc-assoc", type = 'int', default = 16,
|
||||
help = "tcc assoc")
|
||||
parser.add_option("--tcp-size", type = 'string', default = '16kB',
|
||||
help = "tcp size")
|
||||
parser.add_option("--tcp-assoc", type = 'int', default = 16,
|
||||
help = "tcp assoc")
|
||||
parser.add_option("--noL1", action = "store_true", default = False,
|
||||
help = "bypassL1")
|
||||
|
||||
def create_system(options, full_system, system, dma_devices, ruby_system):
|
||||
if buildEnv['PROTOCOL'] != 'GPU_VIPER':
|
||||
panic("This script requires the GPU_VIPER protocol to be built.")
|
||||
|
||||
cpu_sequencers = []
|
||||
|
||||
#
|
||||
# The ruby network creation expects the list of nodes in the system to be
|
||||
# consistent with the NetDest list. Therefore the l1 controller nodes
|
||||
# must be listed before the directory nodes and directory nodes before
|
||||
# dma nodes, etc.
|
||||
#
|
||||
cp_cntrl_nodes = []
|
||||
tcp_cntrl_nodes = []
|
||||
sqc_cntrl_nodes = []
|
||||
tcc_cntrl_nodes = []
|
||||
dir_cntrl_nodes = []
|
||||
l3_cntrl_nodes = []
|
||||
|
||||
#
|
||||
# Must create the individual controllers before the network to ensure the
|
||||
# controller constructors are called before the network constructor
|
||||
#
|
||||
|
||||
# For an odd number of CPUs, still create the right number of controllers
|
||||
TCC_bits = int(math.log(options.num_tccs, 2))
|
||||
|
||||
# This is the base crossbar that connects the L3s, Dirs, and cpu/gpu
|
||||
# Clusters
|
||||
crossbar_bw = None
|
||||
mainCluster = None
|
||||
if hasattr(options, 'bw_scalor') and options.bw_scalor > 0:
|
||||
#Assuming a 2GHz clock
|
||||
crossbar_bw = 16 * options.num_compute_units * options.bw_scalor
|
||||
mainCluster = Cluster(intBW=crossbar_bw)
|
||||
else:
|
||||
mainCluster = Cluster(intBW=8) # 16 GB/s
|
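The Cluster bandwidth factors appear to be expressed in bytes per cycle, so intBW = 8 matches the 16 GB/s noted in the comment at the 2 GHz clock the comment above assumes, and the scaled crossbar provides 16 bytes/cycle per compute unit times bw_scalor. A small sketch of that conversion (the clock and the example values are assumptions):

    # Assumed: bandwidth factors are bytes/cycle on a 2 GHz Ruby clock.
    clock_hz = 2e9
    def gbytes_per_s(bw_factor):
        return bw_factor * clock_hz / 1e9

    print(gbytes_per_s(8))             # 16.0, the default mainCluster intBW
    crossbar_bw = 16 * 8 * 1           # 8 CUs, bw_scalor = 1 (illustrative)
    print(gbytes_per_s(crossbar_bw))   # 256.0 GB/s for this example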
||||
for i in xrange(options.num_dirs):
|
||||
|
||||
dir_cntrl = DirCntrl(noTCCdir = True, TCC_select_num_bits = TCC_bits)
|
||||
dir_cntrl.create(options, ruby_system, system)
|
||||
dir_cntrl.number_of_TBEs = options.num_tbes
|
||||
dir_cntrl.useL3OnWT = options.use_L3_on_WT
|
||||
# the number_of_TBEs is inclusive of TBEs below
|
||||
|
||||
# Connect the Directory controller to the ruby network
|
||||
dir_cntrl.requestFromCores = MessageBuffer(ordered = True)
|
||||
dir_cntrl.requestFromCores.slave = ruby_system.network.master
|
||||
|
||||
dir_cntrl.responseFromCores = MessageBuffer()
|
||||
dir_cntrl.responseFromCores.slave = ruby_system.network.master
|
||||
|
||||
dir_cntrl.unblockFromCores = MessageBuffer()
|
||||
dir_cntrl.unblockFromCores.slave = ruby_system.network.master
|
||||
|
||||
dir_cntrl.probeToCore = MessageBuffer()
|
||||
dir_cntrl.probeToCore.master = ruby_system.network.slave
|
||||
|
||||
dir_cntrl.responseToCore = MessageBuffer()
|
||||
dir_cntrl.responseToCore.master = ruby_system.network.slave
|
||||
|
||||
dir_cntrl.triggerQueue = MessageBuffer(ordered = True)
|
||||
dir_cntrl.L3triggerQueue = MessageBuffer(ordered = True)
|
||||
dir_cntrl.responseFromMemory = MessageBuffer()
|
||||
|
||||
exec("ruby_system.dir_cntrl%d = dir_cntrl" % i)
|
||||
dir_cntrl_nodes.append(dir_cntrl)
|
||||
|
||||
mainCluster.add(dir_cntrl)
|
||||
|
||||
cpuCluster = None
|
||||
if hasattr(options, 'bw_scalor') and options.bw_scalor > 0:
|
||||
cpuCluster = Cluster(extBW = crossbar_bw, intBW = crossbar_bw)
|
||||
else:
|
||||
cpuCluster = Cluster(extBW = 8, intBW = 8) # 16 GB/s
|
||||
for i in xrange((options.num_cpus + 1) / 2):
|
||||
|
||||
cp_cntrl = CPCntrl()
|
||||
cp_cntrl.create(options, ruby_system, system)
|
||||
|
||||
exec("ruby_system.cp_cntrl%d = cp_cntrl" % i)
|
||||
#
|
||||
# Add controllers and sequencers to the appropriate lists
|
||||
#
|
||||
cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1])
|
||||
|
||||
# Connect the CP controllers and the network
|
||||
cp_cntrl.requestFromCore = MessageBuffer()
|
||||
cp_cntrl.requestFromCore.master = ruby_system.network.slave
|
||||
|
||||
cp_cntrl.responseFromCore = MessageBuffer()
|
||||
cp_cntrl.responseFromCore.master = ruby_system.network.slave
|
||||
|
||||
cp_cntrl.unblockFromCore = MessageBuffer()
|
||||
cp_cntrl.unblockFromCore.master = ruby_system.network.slave
|
||||
|
||||
cp_cntrl.probeToCore = MessageBuffer()
|
||||
cp_cntrl.probeToCore.slave = ruby_system.network.master
|
||||
|
||||
cp_cntrl.responseToCore = MessageBuffer()
|
||||
cp_cntrl.responseToCore.slave = ruby_system.network.master
|
||||
|
||||
cp_cntrl.mandatoryQueue = MessageBuffer()
|
||||
cp_cntrl.triggerQueue = MessageBuffer(ordered = True)
|
||||
|
||||
cpuCluster.add(cp_cntrl)
|
||||
|
||||
gpuCluster = None
|
||||
if hasattr(options, 'bw_scalor') and options.bw_scalor > 0:
|
||||
gpuCluster = Cluster(extBW = crossbar_bw, intBW = crossbar_bw)
|
||||
else:
|
||||
gpuCluster = Cluster(extBW = 8, intBW = 8) # 16 GB/s
|
||||
for i in xrange(options.num_compute_units):
|
||||
|
||||
tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
|
||||
issue_latency = 1,
|
||||
number_of_TBEs = 2560)
|
||||
# TBEs set to max outstanding requests
|
||||
tcp_cntrl.create(options, ruby_system, system)
|
||||
tcp_cntrl.WB = options.WB_L1
|
||||
tcp_cntrl.disableL1 = options.noL1
|
||||
tcp_cntrl.L1cache.tagAccessLatency = options.TCP_latency
|
||||
tcp_cntrl.L1cache.dataAccessLatency = options.TCP_latency
|
||||
|
||||
exec("ruby_system.tcp_cntrl%d = tcp_cntrl" % i)
|
||||
#
|
||||
# Add controllers and sequencers to the appropriate lists
|
||||
#
|
||||
cpu_sequencers.append(tcp_cntrl.coalescer)
|
||||
tcp_cntrl_nodes.append(tcp_cntrl)
|
||||
|
||||
# Connect the TCP controller to the ruby network
|
||||
tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True)
|
||||
tcp_cntrl.requestFromTCP.master = ruby_system.network.slave
|
||||
|
||||
tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True)
|
||||
tcp_cntrl.responseFromTCP.master = ruby_system.network.slave
|
||||
|
||||
tcp_cntrl.unblockFromCore = MessageBuffer()
|
||||
tcp_cntrl.unblockFromCore.master = ruby_system.network.slave
|
||||
|
||||
tcp_cntrl.probeToTCP = MessageBuffer(ordered = True)
|
||||
tcp_cntrl.probeToTCP.slave = ruby_system.network.master
|
||||
|
||||
tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
|
||||
tcp_cntrl.responseToTCP.slave = ruby_system.network.master
|
||||
|
||||
tcp_cntrl.mandatoryQueue = MessageBuffer()
|
||||
|
||||
gpuCluster.add(tcp_cntrl)
|
||||
|
||||
for i in xrange(options.num_sqc):
|
||||
|
||||
sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
|
||||
sqc_cntrl.create(options, ruby_system, system)
|
||||
|
||||
exec("ruby_system.sqc_cntrl%d = sqc_cntrl" % i)
|
||||
#
|
||||
# Add controllers and sequencers to the appropriate lists
|
||||
#
|
||||
cpu_sequencers.append(sqc_cntrl.sequencer)
|
||||
|
||||
# Connect the SQC controller to the ruby network
|
||||
sqc_cntrl.requestFromSQC = MessageBuffer(ordered = True)
|
||||
sqc_cntrl.requestFromSQC.master = ruby_system.network.slave
|
||||
|
||||
sqc_cntrl.probeToSQC = MessageBuffer(ordered = True)
|
||||
sqc_cntrl.probeToSQC.slave = ruby_system.network.master
|
||||
|
||||
sqc_cntrl.responseToSQC = MessageBuffer(ordered = True)
|
||||
sqc_cntrl.responseToSQC.slave = ruby_system.network.master
|
||||
|
||||
sqc_cntrl.mandatoryQueue = MessageBuffer()
|
||||
|
||||
# SQC also in GPU cluster
|
||||
gpuCluster.add(sqc_cntrl)
|
||||
|
||||
for i in xrange(options.numCPs):
|
||||
|
||||
tcp_ID = options.num_compute_units + i
|
||||
sqc_ID = options.num_sqc + i
|
||||
|
||||
tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
|
||||
issue_latency = 1,
|
||||
number_of_TBEs = 2560)
|
||||
# TBEs set to max outstanding requests
|
||||
tcp_cntrl.createCP(options, ruby_system, system)
|
||||
tcp_cntrl.WB = options.WB_L1
|
||||
tcp_cntrl.disableL1 = options.noL1
|
||||
tcp_cntrl.L1cache.tagAccessLatency = options.TCP_latency
|
||||
tcp_cntrl.L1cache.dataAccessLatency = options.TCP_latency
|
||||
|
||||
exec("ruby_system.tcp_cntrl%d = tcp_cntrl" % tcp_ID)
|
||||
#
|
||||
# Add controllers and sequencers to the appropriate lists
|
||||
#
|
||||
cpu_sequencers.append(tcp_cntrl.sequencer)
|
||||
tcp_cntrl_nodes.append(tcp_cntrl)
|
||||
|
||||
# Connect the CP (TCP) controllers to the ruby network
|
||||
tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True)
|
||||
tcp_cntrl.requestFromTCP.master = ruby_system.network.slave
|
||||
|
||||
tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True)
|
||||
tcp_cntrl.responseFromTCP.master = ruby_system.network.slave
|
||||
|
||||
tcp_cntrl.unblockFromCore = MessageBuffer(ordered = True)
|
||||
tcp_cntrl.unblockFromCore.master = ruby_system.network.slave
|
||||
|
||||
tcp_cntrl.probeToTCP = MessageBuffer(ordered = True)
|
||||
tcp_cntrl.probeToTCP.slave = ruby_system.network.master
|
||||
|
||||
tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
|
||||
tcp_cntrl.responseToTCP.slave = ruby_system.network.master
|
||||
|
||||
tcp_cntrl.mandatoryQueue = MessageBuffer()
|
||||
|
||||
gpuCluster.add(tcp_cntrl)
|
||||
|
||||
sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
|
||||
sqc_cntrl.create(options, ruby_system, system)
|
||||
|
||||
exec("ruby_system.sqc_cntrl%d = sqc_cntrl" % sqc_ID)
|
||||
#
|
||||
# Add controllers and sequencers to the appropriate lists
|
||||
#
|
||||
cpu_sequencers.append(sqc_cntrl.sequencer)
|
||||
|
||||
# SQC also in GPU cluster
|
||||
gpuCluster.add(sqc_cntrl)
|
||||
|
||||
for i in xrange(options.num_tccs):
|
||||
|
||||
tcc_cntrl = TCCCntrl(l2_response_latency = options.TCC_latency)
|
||||
tcc_cntrl.create(options, ruby_system, system)
|
||||
tcc_cntrl.l2_request_latency = options.gpu_to_dir_latency
|
||||
tcc_cntrl.l2_response_latency = options.TCC_latency
|
||||
tcc_cntrl_nodes.append(tcc_cntrl)
|
||||
tcc_cntrl.WB = options.WB_L2
|
||||
tcc_cntrl.number_of_TBEs = 2560 * options.num_compute_units
|
||||
# the number_of_TBEs is inclusive of TBEs below
|
||||
|
||||
# Connect the TCC controllers to the ruby network
|
||||
tcc_cntrl.requestFromTCP = MessageBuffer(ordered = True)
|
||||
tcc_cntrl.requestFromTCP.slave = ruby_system.network.master
|
||||
|
||||
tcc_cntrl.responseToCore = MessageBuffer(ordered = True)
|
||||
tcc_cntrl.responseToCore.master = ruby_system.network.slave
|
||||
|
||||
tcc_cntrl.probeFromNB = MessageBuffer()
|
||||
tcc_cntrl.probeFromNB.slave = ruby_system.network.master
|
||||
|
||||
tcc_cntrl.responseFromNB = MessageBuffer()
|
||||
tcc_cntrl.responseFromNB.slave = ruby_system.network.master
|
||||
|
||||
tcc_cntrl.requestToNB = MessageBuffer(ordered = True)
|
||||
tcc_cntrl.requestToNB.master = ruby_system.network.slave
|
||||
|
||||
tcc_cntrl.responseToNB = MessageBuffer()
|
||||
tcc_cntrl.responseToNB.master = ruby_system.network.slave
|
||||
|
||||
tcc_cntrl.unblockToNB = MessageBuffer()
|
||||
tcc_cntrl.unblockToNB.master = ruby_system.network.slave
|
||||
|
||||
tcc_cntrl.triggerQueue = MessageBuffer(ordered = True)
|
||||
|
||||
exec("ruby_system.tcc_cntrl%d = tcc_cntrl" % i)
|
||||
|
||||
# connect all of the wire buffers between L3 and dirs up
|
||||
# TCC cntrls added to the GPU cluster
|
||||
gpuCluster.add(tcc_cntrl)
|
||||
|
||||
# Assuming no DMA devices
|
||||
assert(len(dma_devices) == 0)
|
||||
|
||||
# Add cpu/gpu clusters to main cluster
|
||||
mainCluster.add(cpuCluster)
|
||||
mainCluster.add(gpuCluster)
|
||||
|
||||
ruby_system.network.number_of_virtual_networks = 10
|
||||
|
||||
return (cpu_sequencers, dir_cntrl_nodes, mainCluster)
|
588
configs/ruby/GPU_VIPER_Baseline.py
Normal file
|
@@ -0,0 +1,588 @@
|
|||
#
|
||||
# Copyright (c) 2015 Advanced Micro Devices, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# For use for simulation and test purposes only
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
# may be used to endorse or promote products derived from this software
|
||||
# without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# Author: Sooraj Puthoor
|
||||
#
|
||||
|
||||
import math
|
||||
import m5
|
||||
from m5.objects import *
|
||||
from m5.defines import buildEnv
|
||||
from Ruby import create_topology
|
||||
from Ruby import send_evicts
|
||||
|
||||
from Cluster import Cluster
|
||||
from Crossbar import Crossbar
|
||||
|
||||
class CntrlBase:
|
||||
_seqs = 0
|
||||
@classmethod
|
||||
def seqCount(cls):
|
||||
# Use SeqCount not class since we need global count
|
||||
CntrlBase._seqs += 1
|
||||
return CntrlBase._seqs - 1
|
||||
|
||||
_cntrls = 0
|
||||
@classmethod
|
||||
def cntrlCount(cls):
|
||||
# Use CntlCount not class since we need global count
|
||||
CntrlBase._cntrls += 1
|
||||
return CntrlBase._cntrls - 1
|
||||
|
||||
_version = 0
|
||||
@classmethod
|
||||
def versionCount(cls):
|
||||
cls._version += 1 # Use count for this particular type
|
||||
return cls._version - 1
|
||||
|
||||
class L1Cache(RubyCache):
|
||||
resourceStalls = False
|
||||
dataArrayBanks = 2
|
||||
tagArrayBanks = 2
|
||||
dataAccessLatency = 1
|
||||
tagAccessLatency = 1
|
||||
def create(self, size, assoc, options):
|
||||
self.size = MemorySize(size)
|
||||
self.assoc = assoc
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy()
|
||||
|
||||
class L2Cache(RubyCache):
|
||||
resourceStalls = False
|
||||
assoc = 16
|
||||
dataArrayBanks = 16
|
||||
tagArrayBanks = 16
|
||||
def create(self, size, assoc, options):
|
||||
self.size = MemorySize(size)
|
||||
self.assoc = assoc
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy()
|
||||
|
||||
class CPCntrl(CorePair_Controller, CntrlBase):
|
||||
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
|
||||
self.L1Icache = L1Cache()
|
||||
self.L1Icache.create(options.l1i_size, options.l1i_assoc, options)
|
||||
self.L1D0cache = L1Cache()
|
||||
self.L1D0cache.create(options.l1d_size, options.l1d_assoc, options)
|
||||
self.L1D1cache = L1Cache()
|
||||
self.L1D1cache.create(options.l1d_size, options.l1d_assoc, options)
|
||||
self.L2cache = L2Cache()
|
||||
self.L2cache.create(options.l2_size, options.l2_assoc, options)
|
||||
|
||||
self.sequencer = RubySequencer()
|
||||
self.sequencer.version = self.seqCount()
|
||||
self.sequencer.icache = self.L1Icache
|
||||
self.sequencer.dcache = self.L1D0cache
|
||||
self.sequencer.ruby_system = ruby_system
|
||||
self.sequencer.coreid = 0
|
||||
self.sequencer.is_cpu_sequencer = True
|
||||
|
||||
self.sequencer1 = RubySequencer()
|
||||
self.sequencer1.version = self.seqCount()
|
||||
self.sequencer1.icache = self.L1Icache
|
||||
self.sequencer1.dcache = self.L1D1cache
|
||||
self.sequencer1.ruby_system = ruby_system
|
||||
self.sequencer1.coreid = 1
|
||||
self.sequencer1.is_cpu_sequencer = True
|
||||
|
||||
self.issue_latency = options.cpu_to_dir_latency
|
||||
self.send_evictions = send_evicts(options)
|
||||
|
||||
self.ruby_system = ruby_system
|
||||
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
class TCPCache(RubyCache):
|
||||
size = "16kB"
|
||||
assoc = 16
|
||||
dataArrayBanks = 16
|
||||
tagArrayBanks = 16
|
||||
dataAccessLatency = 4
|
||||
tagAccessLatency = 1
|
||||
def create(self, options):
|
||||
self.size = MemorySize(options.tcp_size)
|
||||
self.dataArrayBanks = 16
|
||||
self.tagArrayBanks = 16
|
||||
self.dataAccessLatency = 4
|
||||
self.tagAccessLatency = 1
|
||||
self.resourceStalls = options.no_tcc_resource_stalls
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy()
|
||||
|
||||
class TCPCntrl(TCP_Controller, CntrlBase):
|
||||
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
self.L1cache = TCPCache()
|
||||
self.L1cache.create(options)
|
||||
self.issue_latency = 1
|
||||
|
||||
self.coalescer = VIPERCoalescer()
|
||||
self.coalescer.version = self.seqCount()
|
||||
self.coalescer.icache = self.L1cache
|
||||
self.coalescer.dcache = self.L1cache
|
||||
self.coalescer.ruby_system = ruby_system
|
||||
self.coalescer.support_inst_reqs = False
|
||||
self.coalescer.is_cpu_sequencer = False
|
||||
|
||||
self.sequencer = RubySequencer()
|
||||
self.sequencer.version = self.seqCount()
|
||||
self.sequencer.icache = self.L1cache
|
||||
self.sequencer.dcache = self.L1cache
|
||||
self.sequencer.ruby_system = ruby_system
|
||||
self.sequencer.is_cpu_sequencer = True
|
||||
|
||||
self.use_seq_not_coal = False
|
||||
|
||||
self.ruby_system = ruby_system
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
class SQCCache(RubyCache):
|
||||
dataArrayBanks = 8
|
||||
tagArrayBanks = 8
|
||||
dataAccessLatency = 1
|
||||
tagAccessLatency = 1
|
||||
|
||||
def create(self, options):
|
||||
self.size = MemorySize(options.sqc_size)
|
||||
self.assoc = options.sqc_assoc
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy()
|
||||
|
||||
class SQCCntrl(SQC_Controller, CntrlBase):
|
||||
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
self.L1cache = SQCCache()
|
||||
self.L1cache.create(options)
|
||||
self.L1cache.resourceStalls = False
|
||||
self.sequencer = RubySequencer()
|
||||
self.sequencer.version = self.seqCount()
|
||||
self.sequencer.icache = self.L1cache
|
||||
self.sequencer.dcache = self.L1cache
|
||||
self.sequencer.ruby_system = ruby_system
|
||||
self.sequencer.support_data_reqs = False
|
||||
self.sequencer.is_cpu_sequencer = False
|
||||
self.ruby_system = ruby_system
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
class TCC(RubyCache):
|
||||
size = MemorySize("256kB")
|
||||
assoc = 16
|
||||
dataAccessLatency = 8
|
||||
tagAccessLatency = 2
|
||||
resourceStalls = True
|
||||
def create(self, options):
|
||||
self.assoc = options.tcc_assoc
|
||||
if hasattr(options, 'bw_scalor') and options.bw_scalor > 0:
|
||||
s = options.num_compute_units
|
||||
tcc_size = s * 128
|
||||
tcc_size = str(tcc_size)+'kB'
|
||||
self.size = MemorySize(tcc_size)
|
||||
self.dataArrayBanks = 64
|
||||
self.tagArrayBanks = 64
|
||||
else:
|
||||
self.size = MemorySize(options.tcc_size)
|
||||
self.dataArrayBanks = 256 / options.num_tccs #number of data banks
|
||||
self.tagArrayBanks = 256 / options.num_tccs #number of tag banks
|
||||
self.size.value = self.size.value / options.num_tccs
|
||||
if ((self.size.value / long(self.assoc)) < 128):
|
||||
self.size.value = long(128 * self.assoc)
|
||||
self.start_index_bit = math.log(options.cacheline_size, 2) + \
|
||||
math.log(options.num_tccs, 2)
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy()
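
To make the sizing arithmetic in TCC.create() concrete, here is a small worked sketch of the non-bw_scalor path using the defaults registered in define_options() below (--tcc-size 2MB, --tcc-assoc 16, --num-tccs 1) and an assumed 64B cache line; the numbers are illustrative only:

import math

tcc_size_bytes, assoc, num_tccs, line_bytes = 2 * 1024 * 1024, 16, 1, 64

data_banks = 256 // num_tccs                 # 256 data banks per TCC instance
tag_banks = 256 // num_tccs                  # 256 tag banks per TCC instance
size_per_tcc = tcc_size_bytes // num_tccs    # 2 MiB stays whole with one TCC
if size_per_tcc // assoc < 128:              # tiny-cache guard from create()
    size_per_tcc = 128 * assoc
start_index_bit = int(math.log(line_bytes, 2) + math.log(num_tccs, 2))  # 6 + 0
print(data_banks, tag_banks, size_per_tcc, start_index_bit)  # 256 256 2097152 6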
class TCCCntrl(TCC_Controller, CntrlBase):
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
self.L2cache = TCC()
|
||||
self.L2cache.create(options)
|
||||
self.ruby_system = ruby_system
|
||||
self.L2cache.resourceStalls = options.no_tcc_resource_stalls
|
||||
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
class L3Cache(RubyCache):
|
||||
dataArrayBanks = 16
|
||||
tagArrayBanks = 16
|
||||
|
||||
def create(self, options, ruby_system, system):
|
||||
self.size = MemorySize(options.l3_size)
|
||||
self.size.value /= options.num_dirs
|
||||
self.assoc = options.l3_assoc
|
||||
self.dataArrayBanks /= options.num_dirs
|
||||
self.tagArrayBanks /= options.num_dirs
|
||||
self.dataArrayBanks /= options.num_dirs
|
||||
self.tagArrayBanks /= options.num_dirs
|
||||
self.dataAccessLatency = options.l3_data_latency
|
||||
self.tagAccessLatency = options.l3_tag_latency
|
||||
self.resourceStalls = False
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy()
|
||||
|
||||
class ProbeFilter(RubyCache):
|
||||
size = "4MB"
|
||||
assoc = 16
|
||||
dataArrayBanks = 256
|
||||
tagArrayBanks = 256
|
||||
|
||||
def create(self, options, ruby_system, system):
|
||||
self.block_size = "%dB" % (64 * options.blocks_per_region)
|
||||
self.size = options.region_dir_entries * \
|
||||
self.block_size * options.num_compute_units
|
||||
self.assoc = 8
|
||||
self.tagArrayBanks = 8
|
||||
self.tagAccessLatency = options.dir_tag_latency
|
||||
self.dataAccessLatency = 1
|
||||
self.resourceStalls = options.no_resource_stalls
|
||||
self.start_index_bit = 6 + int(math.log(options.blocks_per_region, 2))
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy()
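
The probe filter capacity above is derived rather than fixed; with the defaults from define_options() below (--region-dir-entries 8192, --blocks-per-region 1, i.e. 64B regions) and an assumed 8 compute units, the arithmetic works out as in this illustrative sketch:

import math

region_dir_entries, blocks_per_region, num_compute_units = 8192, 1, 8  # CU count assumed

block_size_bytes = 64 * blocks_per_region                  # 64 B per region
pf_bytes = region_dir_entries * block_size_bytes * num_compute_units   # 4 MiB
start_index_bit = 6 + int(math.log(blocks_per_region, 2))  # 6 for 64B regions
print(pf_bytes, start_index_bit)                            # 4194304 6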
class L3Cntrl(L3Cache_Controller, CntrlBase):
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
self.L3cache = L3Cache()
|
||||
self.L3cache.create(options, ruby_system, system)
|
||||
self.l3_response_latency = \
|
||||
max(self.L3cache.dataAccessLatency, self.L3cache.tagAccessLatency)
|
||||
self.ruby_system = ruby_system
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
|
||||
req_to_l3, probe_to_l3, resp_to_l3):
|
||||
self.reqToDir = req_to_dir
|
||||
self.respToDir = resp_to_dir
|
||||
self.l3UnblockToDir = l3_unblock_to_dir
|
||||
self.reqToL3 = req_to_l3
|
||||
self.probeToL3 = probe_to_l3
|
||||
self.respToL3 = resp_to_l3
|
||||
|
||||
class DirMem(RubyDirectoryMemory, CntrlBase):
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
|
||||
phys_mem_size = AddrRange(options.mem_size).size()
|
||||
mem_module_size = phys_mem_size / options.num_dirs
|
||||
dir_size = MemorySize('0B')
|
||||
dir_size.value = mem_module_size
|
||||
self.size = dir_size
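
DirMem.create() above simply splits the physical address space evenly across the directories; a quick illustrative check of that arithmetic with assumed values (a 512MB --mem-size and --num-dirs 2 are not defaults of this file):

phys_mem_size = 512 * 1024 * 1024          # assumed --mem-size of 512MB
num_dirs = 2                               # assumed --num-dirs
mem_module_size = phys_mem_size // num_dirs
assert mem_module_size == 256 * 1024 * 1024   # each directory covers 256MB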
class DirCntrl(Directory_Controller, CntrlBase):
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
self.response_latency = 30
|
||||
self.directory = DirMem()
|
||||
self.directory.create(options, ruby_system, system)
|
||||
self.L3CacheMemory = L3Cache()
|
||||
self.L3CacheMemory.create(options, ruby_system, system)
|
||||
self.ProbeFilterMemory = ProbeFilter()
|
||||
self.ProbeFilterMemory.create(options, ruby_system, system)
|
||||
self.l3_hit_latency = \
|
||||
max(self.L3CacheMemory.dataAccessLatency,
|
||||
self.L3CacheMemory.tagAccessLatency)
|
||||
|
||||
self.ruby_system = ruby_system
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
|
||||
req_to_l3, probe_to_l3, resp_to_l3):
|
||||
self.reqToDir = req_to_dir
|
||||
self.respToDir = resp_to_dir
|
||||
self.l3UnblockToDir = l3_unblock_to_dir
|
||||
self.reqToL3 = req_to_l3
|
||||
self.probeToL3 = probe_to_l3
|
||||
self.respToL3 = resp_to_l3
|
||||
|
||||
def define_options(parser):
|
||||
parser.add_option("--num-subcaches", type = "int", default = 4)
|
||||
parser.add_option("--l3-data-latency", type = "int", default = 20)
|
||||
parser.add_option("--l3-tag-latency", type = "int", default = 15)
|
||||
parser.add_option("--cpu-to-dir-latency", type = "int", default = 120)
|
||||
parser.add_option("--gpu-to-dir-latency", type = "int", default = 120)
|
||||
parser.add_option("--no-resource-stalls", action = "store_false",
|
||||
default = True)
|
||||
parser.add_option("--no-tcc-resource-stalls", action = "store_false",
|
||||
default = True)
|
||||
parser.add_option("--num-tbes", type = "int", default = 2560)
|
||||
parser.add_option("--l2-latency", type = "int", default = 50) # load to use
|
||||
parser.add_option("--num-tccs", type = "int", default = 1,
|
||||
help = "number of TCC banks in the GPU")
|
||||
parser.add_option("--sqc-size", type = 'string', default = '32kB',
|
||||
help = "SQC cache size")
|
||||
parser.add_option("--sqc-assoc", type = 'int', default = 8,
|
||||
help = "SQC cache assoc")
|
||||
parser.add_option("--region-dir-entries", type = "int", default = 8192)
|
||||
parser.add_option("--dir-tag-latency", type = "int", default = 8)
|
||||
parser.add_option("--dir-tag-banks", type = "int", default = 4)
|
||||
parser.add_option("--blocks-per-region", type = "int", default = 1)
|
||||
parser.add_option("--use-L3-on-WT", action = "store_true", default = False)
|
||||
parser.add_option("--nonInclusiveDir", action = "store_true",
|
||||
default = False)
|
||||
parser.add_option("--WB_L1", action = "store_true",
|
||||
default = False, help = "writeback L2")
|
||||
parser.add_option("--WB_L2", action = "store_true",
|
||||
default = False, help = "writeback L2")
|
||||
parser.add_option("--TCP_latency", type = "int",
|
||||
default = 4, help = "TCP latency")
|
||||
parser.add_option("--TCC_latency", type = "int",
|
||||
default = 16, help = "TCC latency")
|
||||
parser.add_option("--tcc-size", type = 'string', default = '2MB',
|
||||
help = "agregate tcc size")
|
||||
parser.add_option("--tcc-assoc", type = 'int', default = 16,
|
||||
help = "tcc assoc")
|
||||
parser.add_option("--tcp-size", type = 'string', default = '16kB',
|
||||
help = "tcp size")
|
||||
parser.add_option("--sampler-sets", type = "int", default = 1024)
|
||||
parser.add_option("--sampler-assoc", type = "int", default = 16)
|
||||
parser.add_option("--sampler-counter", type = "int", default = 512)
|
||||
parser.add_option("--noL1", action = "store_true", default = False,
|
||||
help = "bypassL1")
|
||||
parser.add_option("--noL2", action = "store_true", default = False,
|
||||
help = "bypassL2")
def create_system(options, full_system, system, dma_devices, ruby_system):
|
||||
if buildEnv['PROTOCOL'] != 'GPU_VIPER_Baseline':
|
||||
panic("This script requires the" \
|
||||
"GPU_VIPER_Baseline protocol to be built.")
|
||||
|
||||
cpu_sequencers = []
|
||||
|
||||
#
|
||||
# The ruby network creation expects the list of nodes in the system to be
|
||||
# consistent with the NetDest list. Therefore the l1 controller nodes
|
||||
# must be listed before the directory nodes and directory nodes before
|
||||
# dma nodes, etc.
|
||||
#
|
||||
cp_cntrl_nodes = []
|
||||
tcp_cntrl_nodes = []
|
||||
sqc_cntrl_nodes = []
|
||||
tcc_cntrl_nodes = []
|
||||
dir_cntrl_nodes = []
|
||||
l3_cntrl_nodes = []
|
||||
|
||||
#
|
||||
# Must create the individual controllers before the network to ensure the
|
||||
# controller constructors are called before the network constructor
|
||||
#
|
||||
|
||||
# For an odd number of CPUs, still create the right number of controllers
|
||||
TCC_bits = int(math.log(options.num_tccs, 2))
|
||||
|
||||
# This is the base crossbar that connects the L3s, Dirs, and cpu/gpu
|
||||
# Clusters
|
||||
crossbar_bw = 16 * options.num_compute_units #Assuming a 2GHz clock
|
||||
mainCluster = Cluster(intBW = crossbar_bw)
|
||||
for i in xrange(options.num_dirs):
|
||||
|
||||
dir_cntrl = DirCntrl(noTCCdir=True,TCC_select_num_bits = TCC_bits)
|
||||
dir_cntrl.create(options, ruby_system, system)
|
||||
dir_cntrl.number_of_TBEs = options.num_tbes
|
||||
dir_cntrl.useL3OnWT = options.use_L3_on_WT
|
||||
dir_cntrl.inclusiveDir = not options.nonInclusiveDir
|
||||
|
||||
# Connect the Directory controller to the ruby network
|
||||
dir_cntrl.requestFromCores = MessageBuffer(ordered = True)
|
||||
dir_cntrl.requestFromCores.slave = ruby_system.network.master
|
||||
|
||||
dir_cntrl.responseFromCores = MessageBuffer()
|
||||
dir_cntrl.responseFromCores.slave = ruby_system.network.master
|
||||
|
||||
dir_cntrl.unblockFromCores = MessageBuffer()
|
||||
dir_cntrl.unblockFromCores.slave = ruby_system.network.master
|
||||
|
||||
dir_cntrl.probeToCore = MessageBuffer()
|
||||
dir_cntrl.probeToCore.master = ruby_system.network.slave
|
||||
|
||||
dir_cntrl.responseToCore = MessageBuffer()
|
||||
dir_cntrl.responseToCore.master = ruby_system.network.slave
|
||||
|
||||
dir_cntrl.triggerQueue = MessageBuffer(ordered = True)
|
||||
dir_cntrl.L3triggerQueue = MessageBuffer(ordered = True)
|
||||
dir_cntrl.responseFromMemory = MessageBuffer()
|
||||
|
||||
exec("system.dir_cntrl%d = dir_cntrl" % i)
|
||||
dir_cntrl_nodes.append(dir_cntrl)
|
||||
mainCluster.add(dir_cntrl)
|
||||
|
||||
cpuCluster = Cluster(extBW = crossbar_bw, intBW=crossbar_bw)
|
||||
for i in xrange((options.num_cpus + 1) / 2):
|
||||
|
||||
cp_cntrl = CPCntrl()
|
||||
cp_cntrl.create(options, ruby_system, system)
|
||||
|
||||
exec("system.cp_cntrl%d = cp_cntrl" % i)
|
||||
#
|
||||
# Add controllers and sequencers to the appropriate lists
|
||||
#
|
||||
cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1])
|
||||
|
||||
# Connect the CP controllers and the network
|
||||
cp_cntrl.requestFromCore = MessageBuffer()
|
||||
cp_cntrl.requestFromCore.master = ruby_system.network.slave
|
||||
|
||||
cp_cntrl.responseFromCore = MessageBuffer()
|
||||
cp_cntrl.responseFromCore.master = ruby_system.network.slave
|
||||
|
||||
cp_cntrl.unblockFromCore = MessageBuffer()
|
||||
cp_cntrl.unblockFromCore.master = ruby_system.network.slave
|
||||
|
||||
cp_cntrl.probeToCore = MessageBuffer()
|
||||
cp_cntrl.probeToCore.slave = ruby_system.network.master
|
||||
|
||||
cp_cntrl.responseToCore = MessageBuffer()
|
||||
cp_cntrl.responseToCore.slave = ruby_system.network.master
|
||||
|
||||
cp_cntrl.mandatoryQueue = MessageBuffer()
|
||||
cp_cntrl.triggerQueue = MessageBuffer(ordered = True)
|
||||
|
||||
cpuCluster.add(cp_cntrl)
|
||||
|
||||
gpuCluster = Cluster(extBW = crossbar_bw, intBW = crossbar_bw)
|
||||
for i in xrange(options.num_compute_units):
|
||||
|
||||
tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
|
||||
issue_latency = 1,
|
||||
number_of_TBEs = 2560)
|
||||
# TBEs set to max outstanding requests
|
||||
tcp_cntrl.create(options, ruby_system, system)
|
||||
tcp_cntrl.WB = options.WB_L1
|
||||
tcp_cntrl.disableL1 = options.noL1
|
||||
|
||||
exec("system.tcp_cntrl%d = tcp_cntrl" % i)
|
||||
#
|
||||
# Add controllers and sequencers to the appropriate lists
|
||||
#
|
||||
cpu_sequencers.append(tcp_cntrl.coalescer)
|
||||
tcp_cntrl_nodes.append(tcp_cntrl)
|
||||
|
||||
# Connect the CP (TCP) controllers to the ruby network
|
||||
tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True)
|
||||
tcp_cntrl.requestFromTCP.master = ruby_system.network.slave
|
||||
|
||||
tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True)
|
||||
tcp_cntrl.responseFromTCP.master = ruby_system.network.slave
|
||||
|
||||
tcp_cntrl.unblockFromCore = MessageBuffer()
|
||||
tcp_cntrl.unblockFromCore.master = ruby_system.network.slave
|
||||
|
||||
tcp_cntrl.probeToTCP = MessageBuffer(ordered = True)
|
||||
tcp_cntrl.probeToTCP.slave = ruby_system.network.master
|
||||
|
||||
tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
|
||||
tcp_cntrl.responseToTCP.slave = ruby_system.network.master
|
||||
|
||||
tcp_cntrl.mandatoryQueue = MessageBuffer()
|
||||
|
||||
gpuCluster.add(tcp_cntrl)
|
||||
|
||||
for i in xrange(options.num_sqc):
|
||||
|
||||
sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
|
||||
sqc_cntrl.create(options, ruby_system, system)
|
||||
|
||||
exec("system.sqc_cntrl%d = sqc_cntrl" % i)
|
||||
#
|
||||
# Add controllers and sequencers to the appropriate lists
|
||||
#
|
||||
cpu_sequencers.append(sqc_cntrl.sequencer)
|
||||
|
||||
# Connect the SQC controller to the ruby network
|
||||
sqc_cntrl.requestFromSQC = MessageBuffer(ordered = True)
|
||||
sqc_cntrl.requestFromSQC.master = ruby_system.network.slave
|
||||
|
||||
sqc_cntrl.probeToSQC = MessageBuffer(ordered = True)
|
||||
sqc_cntrl.probeToSQC.slave = ruby_system.network.master
|
||||
|
||||
sqc_cntrl.responseToSQC = MessageBuffer(ordered = True)
|
||||
sqc_cntrl.responseToSQC.slave = ruby_system.network.master
|
||||
|
||||
sqc_cntrl.mandatoryQueue = MessageBuffer()
|
||||
|
||||
# SQC also in GPU cluster
|
||||
gpuCluster.add(sqc_cntrl)
|
||||
|
||||
# Because of wire buffers, num_tccs must equal num_tccdirs
|
||||
numa_bit = 6
|
||||
|
||||
for i in xrange(options.num_tccs):
|
||||
|
||||
tcc_cntrl = TCCCntrl()
|
||||
tcc_cntrl.create(options, ruby_system, system)
|
||||
tcc_cntrl.l2_request_latency = options.gpu_to_dir_latency
|
||||
tcc_cntrl.l2_response_latency = options.TCC_latency
|
||||
tcc_cntrl_nodes.append(tcc_cntrl)
|
||||
tcc_cntrl.WB = options.WB_L2
|
||||
tcc_cntrl.number_of_TBEs = 2560 * options.num_compute_units
|
||||
|
||||
# Connect the TCC controllers to the ruby network
|
||||
tcc_cntrl.requestFromTCP = MessageBuffer(ordered = True)
|
||||
tcc_cntrl.requestFromTCP.slave = ruby_system.network.master
|
||||
|
||||
tcc_cntrl.responseToCore = MessageBuffer(ordered = True)
|
||||
tcc_cntrl.responseToCore.master = ruby_system.network.slave
|
||||
|
||||
tcc_cntrl.probeFromNB = MessageBuffer()
|
||||
tcc_cntrl.probeFromNB.slave = ruby_system.network.master
|
||||
|
||||
tcc_cntrl.responseFromNB = MessageBuffer()
|
||||
tcc_cntrl.responseFromNB.slave = ruby_system.network.master
|
||||
|
||||
tcc_cntrl.requestToNB = MessageBuffer(ordered = True)
|
||||
tcc_cntrl.requestToNB.master = ruby_system.network.slave
|
||||
|
||||
tcc_cntrl.responseToNB = MessageBuffer()
|
||||
tcc_cntrl.responseToNB.master = ruby_system.network.slave
|
||||
|
||||
tcc_cntrl.unblockToNB = MessageBuffer()
|
||||
tcc_cntrl.unblockToNB.master = ruby_system.network.slave
|
||||
|
||||
tcc_cntrl.triggerQueue = MessageBuffer(ordered = True)
|
||||
|
||||
exec("system.tcc_cntrl%d = tcc_cntrl" % i)
|
||||
# connect all of the wire buffers between L3 and dirs up
|
||||
# TCC cntrls added to the GPU cluster
|
||||
gpuCluster.add(tcc_cntrl)
|
||||
|
||||
# Assuming no DMA devices
|
||||
assert(len(dma_devices) == 0)
|
||||
|
||||
# Add cpu/gpu clusters to main cluster
|
||||
mainCluster.add(cpuCluster)
|
||||
mainCluster.add(gpuCluster)
|
||||
|
||||
ruby_system.network.number_of_virtual_networks = 10
|
||||
|
||||
return (cpu_sequencers, dir_cntrl_nodes, mainCluster)
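
For orientation, create_system() hands back the sequencer list, the directory controllers, and the top-level cluster that describes the network topology; a rough, heavily stubbed sketch of the call (options, system, and ruby_system are assumed to come from the usual gem5 configuration scripts, and the empty list stands in for the unsupported DMA devices):

# Illustrative call only; note the assertion above that dma_devices must be empty.
seqs, dir_cntrls, topo_root = create_system(options, False, system, [],
                                            ruby_system)
assert len(dir_cntrls) == options.num_dirs   # one directory controller per dir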
758
configs/ruby/GPU_VIPER_Region.py
Normal file
@@ -0,0 +1,758 @@
#
|
||||
# Copyright (c) 2015 Advanced Micro Devices, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# For use for simulation and test purposes only
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
# may be used to endorse or promote products derived from this software
|
||||
# without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# Author: Sooraj Puthoor
|
||||
#
|
||||
|
||||
import math
|
||||
import m5
|
||||
from m5.objects import *
|
||||
from m5.defines import buildEnv
|
||||
from Ruby import send_evicts
|
||||
|
||||
from Cluster import Cluster
|
||||
|
||||
class CntrlBase:
|
||||
_seqs = 0
|
||||
@classmethod
|
||||
def seqCount(cls):
|
||||
# Use the CntrlBase counter, not cls, since we need a global count
|
||||
CntrlBase._seqs += 1
|
||||
return CntrlBase._seqs - 1
|
||||
|
||||
_cntrls = 0
|
||||
@classmethod
|
||||
def cntrlCount(cls):
|
||||
# Use the CntrlBase counter, not cls, since we need a global count
|
||||
CntrlBase._cntrls += 1
|
||||
return CntrlBase._cntrls - 1
|
||||
|
||||
_version = 0
|
||||
@classmethod
|
||||
def versionCount(cls):
|
||||
cls._version += 1 # Use count for this particular type
|
||||
return cls._version - 1
|
||||
|
||||
#
|
||||
# Note: the L1 Cache latency is only used by the sequencer on fast path hits
|
||||
#
|
||||
class L1Cache(RubyCache):
|
||||
resourceStalls = False
|
||||
dataArrayBanks = 2
|
||||
tagArrayBanks = 2
|
||||
dataAccessLatency = 1
|
||||
tagAccessLatency = 1
|
||||
def create(self, size, assoc, options):
|
||||
self.size = MemorySize(size)
|
||||
self.assoc = assoc
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy()
|
||||
|
||||
class L2Cache(RubyCache):
|
||||
resourceStalls = False
|
||||
assoc = 16
|
||||
dataArrayBanks = 16
|
||||
tagArrayBanks = 16
|
||||
def create(self, size, assoc, options):
|
||||
self.size = MemorySize(size)
|
||||
self.assoc = assoc
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy()
|
||||
|
||||
class CPCntrl(CorePair_Controller, CntrlBase):
|
||||
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
|
||||
self.L1Icache = L1Cache()
|
||||
self.L1Icache.create(options.l1i_size, options.l1i_assoc, options)
|
||||
self.L1D0cache = L1Cache()
|
||||
self.L1D0cache.create(options.l1d_size, options.l1d_assoc, options)
|
||||
self.L1D1cache = L1Cache()
|
||||
self.L1D1cache.create(options.l1d_size, options.l1d_assoc, options)
|
||||
self.L2cache = L2Cache()
|
||||
self.L2cache.create(options.l2_size, options.l2_assoc, options)
|
||||
|
||||
self.sequencer = RubySequencer()
|
||||
self.sequencer.version = self.seqCount()
|
||||
self.sequencer.icache = self.L1Icache
|
||||
self.sequencer.dcache = self.L1D0cache
|
||||
self.sequencer.ruby_system = ruby_system
|
||||
self.sequencer.coreid = 0
|
||||
self.sequencer.is_cpu_sequencer = True
|
||||
|
||||
self.sequencer1 = RubySequencer()
|
||||
self.sequencer1.version = self.seqCount()
|
||||
self.sequencer1.icache = self.L1Icache
|
||||
self.sequencer1.dcache = self.L1D1cache
|
||||
self.sequencer1.ruby_system = ruby_system
|
||||
self.sequencer1.coreid = 1
|
||||
self.sequencer1.is_cpu_sequencer = True
|
||||
|
||||
self.issue_latency = 1
|
||||
self.send_evictions = send_evicts(options)
|
||||
|
||||
self.ruby_system = ruby_system
|
||||
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
class TCPCache(RubyCache):
|
||||
size = "16kB"
|
||||
assoc = 16
|
||||
dataArrayBanks = 16
|
||||
tagArrayBanks = 16
|
||||
dataAccessLatency = 4
|
||||
tagAccessLatency = 1
|
||||
def create(self, options):
|
||||
self.size = MemorySize(options.tcp_size)
|
||||
self.dataArrayBanks = 16
|
||||
self.tagArrayBanks = 16
|
||||
self.dataAccessLatency = 4
|
||||
self.tagAccessLatency = 1
|
||||
self.resourceStalls = options.no_tcc_resource_stalls
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy(assoc = self.assoc)
|
||||
|
||||
class TCPCntrl(TCP_Controller, CntrlBase):
|
||||
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
self.L1cache = TCPCache(dataAccessLatency = options.TCP_latency)
|
||||
self.L1cache.create(options)
|
||||
self.issue_latency = 1
|
||||
|
||||
self.coalescer = VIPERCoalescer()
|
||||
self.coalescer.version = self.seqCount()
|
||||
self.coalescer.icache = self.L1cache
|
||||
self.coalescer.dcache = self.L1cache
|
||||
self.coalescer.ruby_system = ruby_system
|
||||
self.coalescer.support_inst_reqs = False
|
||||
self.coalescer.is_cpu_sequencer = False
|
||||
|
||||
self.sequencer = RubySequencer()
|
||||
self.sequencer.version = self.seqCount()
|
||||
self.sequencer.icache = self.L1cache
|
||||
self.sequencer.dcache = self.L1cache
|
||||
self.sequencer.ruby_system = ruby_system
|
||||
self.sequencer.is_cpu_sequencer = True
|
||||
|
||||
self.use_seq_not_coal = False
|
||||
|
||||
self.ruby_system = ruby_system
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
class SQCCache(RubyCache):
|
||||
dataArrayBanks = 8
|
||||
tagArrayBanks = 8
|
||||
dataAccessLatency = 1
|
||||
tagAccessLatency = 1
|
||||
|
||||
def create(self, options):
|
||||
self.size = MemorySize(options.sqc_size)
|
||||
self.assoc = options.sqc_assoc
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy(assoc = self.assoc)
|
||||
|
||||
class SQCCntrl(SQC_Controller, CntrlBase):
|
||||
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
self.L1cache = SQCCache()
|
||||
self.L1cache.create(options)
|
||||
self.L1cache.resourceStalls = False
|
||||
self.sequencer = RubySequencer()
|
||||
self.sequencer.version = self.seqCount()
|
||||
self.sequencer.icache = self.L1cache
|
||||
self.sequencer.dcache = self.L1cache
|
||||
self.sequencer.ruby_system = ruby_system
|
||||
self.sequencer.support_data_reqs = False
|
||||
self.sequencer.is_cpu_sequencer = False
|
||||
self.ruby_system = ruby_system
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
class TCC(RubyCache):
|
||||
size = MemorySize("256kB")
|
||||
assoc = 16
|
||||
dataAccessLatency = 8
|
||||
tagAccessLatency = 2
|
||||
resourceStalls = False
|
||||
def create(self, options):
|
||||
self.assoc = options.tcc_assoc
|
||||
if hasattr(options, 'bw_scalor') and options.bw_scalor > 0:
|
||||
s = options.num_compute_units
|
||||
tcc_size = s * 128
|
||||
tcc_size = str(tcc_size)+'kB'
|
||||
self.size = MemorySize(tcc_size)
|
||||
self.dataArrayBanks = 64
|
||||
self.tagArrayBanks = 64
|
||||
else:
|
||||
self.size = MemorySize(options.tcc_size)
|
||||
self.dataArrayBanks = 256 / options.num_tccs #number of data banks
|
||||
self.tagArrayBanks = 256 / options.num_tccs #number of tag banks
|
||||
self.size.value = self.size.value / options.num_tccs
|
||||
if ((self.size.value / long(self.assoc)) < 128):
|
||||
self.size.value = long(128 * self.assoc)
|
||||
self.start_index_bit = math.log(options.cacheline_size, 2) + \
|
||||
math.log(options.num_tccs, 2)
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy(assoc = self.assoc)
|
||||
|
||||
class TCCCntrl(TCC_Controller, CntrlBase):
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
self.L2cache = TCC()
|
||||
self.L2cache.create(options)
|
||||
self.ruby_system = ruby_system
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
class L3Cache(RubyCache):
|
||||
dataArrayBanks = 16
|
||||
tagArrayBanks = 16
|
||||
|
||||
def create(self, options, ruby_system, system):
|
||||
self.size = MemorySize(options.l3_size)
|
||||
self.size.value /= options.num_dirs
|
||||
self.assoc = options.l3_assoc
|
||||
self.dataArrayBanks /= options.num_dirs
|
||||
self.tagArrayBanks /= options.num_dirs
|
||||
self.dataArrayBanks /= options.num_dirs
|
||||
self.tagArrayBanks /= options.num_dirs
|
||||
self.dataAccessLatency = options.l3_data_latency
|
||||
self.tagAccessLatency = options.l3_tag_latency
|
||||
self.resourceStalls = False
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy(assoc = self.assoc)
|
||||
|
||||
class L3Cntrl(L3Cache_Controller, CntrlBase):
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
self.L3cache = L3Cache()
|
||||
self.L3cache.create(options, ruby_system, system)
|
||||
self.l3_response_latency = \
|
||||
max(self.L3cache.dataAccessLatency, self.L3cache.tagAccessLatency)
|
||||
self.ruby_system = ruby_system
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
|
||||
req_to_l3, probe_to_l3, resp_to_l3):
|
||||
self.reqToDir = req_to_dir
|
||||
self.respToDir = resp_to_dir
|
||||
self.l3UnblockToDir = l3_unblock_to_dir
|
||||
self.reqToL3 = req_to_l3
|
||||
self.probeToL3 = probe_to_l3
|
||||
self.respToL3 = resp_to_l3
|
||||
|
||||
# Directory memory: Directory memory of infinite size which is
|
||||
# used by directory controller to store the "states" of the
|
||||
# state machine. The state machine is implemented per cache block
|
||||
class DirMem(RubyDirectoryMemory, CntrlBase):
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
phys_mem_size = AddrRange(options.mem_size).size()
|
||||
mem_module_size = phys_mem_size / options.num_dirs
|
||||
dir_size = MemorySize('0B')
|
||||
dir_size.value = mem_module_size
|
||||
self.size = dir_size
|
||||
|
||||
# Directory controller: Contains directory memory, L3 cache and associated state
|
||||
# machine which is used to accurately redirect a data request to L3 cache or to
|
||||
# memory. The permissions requests do not come to this directory for region
|
||||
# based protocols as they are handled exclusively by the region directory.
|
||||
# However, region directory controller uses this directory controller for
|
||||
# sending probe requests and receiving probe responses.
|
||||
class DirCntrl(Directory_Controller, CntrlBase):
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
self.response_latency = 25
|
||||
self.response_latency_regionDir = 1
|
||||
self.directory = DirMem()
|
||||
self.directory.create(options, ruby_system, system)
|
||||
self.L3CacheMemory = L3Cache()
|
||||
self.L3CacheMemory.create(options, ruby_system, system)
|
||||
self.l3_hit_latency = \
|
||||
max(self.L3CacheMemory.dataAccessLatency,
|
||||
self.L3CacheMemory.tagAccessLatency)
|
||||
|
||||
self.ruby_system = ruby_system
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
|
||||
req_to_l3, probe_to_l3, resp_to_l3):
|
||||
self.reqToDir = req_to_dir
|
||||
self.respToDir = resp_to_dir
|
||||
self.l3UnblockToDir = l3_unblock_to_dir
|
||||
self.reqToL3 = req_to_l3
|
||||
self.probeToL3 = probe_to_l3
|
||||
self.respToL3 = resp_to_l3
|
||||
|
||||
# Region directory : Stores region permissions
|
||||
class RegionDir(RubyCache):
|
||||
|
||||
def create(self, options, ruby_system, system):
|
||||
self.block_size = "%dB" % (64 * options.blocks_per_region)
|
||||
self.size = options.region_dir_entries * \
|
||||
self.block_size * options.num_compute_units
|
||||
self.assoc = 8
|
||||
self.tagArrayBanks = 8
|
||||
self.tagAccessLatency = options.dir_tag_latency
|
||||
self.dataAccessLatency = 1
|
||||
self.resourceStalls = options.no_resource_stalls
|
||||
self.start_index_bit = 6 + int(math.log(options.blocks_per_region, 2))
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy(assoc = self.assoc)
|
||||
# Region directory controller : Contains region directory and associated state
|
||||
# machine for dealing with region coherence requests.
|
||||
class RegionCntrl(RegionDir_Controller, CntrlBase):
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
self.cacheMemory = RegionDir()
|
||||
self.cacheMemory.create(options, ruby_system, system)
|
||||
self.blocksPerRegion = options.blocks_per_region
|
||||
self.toDirLatency = \
|
||||
max(self.cacheMemory.dataAccessLatency,
|
||||
self.cacheMemory.tagAccessLatency)
|
||||
self.ruby_system = ruby_system
|
||||
self.always_migrate = options.always_migrate
|
||||
self.sym_migrate = options.symmetric_migrate
|
||||
self.asym_migrate = options.asymmetric_migrate
|
||||
if self.always_migrate:
|
||||
assert(not self.asym_migrate and not self.sym_migrate)
|
||||
if self.sym_migrate:
|
||||
assert(not self.always_migrate and not self.asym_migrate)
|
||||
if self.asym_migrate:
|
||||
assert(not self.always_migrate and not self.sym_migrate)
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
# Region Buffer: A region directory cache which avoids some potential
|
||||
# long latency lookup of region directory for getting region permissions
|
||||
class RegionBuffer(RubyCache):
|
||||
assoc = 4
|
||||
dataArrayBanks = 256
|
||||
tagArrayBanks = 256
|
||||
dataAccessLatency = 1
|
||||
tagAccessLatency = 1
|
||||
resourceStalls = True
|
||||
|
||||
class RBCntrl(RegionBuffer_Controller, CntrlBase):
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
self.cacheMemory = RegionBuffer()
|
||||
self.cacheMemory.resourceStalls = options.no_tcc_resource_stalls
|
||||
self.cacheMemory.dataArrayBanks = 64
|
||||
self.cacheMemory.tagArrayBanks = 64
|
||||
self.blocksPerRegion = options.blocks_per_region
|
||||
self.cacheMemory.block_size = "%dB" % (64 * self.blocksPerRegion)
|
||||
self.cacheMemory.start_index_bit = \
|
||||
6 + int(math.log(self.blocksPerRegion, 2))
|
||||
self.cacheMemory.size = options.region_buffer_entries * \
|
||||
self.cacheMemory.block_size * options.num_compute_units
|
||||
self.toDirLatency = options.gpu_to_dir_latency
|
||||
self.toRegionDirLatency = options.cpu_to_dir_latency
|
||||
self.noTCCdir = True
|
||||
TCC_bits = int(math.log(options.num_tccs, 2))
|
||||
self.TCC_select_num_bits = TCC_bits
|
||||
self.ruby_system = ruby_system
|
||||
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
self.cacheMemory.replacement_policy = \
|
||||
PseudoLRUReplacementPolicy(assoc = self.cacheMemory.assoc)
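
The region-buffer geometry above is derived from blocks_per_region; with the defaults registered in define_options() below (--blocks-per-region 16, --region-buffer-entries 512) and an assumed 8 compute units, the derived values come out as in this illustrative sketch:

import math

blocks_per_region, region_buffer_entries, num_compute_units = 16, 512, 8  # CU count assumed

block_size_bytes = 64 * blocks_per_region                  # 1024 B per region
start_index_bit = 6 + int(math.log(blocks_per_region, 2))  # 6 + 4 = 10
rb_bytes = region_buffer_entries * block_size_bytes * num_compute_units   # 4 MiB
print(block_size_bytes, start_index_bit, rb_bytes)         # 1024 10 4194304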
def define_options(parser):
|
||||
parser.add_option("--num-subcaches", type="int", default=4)
|
||||
parser.add_option("--l3-data-latency", type="int", default=20)
|
||||
parser.add_option("--l3-tag-latency", type="int", default=15)
|
||||
parser.add_option("--cpu-to-dir-latency", type="int", default=120)
|
||||
parser.add_option("--gpu-to-dir-latency", type="int", default=60)
|
||||
parser.add_option("--no-resource-stalls", action="store_false",
|
||||
default=True)
|
||||
parser.add_option("--no-tcc-resource-stalls", action="store_false",
|
||||
default=True)
|
||||
parser.add_option("--num-tbes", type="int", default=32)
|
||||
parser.add_option("--l2-latency", type="int", default=50) # load to use
|
||||
parser.add_option("--num-tccs", type="int", default=1,
|
||||
help="number of TCC banks in the GPU")
|
||||
|
||||
parser.add_option("--sqc-size", type='string', default='32kB',
|
||||
help="SQC cache size")
|
||||
parser.add_option("--sqc-assoc", type='int', default=8,
|
||||
help="SQC cache assoc")
|
||||
|
||||
parser.add_option("--WB_L1", action="store_true",
|
||||
default=False, help="L2 Writeback Cache")
|
||||
parser.add_option("--WB_L2", action="store_true",
|
||||
default=False, help="L2 Writeback Cache")
|
||||
parser.add_option("--TCP_latency",
|
||||
type="int", default=4, help="TCP latency")
|
||||
parser.add_option("--TCC_latency",
|
||||
type="int", default=16, help="TCC latency")
|
||||
parser.add_option("--tcc-size", type='string', default='2MB',
|
||||
help="agregate tcc size")
|
||||
parser.add_option("--tcc-assoc", type='int', default=16,
|
||||
help="tcc assoc")
|
||||
parser.add_option("--tcp-size", type='string', default='16kB',
|
||||
help="tcp size")
|
||||
|
||||
parser.add_option("--dir-tag-latency", type="int", default=4)
|
||||
parser.add_option("--dir-tag-banks", type="int", default=4)
|
||||
parser.add_option("--blocks-per-region", type="int", default=16)
|
||||
parser.add_option("--dir-entries", type="int", default=8192)
|
||||
|
||||
# The region buffer is a cache of the region directory, so the region
# directory is inclusive with respect to the region buffer.
# However, the region directory is non-inclusive with respect to
# the caches in the system (see the sizing sketch after this option list).
parser.add_option("--region-dir-entries", type="int", default=1024)
|
||||
parser.add_option("--region-buffer-entries", type="int", default=512)
|
||||
|
||||
parser.add_option("--always-migrate",
|
||||
action="store_true", default=False)
|
||||
parser.add_option("--symmetric-migrate",
|
||||
action="store_true", default=False)
|
||||
parser.add_option("--asymmetric-migrate",
|
||||
action="store_true", default=False)
|
||||
parser.add_option("--use-L3-on-WT", action="store_true", default=False)
def create_system(options, full_system, system, dma_devices, ruby_system):
|
||||
if buildEnv['PROTOCOL'] != 'GPU_VIPER_Region':
|
||||
panic("This script requires the GPU_VIPER_Region protocol to be built.")
|
||||
|
||||
cpu_sequencers = []
|
||||
|
||||
#
|
||||
# The ruby network creation expects the list of nodes in the system to be
|
||||
# consistent with the NetDest list. Therefore the l1 controller nodes
|
||||
# must be listed before the directory nodes and directory nodes before
|
||||
# dma nodes, etc.
|
||||
#
|
||||
dir_cntrl_nodes = []
|
||||
|
||||
# For an odd number of CPUs, still create the right number of controllers
|
||||
TCC_bits = int(math.log(options.num_tccs, 2))
|
||||
|
||||
#
|
||||
# Must create the individual controllers before the network to ensure the
|
||||
# controller constructors are called before the network constructor
|
||||
#
|
||||
|
||||
# For an odd number of CPUs, still create the right number of controllers
|
||||
crossbar_bw = 16 * options.num_compute_units #Assuming a 2GHz clock
|
||||
cpuCluster = Cluster(extBW = (crossbar_bw), intBW=crossbar_bw)
|
||||
for i in xrange((options.num_cpus + 1) / 2):
|
||||
|
||||
cp_cntrl = CPCntrl()
|
||||
cp_cntrl.create(options, ruby_system, system)
|
||||
|
||||
rb_cntrl = RBCntrl()
|
||||
rb_cntrl.create(options, ruby_system, system)
|
||||
rb_cntrl.number_of_TBEs = 256
|
||||
rb_cntrl.isOnCPU = True
|
||||
|
||||
cp_cntrl.regionBufferNum = rb_cntrl.version
|
||||
|
||||
exec("system.cp_cntrl%d = cp_cntrl" % i)
|
||||
exec("system.rb_cntrl%d = rb_cntrl" % i)
|
||||
#
|
||||
# Add controllers and sequencers to the appropriate lists
|
||||
#
|
||||
cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1])
|
||||
|
||||
# Connect the CP controllers and the network
|
||||
cp_cntrl.requestFromCore = MessageBuffer()
|
||||
cp_cntrl.requestFromCore.master = ruby_system.network.slave
|
||||
|
||||
cp_cntrl.responseFromCore = MessageBuffer()
|
||||
cp_cntrl.responseFromCore.master = ruby_system.network.slave
|
||||
|
||||
cp_cntrl.unblockFromCore = MessageBuffer()
|
||||
cp_cntrl.unblockFromCore.master = ruby_system.network.slave
|
||||
|
||||
cp_cntrl.probeToCore = MessageBuffer()
|
||||
cp_cntrl.probeToCore.slave = ruby_system.network.master
|
||||
|
||||
cp_cntrl.responseToCore = MessageBuffer()
|
||||
cp_cntrl.responseToCore.slave = ruby_system.network.master
|
||||
|
||||
cp_cntrl.mandatoryQueue = MessageBuffer()
|
||||
cp_cntrl.triggerQueue = MessageBuffer(ordered = True)
|
||||
|
||||
# Connect the RB controllers to the ruby network
|
||||
rb_cntrl.requestFromCore = MessageBuffer(ordered = True)
|
||||
rb_cntrl.requestFromCore.slave = ruby_system.network.master
|
||||
|
||||
rb_cntrl.responseFromCore = MessageBuffer()
|
||||
rb_cntrl.responseFromCore.slave = ruby_system.network.master
|
||||
|
||||
rb_cntrl.requestToNetwork = MessageBuffer()
|
||||
rb_cntrl.requestToNetwork.master = ruby_system.network.slave
|
||||
|
||||
rb_cntrl.notifyFromRegionDir = MessageBuffer()
|
||||
rb_cntrl.notifyFromRegionDir.slave = ruby_system.network.master
|
||||
|
||||
rb_cntrl.probeFromRegionDir = MessageBuffer()
|
||||
rb_cntrl.probeFromRegionDir.slave = ruby_system.network.master
|
||||
|
||||
rb_cntrl.unblockFromDir = MessageBuffer()
|
||||
rb_cntrl.unblockFromDir.slave = ruby_system.network.master
|
||||
|
||||
rb_cntrl.responseToRegDir = MessageBuffer()
|
||||
rb_cntrl.responseToRegDir.master = ruby_system.network.slave
|
||||
|
||||
rb_cntrl.triggerQueue = MessageBuffer(ordered = True)
|
||||
|
||||
cpuCluster.add(cp_cntrl)
|
||||
cpuCluster.add(rb_cntrl)
|
||||
|
||||
gpuCluster = Cluster(extBW = (crossbar_bw), intBW = crossbar_bw)
|
||||
for i in xrange(options.num_compute_units):
|
||||
|
||||
tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
|
||||
issue_latency = 1,
|
||||
number_of_TBEs = 2560)
|
||||
# TBEs set to max outstanding requests
|
||||
tcp_cntrl.create(options, ruby_system, system)
|
||||
tcp_cntrl.WB = options.WB_L1
|
||||
tcp_cntrl.disableL1 = False
|
||||
|
||||
exec("system.tcp_cntrl%d = tcp_cntrl" % i)
|
||||
#
|
||||
# Add controllers and sequencers to the appropriate lists
|
||||
#
|
||||
cpu_sequencers.append(tcp_cntrl.coalescer)
|
||||
|
||||
# Connect the CP (TCP) controllers to the ruby network
|
||||
tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True)
|
||||
tcp_cntrl.requestFromTCP.master = ruby_system.network.slave
|
||||
|
||||
tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True)
|
||||
tcp_cntrl.responseFromTCP.master = ruby_system.network.slave
|
||||
|
||||
tcp_cntrl.unblockFromCore = MessageBuffer()
|
||||
tcp_cntrl.unblockFromCore.master = ruby_system.network.slave
|
||||
|
||||
tcp_cntrl.probeToTCP = MessageBuffer(ordered = True)
|
||||
tcp_cntrl.probeToTCP.slave = ruby_system.network.master
|
||||
|
||||
tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
|
||||
tcp_cntrl.responseToTCP.slave = ruby_system.network.master
|
||||
|
||||
tcp_cntrl.mandatoryQueue = MessageBuffer()
|
||||
|
||||
gpuCluster.add(tcp_cntrl)
|
||||
|
||||
for i in xrange(options.num_sqc):
|
||||
|
||||
sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
|
||||
sqc_cntrl.create(options, ruby_system, system)
|
||||
|
||||
exec("system.sqc_cntrl%d = sqc_cntrl" % i)
|
||||
#
|
||||
# Add controllers and sequencers to the appropriate lists
|
||||
#
|
||||
cpu_sequencers.append(sqc_cntrl.sequencer)
|
||||
|
||||
# Connect the SQC controller to the ruby network
|
||||
sqc_cntrl.requestFromSQC = MessageBuffer(ordered = True)
|
||||
sqc_cntrl.requestFromSQC.master = ruby_system.network.slave
|
||||
|
||||
sqc_cntrl.probeToSQC = MessageBuffer(ordered = True)
|
||||
sqc_cntrl.probeToSQC.slave = ruby_system.network.master
|
||||
|
||||
sqc_cntrl.responseToSQC = MessageBuffer(ordered = True)
|
||||
sqc_cntrl.responseToSQC.slave = ruby_system.network.master
|
||||
|
||||
sqc_cntrl.mandatoryQueue = MessageBuffer()
|
||||
|
||||
# SQC also in GPU cluster
|
||||
gpuCluster.add(sqc_cntrl)
|
||||
|
||||
numa_bit = 6
|
||||
|
||||
for i in xrange(options.num_tccs):
|
||||
|
||||
tcc_cntrl = TCCCntrl()
|
||||
tcc_cntrl.create(options, ruby_system, system)
|
||||
tcc_cntrl.l2_request_latency = 1
|
||||
tcc_cntrl.l2_response_latency = options.TCC_latency
|
||||
tcc_cntrl.WB = options.WB_L2
|
||||
tcc_cntrl.number_of_TBEs = 2560 * options.num_compute_units
|
||||
|
||||
# Connect the TCC controllers to the ruby network
|
||||
tcc_cntrl.requestFromTCP = MessageBuffer(ordered = True)
|
||||
tcc_cntrl.requestFromTCP.slave = ruby_system.network.master
|
||||
|
||||
tcc_cntrl.responseToCore = MessageBuffer(ordered = True)
|
||||
tcc_cntrl.responseToCore.master = ruby_system.network.slave
|
||||
|
||||
tcc_cntrl.probeFromNB = MessageBuffer()
|
||||
tcc_cntrl.probeFromNB.slave = ruby_system.network.master
|
||||
|
||||
tcc_cntrl.responseFromNB = MessageBuffer()
|
||||
tcc_cntrl.responseFromNB.slave = ruby_system.network.master
|
||||
|
||||
tcc_cntrl.requestToNB = MessageBuffer(ordered = True)
|
||||
tcc_cntrl.requestToNB.master = ruby_system.network.slave
|
||||
|
||||
tcc_cntrl.responseToNB = MessageBuffer()
|
||||
tcc_cntrl.responseToNB.master = ruby_system.network.slave
|
||||
|
||||
tcc_cntrl.unblockToNB = MessageBuffer()
|
||||
tcc_cntrl.unblockToNB.master = ruby_system.network.slave
|
||||
|
||||
tcc_cntrl.triggerQueue = MessageBuffer(ordered = True)
|
||||
|
||||
rb_cntrl = RBCntrl()
|
||||
rb_cntrl.create(options, ruby_system, system)
|
||||
rb_cntrl.number_of_TBEs = 2560 * options.num_compute_units
|
||||
rb_cntrl.isOnCPU = False
|
||||
|
||||
# Connect the RB controllers to the ruby network
|
||||
rb_cntrl.requestFromCore = MessageBuffer(ordered = True)
|
||||
rb_cntrl.requestFromCore.slave = ruby_system.network.master
|
||||
|
||||
rb_cntrl.responseFromCore = MessageBuffer()
|
||||
rb_cntrl.responseFromCore.slave = ruby_system.network.master
|
||||
|
||||
rb_cntrl.requestToNetwork = MessageBuffer()
|
||||
rb_cntrl.requestToNetwork.master = ruby_system.network.slave
|
||||
|
||||
rb_cntrl.notifyFromRegionDir = MessageBuffer()
|
||||
rb_cntrl.notifyFromRegionDir.slave = ruby_system.network.master
|
||||
|
||||
rb_cntrl.probeFromRegionDir = MessageBuffer()
|
||||
rb_cntrl.probeFromRegionDir.slave = ruby_system.network.master
|
||||
|
||||
rb_cntrl.unblockFromDir = MessageBuffer()
|
||||
rb_cntrl.unblockFromDir.slave = ruby_system.network.master
|
||||
|
||||
rb_cntrl.responseToRegDir = MessageBuffer()
|
||||
rb_cntrl.responseToRegDir.master = ruby_system.network.slave
|
||||
|
||||
rb_cntrl.triggerQueue = MessageBuffer(ordered = True)
|
||||
|
||||
tcc_cntrl.regionBufferNum = rb_cntrl.version
|
||||
|
||||
exec("system.tcc_cntrl%d = tcc_cntrl" % i)
|
||||
exec("system.tcc_rb_cntrl%d = rb_cntrl" % i)
|
||||
|
||||
# TCC cntrls added to the GPU cluster
|
||||
gpuCluster.add(tcc_cntrl)
|
||||
gpuCluster.add(rb_cntrl)
|
||||
|
||||
# Because of wire buffers, num_l3caches must equal num_dirs
|
||||
# Region coherence only works with 1 dir
|
||||
assert(options.num_l3caches == options.num_dirs == 1)
|
||||
|
||||
# This is the base crossbar that connects the L3s, Dirs, and cpu/gpu
|
||||
# Clusters
|
||||
mainCluster = Cluster(intBW = crossbar_bw)
|
||||
|
||||
dir_cntrl = DirCntrl()
|
||||
dir_cntrl.create(options, ruby_system, system)
|
||||
dir_cntrl.number_of_TBEs = 2560 * options.num_compute_units
|
||||
dir_cntrl.useL3OnWT = options.use_L3_on_WT
|
||||
|
||||
# Connect the Directory controller to the ruby network
|
||||
dir_cntrl.requestFromCores = MessageBuffer()
|
||||
dir_cntrl.requestFromCores.slave = ruby_system.network.master
|
||||
|
||||
dir_cntrl.responseFromCores = MessageBuffer()
|
||||
dir_cntrl.responseFromCores.slave = ruby_system.network.master
|
||||
|
||||
dir_cntrl.unblockFromCores = MessageBuffer()
|
||||
dir_cntrl.unblockFromCores.slave = ruby_system.network.master
|
||||
|
||||
dir_cntrl.probeToCore = MessageBuffer()
|
||||
dir_cntrl.probeToCore.master = ruby_system.network.slave
|
||||
|
||||
dir_cntrl.responseToCore = MessageBuffer()
|
||||
dir_cntrl.responseToCore.master = ruby_system.network.slave
|
||||
|
||||
dir_cntrl.reqFromRegBuf = MessageBuffer()
|
||||
dir_cntrl.reqFromRegBuf.slave = ruby_system.network.master
|
||||
|
||||
dir_cntrl.reqToRegDir = MessageBuffer(ordered = True)
|
||||
dir_cntrl.reqToRegDir.master = ruby_system.network.slave
|
||||
|
||||
dir_cntrl.reqFromRegDir = MessageBuffer(ordered = True)
|
||||
dir_cntrl.reqFromRegDir.slave = ruby_system.network.master
|
||||
|
||||
dir_cntrl.unblockToRegDir = MessageBuffer()
|
||||
dir_cntrl.unblockToRegDir.master = ruby_system.network.slave
|
||||
|
||||
dir_cntrl.triggerQueue = MessageBuffer(ordered = True)
|
||||
dir_cntrl.L3triggerQueue = MessageBuffer(ordered = True)
|
||||
dir_cntrl.responseFromMemory = MessageBuffer()
|
||||
|
||||
exec("system.dir_cntrl%d = dir_cntrl" % i)
|
||||
dir_cntrl_nodes.append(dir_cntrl)
|
||||
|
||||
mainCluster.add(dir_cntrl)
|
||||
|
||||
reg_cntrl = RegionCntrl(noTCCdir=True,TCC_select_num_bits = TCC_bits)
|
||||
reg_cntrl.create(options, ruby_system, system)
|
||||
reg_cntrl.number_of_TBEs = options.num_tbes
|
||||
reg_cntrl.cpuRegionBufferNum = system.rb_cntrl0.version
|
||||
reg_cntrl.gpuRegionBufferNum = system.tcc_rb_cntrl0.version
|
||||
|
||||
# Connect the Region Dir controllers to the ruby network
|
||||
reg_cntrl.requestToDir = MessageBuffer(ordered = True)
|
||||
reg_cntrl.requestToDir.master = ruby_system.network.slave
|
||||
|
||||
reg_cntrl.notifyToRBuffer = MessageBuffer()
|
||||
reg_cntrl.notifyToRBuffer.master = ruby_system.network.slave
|
||||
|
||||
reg_cntrl.probeToRBuffer = MessageBuffer()
|
||||
reg_cntrl.probeToRBuffer.master = ruby_system.network.slave
|
||||
|
||||
reg_cntrl.responseFromRBuffer = MessageBuffer()
|
||||
reg_cntrl.responseFromRBuffer.slave = ruby_system.network.master
|
||||
|
||||
reg_cntrl.requestFromRegBuf = MessageBuffer()
|
||||
reg_cntrl.requestFromRegBuf.slave = ruby_system.network.master
|
||||
|
||||
reg_cntrl.triggerQueue = MessageBuffer(ordered = True)
|
||||
|
||||
exec("system.reg_cntrl%d = reg_cntrl" % i)
|
||||
|
||||
mainCluster.add(reg_cntrl)
|
||||
|
||||
# Assuming no DMA devices
|
||||
assert(len(dma_devices) == 0)
|
||||
|
||||
# Add cpu/gpu clusters to main cluster
|
||||
mainCluster.add(cpuCluster)
|
||||
mainCluster.add(gpuCluster)
|
||||
|
||||
ruby_system.network.number_of_virtual_networks = 10
|
||||
|
||||
return (cpu_sequencers, dir_cntrl_nodes, mainCluster)
326
configs/ruby/MOESI_AMD_Base.py
Normal file
@@ -0,0 +1,326 @@
#
|
||||
# Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# For use for simulation and test purposes only
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
# may be used to endorse or promote products derived from this software
|
||||
# without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# Author: Lisa Hsu
|
||||
#
|
||||
|
||||
import math
|
||||
import m5
|
||||
from m5.objects import *
|
||||
from m5.defines import buildEnv
|
||||
from Ruby import create_topology
|
||||
from Ruby import send_evicts
|
||||
|
||||
from Cluster import Cluster
|
||||
from Crossbar import Crossbar
|
||||
|
||||
class CntrlBase:
|
||||
_seqs = 0
|
||||
@classmethod
|
||||
def seqCount(cls):
|
||||
# Use the CntrlBase counter, not cls, since we need a global count
|
||||
CntrlBase._seqs += 1
|
||||
return CntrlBase._seqs - 1
|
||||
|
||||
_cntrls = 0
|
||||
@classmethod
|
||||
def cntrlCount(cls):
|
||||
# Use the CntrlBase counter, not cls, since we need a global count
|
||||
CntrlBase._cntrls += 1
|
||||
return CntrlBase._cntrls - 1
|
||||
|
||||
_version = 0
|
||||
@classmethod
|
||||
def versionCount(cls):
|
||||
cls._version += 1 # Use count for this particular type
|
||||
return cls._version - 1
|
||||
|
||||
class L1DCache(RubyCache):
|
||||
resourceStalls = False
|
||||
def create(self, options):
|
||||
self.size = MemorySize(options.l1d_size)
|
||||
self.assoc = options.l1d_assoc
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy()
|
||||
|
||||
class L1ICache(RubyCache):
|
||||
resourceStalls = False
|
||||
def create(self, options):
|
||||
self.size = MemorySize(options.l1i_size)
|
||||
self.assoc = options.l1i_assoc
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy()
|
||||
|
||||
class L2Cache(RubyCache):
|
||||
resourceStalls = False
|
||||
def create(self, options):
|
||||
self.size = MemorySize(options.l2_size)
|
||||
self.assoc = options.l2_assoc
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy()
|
||||
|
||||
class CPCntrl(CorePair_Controller, CntrlBase):
|
||||
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
|
||||
self.L1Icache = L1ICache()
|
||||
self.L1Icache.create(options)
|
||||
self.L1D0cache = L1DCache()
|
||||
self.L1D0cache.create(options)
|
||||
self.L1D1cache = L1DCache()
|
||||
self.L1D1cache.create(options)
|
||||
self.L2cache = L2Cache()
|
||||
self.L2cache.create(options)
|
||||
|
||||
self.sequencer = RubySequencer()
|
||||
self.sequencer.icache_hit_latency = 2
|
||||
self.sequencer.dcache_hit_latency = 2
|
||||
self.sequencer.version = self.seqCount()
|
||||
self.sequencer.icache = self.L1Icache
|
||||
self.sequencer.dcache = self.L1D0cache
|
||||
self.sequencer.ruby_system = ruby_system
|
||||
self.sequencer.coreid = 0
|
||||
self.sequencer.is_cpu_sequencer = True
|
||||
|
||||
self.sequencer1 = RubySequencer()
|
||||
self.sequencer1.version = self.seqCount()
|
||||
self.sequencer1.icache = self.L1Icache
|
||||
self.sequencer1.dcache = self.L1D1cache
|
||||
self.sequencer1.icache_hit_latency = 2
|
||||
self.sequencer1.dcache_hit_latency = 2
|
||||
self.sequencer1.ruby_system = ruby_system
|
||||
self.sequencer1.coreid = 1
|
||||
self.sequencer1.is_cpu_sequencer = True
|
||||
|
||||
self.issue_latency = options.cpu_to_dir_latency
|
||||
self.send_evictions = send_evicts(options)
|
||||
|
||||
self.ruby_system = ruby_system
|
||||
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
class L3Cache(RubyCache):
|
||||
assoc = 8
|
||||
dataArrayBanks = 256
|
||||
tagArrayBanks = 256
|
||||
|
||||
def create(self, options, ruby_system, system):
|
||||
self.size = MemorySize(options.l3_size)
|
||||
self.size.value /= options.num_dirs
|
||||
self.dataArrayBanks /= options.num_dirs
|
||||
self.tagArrayBanks /= options.num_dirs
|
||||
self.dataArrayBanks /= options.num_dirs
|
||||
self.tagArrayBanks /= options.num_dirs
|
||||
self.dataAccessLatency = options.l3_data_latency
|
||||
self.tagAccessLatency = options.l3_tag_latency
|
||||
self.resourceStalls = options.no_resource_stalls
|
||||
self.replacement_policy = PseudoLRUReplacementPolicy()
|
||||
|
||||
class L3Cntrl(L3Cache_Controller, CntrlBase):
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
self.L3cache = L3Cache()
|
||||
self.L3cache.create(options, ruby_system, system)
|
||||
|
||||
self.l3_response_latency = max(self.L3cache.dataAccessLatency,
|
||||
self.L3cache.tagAccessLatency)
|
||||
self.ruby_system = ruby_system
|
||||
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
|
||||
req_to_l3, probe_to_l3, resp_to_l3):
|
||||
self.reqToDir = req_to_dir
|
||||
self.respToDir = resp_to_dir
|
||||
self.l3UnblockToDir = l3_unblock_to_dir
|
||||
self.reqToL3 = req_to_l3
|
||||
self.probeToL3 = probe_to_l3
|
||||
self.respToL3 = resp_to_l3
|
||||
|
||||
class DirMem(RubyDirectoryMemory, CntrlBase):
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
|
||||
phys_mem_size = AddrRange(options.mem_size).size()
|
||||
mem_module_size = phys_mem_size / options.num_dirs
|
||||
dir_size = MemorySize('0B')
|
||||
dir_size.value = mem_module_size
|
||||
self.size = dir_size
|
||||
|
||||
class DirCntrl(Directory_Controller, CntrlBase):
|
||||
def create(self, options, ruby_system, system):
|
||||
self.version = self.versionCount()
|
||||
|
||||
self.response_latency = 30
|
||||
|
||||
self.directory = DirMem()
|
||||
self.directory.create(options, ruby_system, system)
|
||||
|
||||
self.L3CacheMemory = L3Cache()
|
||||
self.L3CacheMemory.create(options, ruby_system, system)
|
||||
|
||||
self.l3_hit_latency = max(self.L3CacheMemory.dataAccessLatency,
|
||||
self.L3CacheMemory.tagAccessLatency)
|
||||
|
||||
self.number_of_TBEs = options.num_tbes
|
||||
|
||||
self.ruby_system = ruby_system
|
||||
|
||||
if options.recycle_latency:
|
||||
self.recycle_latency = options.recycle_latency
|
||||
|
||||
self.CPUonly = True
|
||||
|
||||
def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
|
||||
req_to_l3, probe_to_l3, resp_to_l3):
|
||||
self.reqToDir = req_to_dir
|
||||
self.respToDir = resp_to_dir
|
||||
self.l3UnblockToDir = l3_unblock_to_dir
|
||||
self.reqToL3 = req_to_l3
|
||||
self.probeToL3 = probe_to_l3
|
||||
self.respToL3 = resp_to_l3
|
||||
|
||||
def define_options(parser):
|
||||
parser.add_option("--num-subcaches", type="int", default=4)
|
||||
parser.add_option("--l3-data-latency", type="int", default=20)
|
||||
parser.add_option("--l3-tag-latency", type="int", default=15)
|
||||
parser.add_option("--cpu-to-dir-latency", type="int", default=15)
|
||||
parser.add_option("--no-resource-stalls", action="store_false",
|
||||
default=True)
|
||||
parser.add_option("--num-tbes", type="int", default=256)
|
||||
parser.add_option("--l2-latency", type="int", default=50) # load to use
|
||||
|
||||
def create_system(options, full_system, system, dma_devices, ruby_system):
|
||||
if buildEnv['PROTOCOL'] != 'MOESI_AMD_Base':
|
||||
panic("This script requires the MOESI_AMD_Base protocol.")
|
||||
|
||||
cpu_sequencers = []
|
||||
|
||||
#
|
||||
# The ruby network creation expects the list of nodes in the system to
|
||||
# be consistent with the NetDest list. Therefore the l1 controller
|
||||
# nodes must be listed before the directory nodes and directory nodes
|
||||
# before dma nodes, etc.
|
||||
#
|
||||
l1_cntrl_nodes = []
|
||||
l3_cntrl_nodes = []
|
||||
dir_cntrl_nodes = []
|
||||
|
||||
control_count = 0
|
||||
|
||||
#
|
||||
# Must create the individual controllers before the network to ensure
|
||||
# the controller constructors are called before the network constructor
|
||||
#
|
||||
|
||||
# This is the base crossbar that connects the L3s, Dirs, and cpu
|
||||
# Cluster
|
||||
mainCluster = Cluster(extBW = 512, intBW = 512) # 1 TB/s
|
||||
for i in xrange(options.num_dirs):
|
||||
|
||||
dir_cntrl = DirCntrl(TCC_select_num_bits = 0)
|
||||
dir_cntrl.create(options, ruby_system, system)
|
||||
|
||||
# Connect the Directory controller to the ruby network
|
||||
dir_cntrl.requestFromCores = MessageBuffer(ordered = True)
|
||||
dir_cntrl.requestFromCores.slave = ruby_system.network.master
|
||||
|
||||
dir_cntrl.responseFromCores = MessageBuffer()
|
||||
dir_cntrl.responseFromCores.slave = ruby_system.network.master
|
||||
|
||||
dir_cntrl.unblockFromCores = MessageBuffer()
|
||||
dir_cntrl.unblockFromCores.slave = ruby_system.network.master
|
||||
|
||||
dir_cntrl.probeToCore = MessageBuffer()
|
||||
dir_cntrl.probeToCore.master = ruby_system.network.slave
|
||||
|
||||
dir_cntrl.responseToCore = MessageBuffer()
|
||||
dir_cntrl.responseToCore.master = ruby_system.network.slave
|
||||
|
||||
dir_cntrl.triggerQueue = MessageBuffer(ordered = True)
|
||||
dir_cntrl.L3triggerQueue = MessageBuffer(ordered = True)
|
||||
dir_cntrl.responseFromMemory = MessageBuffer()
|
||||
|
||||
exec("system.dir_cntrl%d = dir_cntrl" % i)
|
||||
dir_cntrl_nodes.append(dir_cntrl)
|
||||
|
||||
mainCluster.add(dir_cntrl)
|
||||
|
||||
# Technically this config can support an odd number of cpus, but the top
|
||||
# level config files, such as the ruby_random_tester, will get confused if
|
||||
# the number of cpus does not equal the number of sequencers. Thus make
|
||||
# sure that an even number of cpus is specified.
|
||||
assert((options.num_cpus % 2) == 0)
|
||||
|
||||
# The (num_cpus + 1) / 2 rounding below would still create the right number of controllers for an odd CPU count
|
||||
cpuCluster = Cluster(extBW = 512, intBW = 512) # 1 TB/s
|
||||
for i in xrange((options.num_cpus + 1) / 2):
|
||||
|
||||
cp_cntrl = CPCntrl()
|
||||
cp_cntrl.create(options, ruby_system, system)
|
||||
|
||||
exec("system.cp_cntrl%d = cp_cntrl" % i)
|
||||
#
|
||||
# Add controllers and sequencers to the appropriate lists
|
||||
#
|
||||
cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1])
|
||||
|
||||
# Connect the CP controllers and the network
|
||||
cp_cntrl.requestFromCore = MessageBuffer()
|
||||
cp_cntrl.requestFromCore.master = ruby_system.network.slave
|
||||
|
||||
cp_cntrl.responseFromCore = MessageBuffer()
|
||||
cp_cntrl.responseFromCore.master = ruby_system.network.slave
|
||||
|
||||
cp_cntrl.unblockFromCore = MessageBuffer()
|
||||
cp_cntrl.unblockFromCore.master = ruby_system.network.slave
|
||||
|
||||
cp_cntrl.probeToCore = MessageBuffer()
|
||||
cp_cntrl.probeToCore.slave = ruby_system.network.master
|
||||
|
||||
cp_cntrl.responseToCore = MessageBuffer()
|
||||
cp_cntrl.responseToCore.slave = ruby_system.network.master
|
||||
|
||||
cp_cntrl.mandatoryQueue = MessageBuffer()
|
||||
cp_cntrl.triggerQueue = MessageBuffer(ordered = True)
|
||||
|
||||
cpuCluster.add(cp_cntrl)
|
||||
|
||||
# Assuming no DMA devices
|
||||
assert(len(dma_devices) == 0)
|
||||
|
||||
# Add cpu/gpu clusters to main cluster
|
||||
mainCluster.add(cpuCluster)
|
||||
|
||||
ruby_system.network.number_of_virtual_networks = 10
|
||||
|
||||
return (cpu_sequencers, dir_cntrl_nodes, mainCluster)
|
|
@ -78,7 +78,7 @@ class SourceMeta(type):
|
|||
def __init__(cls, name, bases, dict):
|
||||
super(SourceMeta, cls).__init__(name, bases, dict)
|
||||
cls.all = []
|
||||
|
||||
|
||||
def get(cls, **guards):
|
||||
'''Find all files that match the specified guards. If a source
|
||||
file does not specify a flag, the default is False'''
|
||||
|
@ -367,9 +367,9 @@ def makeTheISA(source, target, env):
|
|||
target_isa = env['TARGET_ISA']
|
||||
def define(isa):
|
||||
return isa.upper() + '_ISA'
|
||||
|
||||
|
||||
def namespace(isa):
|
||||
return isa[0].upper() + isa[1:].lower() + 'ISA'
|
||||
|
||||
|
||||
code = code_formatter()
|
||||
|
@ -407,6 +407,51 @@ def makeTheISA(source, target, env):
|
|||
env.Command('config/the_isa.hh', map(Value, all_isa_list),
|
||||
MakeAction(makeTheISA, Transform("CFG ISA", 0)))
|
||||
|
||||
def makeTheGPUISA(source, target, env):
|
||||
isas = [ src.get_contents() for src in source ]
|
||||
target_gpu_isa = env['TARGET_GPU_ISA']
|
||||
def define(isa):
|
||||
return isa.upper() + '_ISA'
|
||||
|
||||
def namespace(isa):
|
||||
return isa[0].upper() + isa[1:].lower() + 'ISA'
|
||||
|
||||
|
||||
code = code_formatter()
|
||||
code('''\
|
||||
#ifndef __CONFIG_THE_GPU_ISA_HH__
|
||||
#define __CONFIG_THE_GPU_ISA_HH__
|
||||
|
||||
''')
|
||||
|
||||
# create defines for the preprocessing and compile-time determination
|
||||
for i,isa in enumerate(isas):
|
||||
code('#define $0 $1', define(isa), i + 1)
|
||||
code()
|
||||
|
||||
# create an enum for any run-time determination of the ISA; we
|
||||
# reuse the same names as the namespaces
|
||||
code('enum class GPUArch {')
|
||||
for i,isa in enumerate(isas):
|
||||
if i + 1 == len(isas):
|
||||
code(' $0 = $1', namespace(isa), define(isa))
|
||||
else:
|
||||
code(' $0 = $1,', namespace(isa), define(isa))
|
||||
code('};')
|
||||
|
||||
code('''
|
||||
|
||||
#define THE_GPU_ISA ${{define(target_gpu_isa)}}
|
||||
#define TheGpuISA ${{namespace(target_gpu_isa)}}
|
||||
#define THE_GPU_ISA_STR "${{target_gpu_isa}}"
|
||||
|
||||
#endif // __CONFIG_THE_GPU_ISA_HH__''')
|
||||
|
||||
code.write(str(target[0]))
|
||||
|
||||
env.Command('config/the_gpu_isa.hh', map(Value, all_gpu_isa_list),
|
||||
MakeAction(makeTheGPUISA, Transform("CFG ISA", 0)))
|
||||
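# As a sketch of the output (assuming hsail is the only entry in
# all_gpu_isa_list): the generated config/the_gpu_isa.hh defines HSAIL_ISA
# as 1, declares "enum class GPUArch { HsailISA = HSAIL_ISA };", and sets
# THE_GPU_ISA to HSAIL_ISA, TheGpuISA to HsailISA, and THE_GPU_ISA_STR to
# "hsail".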
|
||||
########################################################################
|
||||
#
|
||||
# Prevent any SimObjects from being added after this point, they
|
||||
|
@ -784,7 +829,7 @@ extern "C" {
|
|||
EmbeddedSwig embed_swig_${module}(init_${module});
|
||||
''')
|
||||
code.write(str(target[0]))
|
||||
|
||||
|
||||
# Build all swig modules
|
||||
for swig in SwigSource.all:
|
||||
env.Command([swig.cc_source.tnode, swig.py_source.tnode], swig.tnode,
|
||||
|
@ -959,7 +1004,7 @@ const uint8_t data_${sym}[] = {
|
|||
x = array.array('B', data[i:i+step])
|
||||
code(''.join('%d,' % d for d in x))
|
||||
code.dedent()
|
||||
|
||||
|
||||
code('''};
|
||||
|
||||
EmbeddedPython embedded_${sym}(
|
||||
|
|
|
@ -68,6 +68,14 @@ isa_switch_hdrs = Split('''
|
|||
# Set up this directory to support switching headers
|
||||
make_switching_dir('arch', isa_switch_hdrs, env)
|
||||
|
||||
if env['BUILD_GPU']:
|
||||
gpu_isa_switch_hdrs = Split('''
|
||||
gpu_decoder.hh
|
||||
gpu_types.hh
|
||||
''')
|
||||
|
||||
make_gpu_switching_dir('arch', gpu_isa_switch_hdrs, env)
|
||||
|
||||
#################################################################
|
||||
#
|
||||
# Include architecture-specific files.
|
||||
|
|
67
src/arch/hsail/Brig.h
Normal file
|
@ -0,0 +1,67 @@
|
|||
// University of Illinois/NCSA
|
||||
// Open Source License
|
||||
//
|
||||
// Copyright (c) 2013, Advanced Micro Devices, Inc.
|
||||
// All rights reserved.
|
||||
//
|
||||
// Developed by:
|
||||
//
|
||||
// HSA Team
|
||||
//
|
||||
// Advanced Micro Devices, Inc
|
||||
//
|
||||
// www.amd.com
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
// this software and associated documentation files (the "Software"), to deal with
|
||||
// the Software without restriction, including without limitation the rights to
|
||||
// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
// of the Software, and to permit persons to whom the Software is furnished to do
|
||||
// so, subject to the following conditions:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimers.
|
||||
//
|
||||
// * Redistributions in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimers in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// * Neither the names of the LLVM Team, University of Illinois at
|
||||
// Urbana-Champaign, nor the names of its contributors may be used to
|
||||
// endorse or promote products derived from this Software without specific
|
||||
// prior written permission.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
// SOFTWARE.
|
||||
#ifndef INTERNAL_BRIG_H
|
||||
#define INTERNAL_BRIG_H
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
namespace Brig {
|
||||
#include "Brig_new.hpp"
|
||||
|
||||
// These typedefs provide some backward compatibility with earlier versions
|
||||
// of Brig.h, reducing the number of code changes. The distinct names also
|
||||
// increase legibility by showing the code's intent.
|
||||
typedef BrigBase BrigDirective;
|
||||
typedef BrigBase BrigOperand;
|
||||
|
||||
enum BrigMemoryFenceSegments { // for internal use only
|
||||
//.mnemo={ s/^BRIG_MEMORY_FENCE_SEGMENT_//;lc }
|
||||
//.mnemo_token=_EMMemoryFenceSegments
|
||||
//.mnemo_context=EInstModifierInstFenceContext
|
||||
BRIG_MEMORY_FENCE_SEGMENT_GLOBAL = 0,
|
||||
BRIG_MEMORY_FENCE_SEGMENT_GROUP = 1,
|
||||
BRIG_MEMORY_FENCE_SEGMENT_IMAGE = 2,
|
||||
BRIG_MEMORY_FENCE_SEGMENT_LAST = 3 //.skip
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif // defined(INTERNAL_BRIG_H)
|
1587
src/arch/hsail/Brig_new.hpp
Normal file
File diff suppressed because it is too large
54
src/arch/hsail/SConscript
Normal file
|
@ -0,0 +1,54 @@
|
|||
# -*- mode:python -*-
|
||||
|
||||
# Copyright (c) 2015 Advanced Micro Devices, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# For use for simulation and test purposes only
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
# may be used to endorse or promote products derived from this software
|
||||
# without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# Author: Anthony Gutierrez
|
||||
#
|
||||
|
||||
Import('*')
|
||||
|
||||
if not env['BUILD_GPU']:
|
||||
Return()
|
||||
|
||||
if env['TARGET_GPU_ISA'] == 'hsail':
|
||||
env.Command(['insts/gen_decl.hh', 'gpu_decoder.cc', 'insts/gen_exec.cc'],
|
||||
'gen.py', '$SOURCE $TARGETS')
|
||||
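# Note: gen.py receives the three generated-file paths as its command-line
# arguments (header, decoder, exec source, in that order), which is what
# the argument-count check at the top of gen.py expects.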
|
||||
Source('generic_types.cc')
|
||||
Source('gpu_decoder.cc')
|
||||
Source('insts/branch.cc')
|
||||
Source('insts/gen_exec.cc')
|
||||
Source('insts/gpu_static_inst.cc')
|
||||
Source('insts/main.cc')
|
||||
Source('insts/pseudo_inst.cc')
|
||||
Source('insts/mem.cc')
|
||||
Source('operand.cc')
|
40
src/arch/hsail/SConsopts
Normal file
|
@ -0,0 +1,40 @@
|
|||
# -*- mode:python -*-
|
||||
|
||||
#
|
||||
# Copyright (c) 2015 Advanced Micro Devices, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# For use for simulation and test purposes only
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
# may be used to endorse or promote products derived from this software
|
||||
# without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# Author: Anthony Gutierrez
|
||||
#
|
||||
|
||||
Import('*')
|
||||
|
||||
all_gpu_isa_list.append('hsail')
|
806
src/arch/hsail/gen.py
Executable file
|
@ -0,0 +1,806 @@
|
|||
#! /usr/bin/python
|
||||
|
||||
#
|
||||
# Copyright (c) 2015 Advanced Micro Devices, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# For use for simulation and test purposes only
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
# may be used to endorse or promote products derived from this software
|
||||
# without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# Author: Steve Reinhardt
|
||||
#
|
||||
|
||||
import sys, re
|
||||
|
||||
from m5.util import code_formatter
|
||||
|
||||
if len(sys.argv) != 4:
|
||||
print "Error: need 3 args (file names)"
|
||||
sys.exit(1)
|
||||
|
||||
header_code = code_formatter()
|
||||
decoder_code = code_formatter()
|
||||
exec_code = code_formatter()
|
||||
|
||||
###############
|
||||
#
|
||||
# Generate file prologs (includes etc.)
|
||||
#
|
||||
###############
|
||||
|
||||
header_code('''
|
||||
#include "arch/hsail/insts/decl.hh"
|
||||
#include "base/bitfield.hh"
|
||||
#include "gpu-compute/hsail_code.hh"
|
||||
#include "gpu-compute/wavefront.hh"
|
||||
|
||||
namespace HsailISA
|
||||
{
|
||||
''')
|
||||
header_code.indent()
|
||||
|
||||
decoder_code('''
|
||||
#include "arch/hsail/gpu_decoder.hh"
|
||||
#include "arch/hsail/insts/branch.hh"
|
||||
#include "arch/hsail/insts/decl.hh"
|
||||
#include "arch/hsail/insts/gen_decl.hh"
|
||||
#include "arch/hsail/insts/mem.hh"
|
||||
#include "arch/hsail/insts/mem_impl.hh"
|
||||
#include "gpu-compute/brig_object.hh"
|
||||
|
||||
namespace HsailISA
|
||||
{
|
||||
std::vector<GPUStaticInst*> Decoder::decodedInsts;
|
||||
|
||||
GPUStaticInst*
|
||||
Decoder::decode(MachInst machInst)
|
||||
{
|
||||
using namespace Brig;
|
||||
|
||||
const BrigInstBase *ib = machInst.brigInstBase;
|
||||
const BrigObject *obj = machInst.brigObj;
|
||||
|
||||
switch(ib->opcode) {
|
||||
''')
|
||||
decoder_code.indent()
|
||||
decoder_code.indent()
|
||||
|
||||
exec_code('''
|
||||
#include "arch/hsail/insts/gen_decl.hh"
|
||||
#include "base/intmath.hh"
|
||||
|
||||
namespace HsailISA
|
||||
{
|
||||
''')
|
||||
exec_code.indent()
|
||||
|
||||
###############
|
||||
#
|
||||
# Define code templates for class declarations (for header file)
|
||||
#
|
||||
###############
|
||||
|
||||
# Basic header template for an instruction with no template parameters.
|
||||
header_template_nodt = '''
|
||||
class $class_name : public $base_class
|
||||
{
|
||||
public:
|
||||
typedef $base_class Base;
|
||||
|
||||
$class_name(const Brig::BrigInstBase *ib, const BrigObject *obj)
|
||||
: Base(ib, obj, "$opcode")
|
||||
{
|
||||
}
|
||||
|
||||
void execute(GPUDynInstPtr gpuDynInst);
|
||||
};
|
||||
|
||||
'''
|
||||
|
||||
# Basic header template for an instruction with a single DataType
|
||||
# template parameter.
|
||||
header_template_1dt = '''
|
||||
template<typename DataType>
|
||||
class $class_name : public $base_class<DataType>
|
||||
{
|
||||
public:
|
||||
typedef $base_class<DataType> Base;
|
||||
typedef typename DataType::CType CType;
|
||||
|
||||
$class_name(const Brig::BrigInstBase *ib, const BrigObject *obj)
|
||||
: Base(ib, obj, "$opcode")
|
||||
{
|
||||
}
|
||||
|
||||
void execute(GPUDynInstPtr gpuDynInst);
|
||||
};
|
||||
|
||||
'''
|
||||
|
||||
header_template_1dt_noexec = '''
|
||||
template<typename DataType>
|
||||
class $class_name : public $base_class<DataType>
|
||||
{
|
||||
public:
|
||||
typedef $base_class<DataType> Base;
|
||||
typedef typename DataType::CType CType;
|
||||
|
||||
$class_name(const Brig::BrigInstBase *ib, const BrigObject *obj)
|
||||
: Base(ib, obj, "$opcode")
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
'''
|
||||
|
||||
# Same as header_template_1dt, except the base class has a second
|
||||
# template parameter NumSrcOperands to allow a variable number of
|
||||
# source operands. Note that since this is implemented with an array,
|
||||
# it only works for instructions where all sources are of the same
|
||||
# type (like most arithmetic instructions).
|
||||
header_template_1dt_varsrcs = '''
|
||||
template<typename DataType>
|
||||
class $class_name : public $base_class<DataType, $num_srcs>
|
||||
{
|
||||
public:
|
||||
typedef $base_class<DataType, $num_srcs> Base;
|
||||
typedef typename DataType::CType CType;
|
||||
|
||||
$class_name(const Brig::BrigInstBase *ib, const BrigObject *obj)
|
||||
: Base(ib, obj, "$opcode")
|
||||
{
|
||||
}
|
||||
|
||||
void execute(GPUDynInstPtr gpuDynInst);
|
||||
};
|
||||
|
||||
'''
|
||||
|
||||
# Header template for instruction with two DataType template
|
||||
# parameters, one for the dest and one for the source. This is used
|
||||
# by compare and convert.
|
||||
header_template_2dt = '''
|
||||
template<typename DestDataType, class SrcDataType>
|
||||
class $class_name : public $base_class<DestDataType, SrcDataType>
|
||||
{
|
||||
public:
|
||||
typedef $base_class<DestDataType, SrcDataType> Base;
|
||||
typedef typename DestDataType::CType DestCType;
|
||||
typedef typename SrcDataType::CType SrcCType;
|
||||
|
||||
$class_name(const Brig::BrigInstBase *ib, const BrigObject *obj)
|
||||
: Base(ib, obj, "$opcode")
|
||||
{
|
||||
}
|
||||
|
||||
void execute(GPUDynInstPtr gpuDynInst);
|
||||
};
|
||||
|
||||
'''
|
||||
|
||||
header_templates = {
|
||||
'ArithInst': header_template_1dt_varsrcs,
|
||||
'CmovInst': header_template_1dt,
|
||||
'ClassInst': header_template_1dt,
|
||||
'ShiftInst': header_template_1dt,
|
||||
'ExtractInsertInst': header_template_1dt,
|
||||
'CmpInst': header_template_2dt,
|
||||
'CvtInst': header_template_2dt,
|
||||
'LdInst': '',
|
||||
'StInst': '',
|
||||
'SpecialInstNoSrc': header_template_nodt,
|
||||
'SpecialInst1Src': header_template_nodt,
|
||||
'SpecialInstNoSrcNoDest': '',
|
||||
}
|
||||
|
||||
###############
|
||||
#
|
||||
# Define code templates for exec functions
|
||||
#
|
||||
###############
|
||||
|
||||
# exec function body
|
||||
exec_template_nodt_nosrc = '''
|
||||
void
|
||||
$class_name::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
Wavefront *w = gpuDynInst->wavefront();
|
||||
|
||||
typedef Base::DestCType DestCType;
|
||||
|
||||
const VectorMask &mask = w->get_pred();
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
DestCType dest_val = $expr;
|
||||
this->dest.set(w, lane, dest_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
'''
|
||||
|
||||
exec_template_nodt_1src = '''
|
||||
void
|
||||
$class_name::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
Wavefront *w = gpuDynInst->wavefront();
|
||||
|
||||
typedef Base::DestCType DestCType;
|
||||
typedef Base::SrcCType SrcCType;
|
||||
|
||||
const VectorMask &mask = w->get_pred();
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
SrcCType src_val0 = this->src0.get<SrcCType>(w, lane);
|
||||
DestCType dest_val = $expr;
|
||||
|
||||
this->dest.set(w, lane, dest_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
'''
|
||||
|
||||
exec_template_1dt_varsrcs = '''
|
||||
template<typename DataType>
|
||||
void
|
||||
$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
Wavefront *w = gpuDynInst->wavefront();
|
||||
|
||||
const VectorMask &mask = w->get_pred();
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
CType dest_val;
|
||||
if ($dest_is_src_flag) {
|
||||
dest_val = this->dest.template get<CType>(w, lane);
|
||||
}
|
||||
|
||||
CType src_val[$num_srcs];
|
||||
|
||||
for (int i = 0; i < $num_srcs; ++i) {
|
||||
src_val[i] = this->src[i].template get<CType>(w, lane);
|
||||
}
|
||||
|
||||
dest_val = (CType)($expr);
|
||||
|
||||
this->dest.set(w, lane, dest_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
'''
|
||||
|
||||
exec_template_1dt_3srcs = '''
|
||||
template<typename DataType>
|
||||
void
|
||||
$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
Wavefront *w = gpuDynInst->wavefront();
|
||||
|
||||
typedef typename Base::Src0CType Src0T;
|
||||
typedef typename Base::Src1CType Src1T;
|
||||
typedef typename Base::Src2CType Src2T;
|
||||
|
||||
const VectorMask &mask = w->get_pred();
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
CType dest_val;
|
||||
|
||||
if ($dest_is_src_flag) {
|
||||
dest_val = this->dest.template get<CType>(w, lane);
|
||||
}
|
||||
|
||||
Src0T src_val0 = this->src0.template get<Src0T>(w, lane);
|
||||
Src1T src_val1 = this->src1.template get<Src1T>(w, lane);
|
||||
Src2T src_val2 = this->src2.template get<Src2T>(w, lane);
|
||||
|
||||
dest_val = $expr;
|
||||
|
||||
this->dest.set(w, lane, dest_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
'''
|
||||
|
||||
exec_template_1dt_2src_1dest = '''
|
||||
template<typename DataType>
|
||||
void
|
||||
$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
Wavefront *w = gpuDynInst->wavefront();
|
||||
|
||||
typedef typename Base::DestCType DestT;
|
||||
typedef CType Src0T;
|
||||
typedef typename Base::Src1CType Src1T;
|
||||
|
||||
const VectorMask &mask = w->get_pred();
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
DestT dest_val;
|
||||
if ($dest_is_src_flag) {
|
||||
dest_val = this->dest.template get<DestT>(w, lane);
|
||||
}
|
||||
Src0T src_val0 = this->src0.template get<Src0T>(w, lane);
|
||||
Src1T src_val1 = this->src1.template get<Src1T>(w, lane);
|
||||
|
||||
dest_val = $expr;
|
||||
|
||||
this->dest.set(w, lane, dest_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
'''
|
||||
|
||||
exec_template_shift = '''
|
||||
template<typename DataType>
|
||||
void
|
||||
$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
Wavefront *w = gpuDynInst->wavefront();
|
||||
|
||||
const VectorMask &mask = w->get_pred();
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
CType dest_val;
|
||||
|
||||
if ($dest_is_src_flag) {
|
||||
dest_val = this->dest.template get<CType>(w, lane);
|
||||
}
|
||||
|
||||
CType src_val0 = this->src0.template get<CType>(w, lane);
|
||||
uint32_t src_val1 = this->src1.template get<uint32_t>(w, lane);
|
||||
|
||||
dest_val = $expr;
|
||||
|
||||
this->dest.set(w, lane, dest_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
'''
|
||||
|
||||
exec_template_2dt = '''
|
||||
template<typename DestDataType, class SrcDataType>
|
||||
void
|
||||
$class_name<DestDataType, SrcDataType>::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
Wavefront *w = gpuDynInst->wavefront();
|
||||
|
||||
const VectorMask &mask = w->get_pred();
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
DestCType dest_val;
|
||||
SrcCType src_val[$num_srcs];
|
||||
|
||||
for (int i = 0; i < $num_srcs; ++i) {
|
||||
src_val[i] = this->src[i].template get<SrcCType>(w, lane);
|
||||
}
|
||||
|
||||
dest_val = $expr;
|
||||
|
||||
this->dest.set(w, lane, dest_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
'''
|
||||
|
||||
exec_templates = {
|
||||
'ArithInst': exec_template_1dt_varsrcs,
|
||||
'CmovInst': exec_template_1dt_3srcs,
|
||||
'ExtractInsertInst': exec_template_1dt_3srcs,
|
||||
'ClassInst': exec_template_1dt_2src_1dest,
|
||||
'CmpInst': exec_template_2dt,
|
||||
'CvtInst': exec_template_2dt,
|
||||
'LdInst': '',
|
||||
'StInst': '',
|
||||
'SpecialInstNoSrc': exec_template_nodt_nosrc,
|
||||
'SpecialInst1Src': exec_template_nodt_1src,
|
||||
'SpecialInstNoSrcNoDest': '',
|
||||
}
|
||||
|
||||
###############
|
||||
#
|
||||
# Define code templates for the decoder cases
|
||||
#
|
||||
###############
|
||||
|
||||
# decode template for nodt-opcode case
|
||||
decode_nodt_template = '''
|
||||
case BRIG_OPCODE_$brig_opcode_upper: return $constructor(ib, obj);'''
|
||||
|
||||
decode_case_prolog_class_inst = '''
|
||||
case BRIG_OPCODE_$brig_opcode_upper:
|
||||
{
|
||||
//const BrigOperandBase *baseOp = obj->getOperand(ib->operands[1]);
|
||||
BrigType16_t type = ((BrigInstSourceType*)ib)->sourceType;
|
||||
//switch (baseOp->kind) {
|
||||
// case BRIG_OPERAND_REG:
|
||||
// type = ((const BrigOperandReg*)baseOp)->type;
|
||||
// break;
|
||||
// case BRIG_OPERAND_IMMED:
|
||||
// type = ((const BrigOperandImmed*)baseOp)->type;
|
||||
// break;
|
||||
// default:
|
||||
// fatal("CLASS unrecognized kind of operand %d\\n",
|
||||
// baseOp->kind);
|
||||
//}
|
||||
switch (type) {'''
|
||||
|
||||
# common prolog for 1dt- or 2dt-opcode case: switch on data type
|
||||
decode_case_prolog = '''
|
||||
case BRIG_OPCODE_$brig_opcode_upper:
|
||||
{
|
||||
switch (ib->type) {'''
|
||||
|
||||
# single-level decode case entry (for 1dt opcodes)
|
||||
decode_case_entry = \
|
||||
' case BRIG_TYPE_$type_name: return $constructor(ib, obj);'
|
||||
|
||||
decode_store_prolog = \
|
||||
' case BRIG_TYPE_$type_name: {'
|
||||
|
||||
decode_store_case_epilog = '''
|
||||
}'''
|
||||
|
||||
decode_store_case_entry = \
|
||||
' return $constructor(ib, obj);'
|
||||
|
||||
# common epilog for type switch
|
||||
decode_case_epilog = '''
|
||||
default: fatal("$brig_opcode_upper: unrecognized type %d\\n",
|
||||
ib->type);
|
||||
}
|
||||
}
|
||||
break;'''
|
||||
|
||||
# Additional templates for nested decode on a second type field (for
|
||||
# compare and convert). These are used in place of the
|
||||
# decode_case_entry template to create a second-level switch on on the
|
||||
# second type field inside each case of the first-level type switch.
|
||||
# Because the name and location of the second type can vary, the Brig
|
||||
# instruction type must be provided in $brig_type, and the name of the
|
||||
# second type field must be provided in $type_field.
|
||||
decode_case2_prolog = '''
|
||||
case BRIG_TYPE_$type_name:
|
||||
switch (((Brig$brig_type*)ib)->$type2_field) {'''
|
||||
|
||||
decode_case2_entry = \
|
||||
' case BRIG_TYPE_$type2_name: return $constructor(ib, obj);'
|
||||
|
||||
decode_case2_epilog = '''
|
||||
default: fatal("$brig_opcode_upper: unrecognized $type2_field %d\\n",
|
||||
((Brig$brig_type*)ib)->$type2_field);
|
||||
}
|
||||
break;'''
|
||||
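# Roughly, for an opcode such as Cvt these templates expand into nested
# switches of the following shape (illustrative only):
#   case BRIG_OPCODE_CVT: {
#       switch (ib->type) {
#           case BRIG_TYPE_U32:
#               switch (((BrigInstCvt*)ib)->sourceType) {
#                   case BRIG_TYPE_F32: return new Cvt<U32,F32>(ib, obj);
#                   ...
#               }
#               break;
#           ...
#       }
#   }
#   break;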
|
||||
# Figure out how many source operands an expr needs by looking for the
|
||||
# highest-numbered srcN value referenced. Since sources are numbered
|
||||
# starting at 0, the return value is N+1.
|
||||
def num_src_operands(expr):
|
||||
if expr.find('src2') != -1:
|
||||
return 3
|
||||
elif expr.find('src1') != -1:
|
||||
return 2
|
||||
elif expr.find('src0') != -1:
|
||||
return 1
|
||||
else:
|
||||
return 0
|
||||
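# For example: num_src_operands('src0 + src1') == 2,
# num_src_operands('src0 * src1 + src2') == 3, and an expression with no
# srcN references (e.g. 'lane') yields 0.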
|
||||
###############
|
||||
#
|
||||
# Define final code generation methods
|
||||
#
|
||||
# The gen() and gen_special() methods below are the interface for
|
||||
# generating actual instructions.
|
||||
#
|
||||
###############
|
||||
|
||||
# Generate class declaration, exec function, and decode switch case
|
||||
# for a brig_opcode with a single-level type switch. The 'types'
|
||||
# parameter is a list or tuple of types for which the instruction
|
||||
# should be instantiated.
|
||||
def gen(brig_opcode, types=None, expr=None, base_class='ArithInst',
|
||||
type2_info=None, constructor_prefix='new ', is_store=False):
|
||||
brig_opcode_upper = brig_opcode.upper()
|
||||
class_name = brig_opcode
|
||||
opcode = class_name.lower()
|
||||
|
||||
if base_class == 'ArithInst':
|
||||
# note that expr must be provided with ArithInst so we can
|
||||
# derive num_srcs for the template
|
||||
assert expr
|
||||
|
||||
if expr:
|
||||
# Derive several bits of info from expr. If expr is not used,
|
||||
# this info will be irrelevant.
|
||||
num_srcs = num_src_operands(expr)
|
||||
# if the RHS expression includes 'dest', then we're doing an RMW
|
||||
# on the reg and we need to treat it like a source
|
||||
dest_is_src = expr.find('dest') != -1
|
||||
dest_is_src_flag = str(dest_is_src).lower() # for C++
|
||||
if base_class in ['ShiftInst']:
|
||||
expr = re.sub(r'\bsrc(\d)\b', r'src_val\1', expr)
|
||||
elif base_class in ['ArithInst', 'CmpInst', 'CvtInst']:
|
||||
expr = re.sub(r'\bsrc(\d)\b', r'src_val[\1]', expr)
|
||||
else:
|
||||
expr = re.sub(r'\bsrc(\d)\b', r'src_val\1', expr)
|
||||
expr = re.sub(r'\bdest\b', r'dest_val', expr)
|
||||
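# For example (illustrative): gen('Add', arith_types, 'src0 + src1') with
# base_class 'ArithInst' gives num_srcs == 2, dest_is_src_flag == 'false',
# and rewrites expr to 'src_val[0] + src_val[1]', matching the src_val[]
# array used by exec_template_1dt_varsrcs.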
|
||||
# Strip template arguments off of base class before looking up
|
||||
# appropriate templates
|
||||
base_class_base = re.sub(r'<.*>$', '', base_class)
|
||||
header_code(header_templates[base_class_base])
|
||||
|
||||
if base_class.startswith('SpecialInst'):
|
||||
exec_code(exec_templates[base_class_base])
|
||||
elif base_class.startswith('ShiftInst'):
|
||||
header_code(exec_template_shift)
|
||||
else:
|
||||
header_code(exec_templates[base_class_base])
|
||||
|
||||
if not types or isinstance(types, str):
|
||||
# Just a single type
|
||||
constructor = constructor_prefix + class_name
|
||||
decoder_code(decode_nodt_template)
|
||||
else:
|
||||
# multiple types, need at least one level of decode
|
||||
if brig_opcode == 'Class':
|
||||
decoder_code(decode_case_prolog_class_inst)
|
||||
else:
|
||||
decoder_code(decode_case_prolog)
|
||||
if not type2_info:
|
||||
if is_store == False:
|
||||
# single list of types, so do a basic one-level decode
|
||||
for type_name in types:
|
||||
full_class_name = '%s<%s>' % (class_name, type_name.upper())
|
||||
constructor = constructor_prefix + full_class_name
|
||||
decoder_code(decode_case_entry)
|
||||
else:
|
||||
# single list of types; for stores, emit a nested case block per type
|
||||
for type_name in types:
|
||||
decoder_code(decode_store_prolog)
|
||||
type_size = int(re.findall(r'[0-9]+', type_name)[0])
|
||||
src_size = 32
|
||||
type_type = type_name[0]
|
||||
full_class_name = '%s<%s,%s>' % (class_name, \
|
||||
type_name.upper(), \
|
||||
'%s%d' % \
|
||||
(type_type.upper(), \
|
||||
type_size))
|
||||
constructor = constructor_prefix + full_class_name
|
||||
decoder_code(decode_store_case_entry)
|
||||
decoder_code(decode_store_case_epilog)
|
||||
else:
|
||||
# need secondary type switch (convert, compare)
|
||||
# unpack extra info on second switch
|
||||
(type2_field, types2) = type2_info
|
||||
brig_type = 'Inst%s' % brig_opcode
|
||||
for type_name in types:
|
||||
decoder_code(decode_case2_prolog)
|
||||
fmt = '%s<%s,%%s>' % (class_name, type_name.upper())
|
||||
for type2_name in types2:
|
||||
full_class_name = fmt % type2_name.upper()
|
||||
constructor = constructor_prefix + full_class_name
|
||||
decoder_code(decode_case2_entry)
|
||||
|
||||
decoder_code(decode_case2_epilog)
|
||||
|
||||
decoder_code(decode_case_epilog)
|
||||
|
||||
###############
|
||||
#
|
||||
# Generate instructions
|
||||
#
|
||||
###############
|
||||
|
||||
# handy abbreviations for common sets of types
|
||||
|
||||
# arithmetic ops are typically defined only on 32- and 64-bit sizes
|
||||
arith_int_types = ('S32', 'U32', 'S64', 'U64')
|
||||
arith_float_types = ('F32', 'F64')
|
||||
arith_types = arith_int_types + arith_float_types
|
||||
|
||||
bit_types = ('B1', 'B32', 'B64')
|
||||
|
||||
all_int_types = ('S8', 'U8', 'S16', 'U16') + arith_int_types
|
||||
|
||||
# I think you might be able to do 'f16' memory ops too, but we'll
|
||||
# ignore them for now.
|
||||
mem_types = all_int_types + arith_float_types
|
||||
mem_atom_types = all_int_types + ('B32', 'B64')
|
||||
|
||||
##### Arithmetic & logical operations
|
||||
gen('Add', arith_types, 'src0 + src1')
|
||||
gen('Sub', arith_types, 'src0 - src1')
|
||||
gen('Mul', arith_types, 'src0 * src1')
|
||||
gen('Div', arith_types, 'src0 / src1')
|
||||
gen('Min', arith_types, 'std::min(src0, src1)')
|
||||
gen('Max', arith_types, 'std::max(src0, src1)')
|
||||
gen('Gcnmin', arith_types, 'std::min(src0, src1)')
|
||||
|
||||
gen('CopySign', arith_float_types,
|
||||
'src1 < 0 ? -std::abs(src0) : std::abs(src0)')
|
||||
gen('Sqrt', arith_float_types, 'sqrt(src0)')
|
||||
gen('Floor', arith_float_types, 'floor(src0)')
|
||||
|
||||
# "fast" sqrt... same as slow for us
|
||||
gen('Nsqrt', arith_float_types, 'sqrt(src0)')
|
||||
gen('Nrsqrt', arith_float_types, '1.0/sqrt(src0)')
|
||||
gen('Nrcp', arith_float_types, '1.0/src0')
|
||||
gen('Fract', arith_float_types,
|
||||
'(src0 >= 0.0)?(src0-floor(src0)):(floor(src0)-src0)')
|
||||
|
||||
gen('Ncos', arith_float_types, 'cos(src0)')
|
||||
gen('Nsin', arith_float_types, 'sin(src0)')
|
||||
|
||||
gen('And', bit_types, 'src0 & src1')
|
||||
gen('Or', bit_types, 'src0 | src1')
|
||||
gen('Xor', bit_types, 'src0 ^ src1')
|
||||
|
||||
gen('Bitselect', bit_types, '(src1 & src0) | (src2 & ~src0)')
|
||||
gen('Firstbit', bit_types, 'firstbit(src0)')
|
||||
gen('Popcount', ('B32', 'B64'), '__builtin_popcount(src0)')
|
||||
|
||||
gen('Shl', arith_int_types, 'src0 << (unsigned)src1', 'ShiftInst')
|
||||
gen('Shr', arith_int_types, 'src0 >> (unsigned)src1', 'ShiftInst')
|
||||
|
||||
# gen('Mul_hi', types=('s32','u32', '??'))
|
||||
# gen('Mul24', types=('s32','u32', '??'))
|
||||
gen('Rem', arith_int_types, 'src0 - ((src0 / src1) * src1)')
|
||||
|
||||
gen('Abs', arith_types, 'std::abs(src0)')
|
||||
gen('Neg', arith_types, '-src0')
|
||||
|
||||
gen('Mov', bit_types, 'src0')
|
||||
gen('Not', bit_types, 'heynot(src0)')
|
||||
|
||||
# mad and fma differ only in rounding behavior, which we don't emulate
|
||||
# also there's an integer form of mad, but not of fma
|
||||
gen('Mad', arith_types, 'src0 * src1 + src2')
|
||||
gen('Fma', arith_float_types, 'src0 * src1 + src2')
|
||||
|
||||
# native floating point operations
|
||||
gen('Nfma', arith_float_types, 'src0 * src1 + src2')
|
||||
|
||||
gen('Cmov', bit_types, 'src0 ? src1 : src2', 'CmovInst')
|
||||
gen('BitAlign', bit_types, '(src0 << src2)|(src1 >> (32 - src2))')
|
||||
gen('ByteAlign', bit_types, '(src0 << 8 * src2)|(src1 >> (32 - 8 * src2))')
|
||||
|
||||
# see base/bitfield.hh
|
||||
gen('BitExtract', arith_int_types, 'bits(src0, src1, src1 + src2 - 1)',
|
||||
'ExtractInsertInst')
|
||||
|
||||
gen('BitInsert', arith_int_types, 'insertBits(dest, src1, src2, src0)',
|
||||
'ExtractInsertInst')
|
||||
|
||||
##### Compare
|
||||
gen('Cmp', ('B1', 'S32', 'U32', 'F32'), 'compare(src0, src1, this->cmpOp)',
|
||||
'CmpInst', ('sourceType', arith_types + bit_types))
|
||||
gen('Class', arith_float_types, 'fpclassify(src0,src1)', 'ClassInst')
|
||||
|
||||
##### Conversion
|
||||
|
||||
# Conversion operations are only defined on B1, not B32 or B64
|
||||
cvt_types = ('B1',) + mem_types
|
||||
|
||||
gen('Cvt', cvt_types, 'src0', 'CvtInst', ('sourceType', cvt_types))
|
||||
|
||||
|
||||
##### Load & Store
|
||||
gen('Lda', mem_types, base_class = 'LdInst', constructor_prefix='decode')
|
||||
gen('Ld', mem_types, base_class = 'LdInst', constructor_prefix='decode')
|
||||
gen('St', mem_types, base_class = 'StInst', constructor_prefix='decode',
|
||||
is_store=True)
|
||||
gen('Atomic', mem_atom_types, base_class='StInst', constructor_prefix='decode')
|
||||
gen('AtomicNoRet', mem_atom_types, base_class='StInst',
|
||||
constructor_prefix='decode')
|
||||
|
||||
gen('Cbr', base_class = 'LdInst', constructor_prefix='decode')
|
||||
gen('Br', base_class = 'LdInst', constructor_prefix='decode')
|
||||
|
||||
##### Special operations
|
||||
def gen_special(brig_opcode, expr, dest_type='U32'):
|
||||
num_srcs = num_src_operands(expr)
|
||||
if num_srcs == 0:
|
||||
base_class = 'SpecialInstNoSrc<%s>' % dest_type
|
||||
elif num_srcs == 1:
|
||||
base_class = 'SpecialInst1Src<%s>' % dest_type
|
||||
else:
|
||||
assert False
|
||||
|
||||
gen(brig_opcode, None, expr, base_class)
|
||||
|
||||
gen_special('WorkItemId', 'w->workitemid[src0][lane]')
|
||||
gen_special('WorkItemAbsId',
|
||||
'w->workitemid[src0][lane] + (w->workgroupid[src0] * w->workgroupsz[src0])')
|
||||
gen_special('WorkGroupId', 'w->workgroupid[src0]')
|
||||
gen_special('WorkGroupSize', 'w->workgroupsz[src0]')
|
||||
gen_special('CurrentWorkGroupSize', 'w->workgroupsz[src0]')
|
||||
gen_special('GridSize', 'w->gridsz[src0]')
|
||||
gen_special('GridGroups',
|
||||
'divCeil(w->gridsz[src0],w->workgroupsz[src0])')
|
||||
gen_special('LaneId', 'lane')
|
||||
gen_special('WaveId', 'w->dynwaveid')
|
||||
gen_special('Clock', 'w->computeUnit->shader->tick_cnt', 'U64')
|
||||
|
||||
# gen_special('CU'', ')
|
||||
|
||||
gen('Ret', base_class='SpecialInstNoSrcNoDest')
|
||||
gen('Barrier', base_class='SpecialInstNoSrcNoDest')
|
||||
gen('MemFence', base_class='SpecialInstNoSrcNoDest')
|
||||
|
||||
# Map magic instructions to the BrigSyscall opcode
|
||||
# Magic instructions are defined in magic.hh
|
||||
#
|
||||
# In the future, real HSA kernel system calls can be implemented and coexist
|
||||
# with magic instructions.
|
||||
gen('Call', base_class='SpecialInstNoSrcNoDest')
|
||||
|
||||
###############
|
||||
#
|
||||
# Generate file epilogs
|
||||
#
|
||||
###############
|
||||
header_code.dedent()
|
||||
header_code('''
|
||||
} // namespace HsailISA
|
||||
''')
|
||||
|
||||
# close off main decode switch
|
||||
decoder_code.dedent()
|
||||
decoder_code.dedent()
|
||||
decoder_code('''
|
||||
default: fatal("unrecognized Brig opcode %d\\n", ib->opcode);
|
||||
} // end switch(ib->opcode)
|
||||
} // end decode()
|
||||
} // namespace HsailISA
|
||||
''')
|
||||
|
||||
exec_code.dedent()
|
||||
exec_code('''
|
||||
} // namespace HsailISA
|
||||
''')
|
||||
|
||||
###############
|
||||
#
|
||||
# Output accumulated code to files
|
||||
#
|
||||
###############
|
||||
header_code.write(sys.argv[1])
|
||||
decoder_code.write(sys.argv[2])
|
||||
exec_code.write(sys.argv[3])
|
47
src/arch/hsail/generic_types.cc
Normal file
|
@ -0,0 +1,47 @@
|
|||
#include "arch/hsail/generic_types.hh"
|
||||
#include "base/misc.hh"
|
||||
|
||||
using namespace Brig;
|
||||
|
||||
namespace HsailISA
|
||||
{
|
||||
Enums::GenericMemoryOrder
|
||||
getGenericMemoryOrder(BrigMemoryOrder brig_memory_order)
|
||||
{
|
||||
switch(brig_memory_order) {
|
||||
case BRIG_MEMORY_ORDER_NONE:
|
||||
return Enums::MEMORY_ORDER_NONE;
|
||||
case BRIG_MEMORY_ORDER_RELAXED:
|
||||
return Enums::MEMORY_ORDER_RELAXED;
|
||||
case BRIG_MEMORY_ORDER_SC_ACQUIRE:
|
||||
return Enums::MEMORY_ORDER_SC_ACQUIRE;
|
||||
case BRIG_MEMORY_ORDER_SC_RELEASE:
|
||||
return Enums::MEMORY_ORDER_SC_RELEASE;
|
||||
case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
|
||||
return Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE;
|
||||
default:
|
||||
fatal("HsailISA::MemInst::getGenericMemoryOrder -> ",
|
||||
"bad BrigMemoryOrder\n");
|
||||
}
|
||||
}
|
||||
|
||||
Enums::GenericMemoryScope
|
||||
getGenericMemoryScope(BrigMemoryScope brig_memory_scope)
|
||||
{
|
||||
switch(brig_memory_scope) {
|
||||
case BRIG_MEMORY_SCOPE_NONE:
|
||||
return Enums::MEMORY_SCOPE_NONE;
|
||||
case BRIG_MEMORY_SCOPE_WORKITEM:
|
||||
return Enums::MEMORY_SCOPE_WORKITEM;
|
||||
case BRIG_MEMORY_SCOPE_WORKGROUP:
|
||||
return Enums::MEMORY_SCOPE_WORKGROUP;
|
||||
case BRIG_MEMORY_SCOPE_AGENT:
|
||||
return Enums::MEMORY_SCOPE_DEVICE;
|
||||
case BRIG_MEMORY_SCOPE_SYSTEM:
|
||||
return Enums::MEMORY_SCOPE_SYSTEM;
|
||||
default:
|
||||
fatal("HsailISA::MemInst::getGenericMemoryScope -> ",
|
||||
"bad BrigMemoryScope\n");
|
||||
}
|
||||
}
|
||||
} // namespace HsailISA
|
16
src/arch/hsail/generic_types.hh
Normal file
|
@ -0,0 +1,16 @@
|
|||
#ifndef __ARCH_HSAIL_GENERIC_TYPES_HH__
|
||||
#define __ARCH_HSAIL_GENERIC_TYPES_HH__
|
||||
|
||||
#include "arch/hsail/Brig.h"
|
||||
#include "enums/GenericMemoryOrder.hh"
|
||||
#include "enums/GenericMemoryScope.hh"
|
||||
|
||||
namespace HsailISA
|
||||
{
|
||||
Enums::GenericMemoryOrder
|
||||
getGenericMemoryOrder(Brig::BrigMemoryOrder brig_memory_order);
|
||||
Enums::GenericMemoryScope
|
||||
getGenericMemoryScope(Brig::BrigMemoryScope brig_memory_scope);
|
||||
} // namespace HsailISA
|
||||
|
||||
#endif // __ARCH_HSAIL_GENERIC_TYPES_HH__
|
77
src/arch/hsail/gpu_decoder.hh
Normal file
|
@ -0,0 +1,77 @@
|
|||
/*
|
||||
* Copyright (c) 2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Anthony Gutierrez
|
||||
*/
|
||||
|
||||
#ifndef __ARCH_HSAIL_GPU_DECODER_HH__
|
||||
#define __ARCH_HSAIL_GPU_DECODER_HH__
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "arch/hsail/gpu_types.hh"
|
||||
|
||||
class BrigObject;
|
||||
class GPUStaticInst;
|
||||
|
||||
namespace Brig
|
||||
{
|
||||
class BrigInstBase;
|
||||
}
|
||||
|
||||
namespace HsailISA
|
||||
{
|
||||
class Decoder
|
||||
{
|
||||
public:
|
||||
GPUStaticInst* decode(MachInst machInst);
|
||||
|
||||
GPUStaticInst*
|
||||
decode(RawMachInst inst)
|
||||
{
|
||||
return inst < decodedInsts.size() ? decodedInsts.at(inst) : nullptr;
|
||||
}
|
||||
|
||||
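// saveInst() and decode(RawMachInst) form a pair: saveInst() records a
// decoded instruction and returns its index into decodedInsts, and that
// index is the RawMachInst later handed back to decode(RawMachInst) to
// retrieve the same GPUStaticInst.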
RawMachInst
|
||||
saveInst(GPUStaticInst *decodedInst)
|
||||
{
|
||||
decodedInsts.push_back(decodedInst);
|
||||
|
||||
return decodedInsts.size() - 1;
|
||||
}
|
||||
|
||||
private:
|
||||
static std::vector<GPUStaticInst*> decodedInsts;
|
||||
};
|
||||
} // namespace HsailISA
|
||||
|
||||
#endif // __ARCH_HSAIL_GPU_DECODER_HH__
|
69
src/arch/hsail/gpu_types.hh
Normal file
|
@ -0,0 +1,69 @@
|
|||
/*
|
||||
* Copyright (c) 2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Anthony Gutierrez
|
||||
*/
|
||||
|
||||
#ifndef __ARCH_HSAIL_GPU_TYPES_HH__
|
||||
#define __ARCH_HSAIL_GPU_TYPES_HH__
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
namespace Brig
|
||||
{
|
||||
class BrigInstBase;
|
||||
}
|
||||
|
||||
class BrigObject;
|
||||
|
||||
namespace HsailISA
|
||||
{
|
||||
// A raw machine instruction represents the raw bits that
|
||||
// our model uses to represent an actual instruction. In
|
||||
// the case of HSAIL this is just an index into a list of
|
||||
// instruction objects.
|
||||
typedef uint64_t RawMachInst;
|
||||
|
||||
// The MachInst is a representation of an instruction
|
||||
// that has more information than just the machine code.
|
||||
// For HSAIL the actual machine code is a BrigInstBase
|
||||
// and the BrigObject contains more pertinent
|
||||
// information related to operands, etc.
|
||||
|
||||
struct MachInst
|
||||
{
|
||||
const Brig::BrigInstBase *brigInstBase;
|
||||
const BrigObject *brigObj;
|
||||
};
|
||||
}
|
||||
|
||||
#endif // __ARCH_HSAIL_GPU_TYPES_HH__
|
86
src/arch/hsail/insts/branch.cc
Normal file
|
@ -0,0 +1,86 @@
|
|||
/*
|
||||
* Copyright (c) 2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Anthony Gutierrez
|
||||
*/
|
||||
|
||||
#include "arch/hsail/insts/branch.hh"
|
||||
|
||||
#include "gpu-compute/hsail_code.hh"
|
||||
|
||||
namespace HsailISA
|
||||
{
|
||||
GPUStaticInst*
|
||||
decodeBrn(const Brig::BrigInstBase *ib, const BrigObject *obj)
|
||||
{
|
||||
// Detect direct vs indirect branch by seeing whether we have a
|
||||
// register operand.
|
||||
unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
|
||||
const Brig::BrigOperand *reg = obj->getOperand(op_offs);
|
||||
|
||||
if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
|
||||
return new BrnIndirectInst(ib, obj);
|
||||
} else {
|
||||
return new BrnDirectInst(ib, obj);
|
||||
}
|
||||
}
|
||||
|
||||
GPUStaticInst*
|
||||
decodeCbr(const Brig::BrigInstBase *ib, const BrigObject *obj)
|
||||
{
|
||||
// Detect direct vs indirect branch by seeing whether we have a
|
||||
// second register operand (after the condition).
|
||||
unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
|
||||
const Brig::BrigOperand *reg = obj->getOperand(op_offs);
|
||||
|
||||
if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
|
||||
return new CbrIndirectInst(ib, obj);
|
||||
} else {
|
||||
return new CbrDirectInst(ib, obj);
|
||||
}
|
||||
}
|
||||
|
||||
GPUStaticInst*
|
||||
decodeBr(const Brig::BrigInstBase *ib, const BrigObject *obj)
|
||||
{
|
||||
// Detect direct vs indirect branch by seeing whether we have a
|
||||
// second register operand (after the condition).
|
||||
unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
|
||||
const Brig::BrigOperand *reg = obj->getOperand(op_offs);
|
||||
|
||||
if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
|
||||
return new BrIndirectInst(ib, obj);
|
||||
} else {
|
||||
return new BrDirectInst(ib, obj);
|
||||
}
|
||||
}
|
||||
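// Editor's note: an illustrative sketch, not part of this commit. These
// decode helpers are presumably selected by the BRIG opcode from the main
// decode path (decl.hh, whose diff is suppressed further down); the dispatch
// function and the BRIG_OPCODE_* enumerators named here are assumptions made
// for the example only.
GPUStaticInst*
decodeBranchExample(const Brig::BrigInstBase *ib, const BrigObject *obj)
{
    switch (ib->opcode) {
      case Brig::BRIG_OPCODE_BRN:
        return decodeBrn(ib, obj);
      case Brig::BRIG_OPCODE_CBR:
        return decodeCbr(ib, obj);
      case Brig::BRIG_OPCODE_BR:
        return decodeBr(ib, obj);
      default:
        return nullptr; // other opcodes are handled elsewhere
    }
}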
} // namespace HsailISA
442 src/arch/hsail/insts/branch.hh Normal file
@@ -0,0 +1,442 @@
/*
|
||||
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Steve Reinhardt
|
||||
*/
|
||||
|
||||
#ifndef __ARCH_HSAIL_INSTS_BRANCH_HH__
|
||||
#define __ARCH_HSAIL_INSTS_BRANCH_HH__
|
||||
|
||||
#include "arch/hsail/insts/gpu_static_inst.hh"
|
||||
#include "arch/hsail/operand.hh"
|
||||
#include "gpu-compute/gpu_dyn_inst.hh"
|
||||
#include "gpu-compute/wavefront.hh"
|
||||
|
||||
namespace HsailISA
|
||||
{
|
||||
|
||||
// The main difference between a direct branch and an indirect branch
|
||||
// is whether the target is a register or a label, so we can share a
|
||||
// lot of code if we template the base implementation on that type.
|
||||
template<typename TargetType>
|
||||
class BrnInstBase : public HsailGPUStaticInst
|
||||
{
|
||||
public:
|
||||
void generateDisassembly();
|
||||
|
||||
Brig::BrigWidth8_t width;
|
||||
TargetType target;
|
||||
|
||||
BrnInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj)
|
||||
: HsailGPUStaticInst(obj, "brn")
|
||||
{
|
||||
o_type = Enums::OT_BRANCH;
|
||||
width = ((Brig::BrigInstBr*)ib)->width;
|
||||
unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
|
||||
target.init(op_offs, obj);
}
|
||||
|
||||
uint32_t getTargetPc() override { return target.getTarget(0, 0); }
|
||||
|
||||
bool unconditionalJumpInstruction() override { return true; }
|
||||
bool isVectorRegister(int operandIndex) {
|
||||
assert(operandIndex >= 0 && operandIndex < getNumOperands());
|
||||
return target.isVectorRegister();
|
||||
}
|
||||
bool isCondRegister(int operandIndex) {
|
||||
assert(operandIndex >= 0 && operandIndex < getNumOperands());
|
||||
return target.isCondRegister();
|
||||
}
|
||||
bool isScalarRegister(int operandIndex) {
|
||||
assert(operandIndex >= 0 && operandIndex < getNumOperands());
|
||||
return target.isScalarRegister();
|
||||
}
|
||||
|
||||
bool isSrcOperand(int operandIndex) {
|
||||
assert(operandIndex >= 0 && operandIndex < getNumOperands());
|
||||
return true;
|
||||
}
|
||||
|
||||
bool isDstOperand(int operandIndex) {
|
||||
return false;
|
||||
}
|
||||
|
||||
int getOperandSize(int operandIndex) {
|
||||
assert(operandIndex >= 0 && operandIndex < getNumOperands());
|
||||
return target.opSize();
|
||||
}
|
||||
|
||||
int getRegisterIndex(int operandIndex) {
|
||||
assert(operandIndex >= 0 && operandIndex < getNumOperands());
|
||||
return target.regIndex();
|
||||
}
|
||||
|
||||
int getNumOperands() {
|
||||
return 1;
|
||||
}
|
||||
|
||||
void execute(GPUDynInstPtr gpuDynInst);
|
||||
};
|
||||
|
||||
template<typename TargetType>
|
||||
void
|
||||
BrnInstBase<TargetType>::generateDisassembly()
|
||||
{
|
||||
std::string widthClause;
|
||||
|
||||
if (width != 1) {
|
||||
widthClause = csprintf("_width(%d)", width);
|
||||
}
|
||||
|
||||
disassembly = csprintf("%s%s %s", opcode, widthClause,
|
||||
target.disassemble());
|
||||
}
|
||||
|
||||
template<typename TargetType>
|
||||
void
|
||||
BrnInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
Wavefront *w = gpuDynInst->wavefront();
|
||||
|
||||
if (getTargetPc() == w->rpc()) {
|
||||
w->popFromReconvergenceStack();
|
||||
} else {
|
||||
// Rpc and execution mask remain the same
|
||||
w->pc(getTargetPc());
|
||||
}
|
||||
w->discardFetch();
|
||||
}
|
||||
|
||||
class BrnDirectInst : public BrnInstBase<LabelOperand>
|
||||
{
|
||||
public:
|
||||
BrnDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
|
||||
: BrnInstBase<LabelOperand>(ib, obj)
|
||||
{
|
||||
}
|
||||
int numSrcRegOperands() { return 0; }
|
||||
int numDstRegOperands() { return 0; }
|
||||
};
|
||||
|
||||
class BrnIndirectInst : public BrnInstBase<SRegOperand>
|
||||
{
|
||||
public:
|
||||
BrnIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
|
||||
: BrnInstBase<SRegOperand>(ib, obj)
|
||||
{
|
||||
}
|
||||
int numSrcRegOperands() { return target.isVectorRegister(); }
|
||||
int numDstRegOperands() { return 0; }
|
||||
};
|
||||
|
||||
GPUStaticInst* decodeBrn(const Brig::BrigInstBase *ib,
|
||||
const BrigObject *obj);
|
||||
|
||||
template<typename TargetType>
|
||||
class CbrInstBase : public HsailGPUStaticInst
|
||||
{
|
||||
public:
|
||||
void generateDisassembly();
|
||||
|
||||
Brig::BrigWidth8_t width;
|
||||
CRegOperand cond;
|
||||
TargetType target;
|
||||
|
||||
CbrInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj)
|
||||
: HsailGPUStaticInst(obj, "cbr")
|
||||
{
|
||||
o_type = Enums::OT_BRANCH;
|
||||
width = ((Brig::BrigInstBr *)ib)->width;
|
||||
unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
|
||||
cond.init(op_offs, obj);
|
||||
op_offs = obj->getOperandPtr(ib->operands, 1);
|
||||
target.init(op_offs, obj);
}
|
||||
|
||||
uint32_t getTargetPc() override { return target.getTarget(0, 0); }
|
||||
|
||||
void execute(GPUDynInstPtr gpuDynInst);
|
||||
// Assumption: Target is operand 0, Condition Register is operand 1
|
||||
bool isVectorRegister(int operandIndex) {
|
||||
assert(operandIndex >= 0 && operandIndex < getNumOperands());
|
||||
if (!operandIndex)
|
||||
return target.isVectorRegister();
|
||||
else
|
||||
return false;
|
||||
}
|
||||
bool isCondRegister(int operandIndex) {
|
||||
assert(operandIndex >= 0 && operandIndex < getNumOperands());
|
||||
if (!operandIndex)
|
||||
return target.isCondRegister();
|
||||
else
|
||||
return true;
|
||||
}
|
||||
bool isScalarRegister(int operandIndex) {
|
||||
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
|
||||
if (!operandIndex)
|
||||
return target.isScalarRegister();
|
||||
else
|
||||
return false;
|
||||
}
|
||||
bool isSrcOperand(int operandIndex) {
|
||||
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
|
||||
if (operandIndex == 0)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
// both Condition Register and Target are source operands
|
||||
bool isDstOperand(int operandIndex) {
|
||||
return false;
|
||||
}
|
||||
int getOperandSize(int operandIndex) {
|
||||
assert(operandIndex >= 0 && operandIndex < getNumOperands());
|
||||
if (!operandIndex)
|
||||
return target.opSize();
|
||||
else
|
||||
return 1;
|
||||
}
|
||||
int getRegisterIndex(int operandIndex) {
|
||||
assert(operandIndex >= 0 && operandIndex < getNumOperands());
|
||||
if (!operandIndex)
|
||||
return target.regIndex();
|
||||
else
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Operands = Target, Condition Register
|
||||
int getNumOperands() {
|
||||
return 2;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename TargetType>
|
||||
void
|
||||
CbrInstBase<TargetType>::generateDisassembly()
|
||||
{
|
||||
std::string widthClause;
|
||||
|
||||
if (width != 1) {
|
||||
widthClause = csprintf("_width(%d)", width);
|
||||
}
|
||||
|
||||
disassembly = csprintf("%s%s %s,%s", opcode, widthClause,
|
||||
cond.disassemble(), target.disassemble());
|
||||
}
|
||||
|
||||
template<typename TargetType>
|
||||
void
|
||||
CbrInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
Wavefront *w = gpuDynInst->wavefront();
|
||||
|
||||
const uint32_t curr_pc = w->pc();
|
||||
const uint32_t curr_rpc = w->rpc();
|
||||
const VectorMask curr_mask = w->execMask();
|
||||
|
||||
/**
|
||||
* TODO: can we move this pop outside the instruction, and
|
||||
* into the wavefront?
|
||||
*/
|
||||
w->popFromReconvergenceStack();
|
||||
|
||||
// immediate post-dominator instruction
|
||||
const uint32_t rpc = static_cast<uint32_t>(ipdInstNum());
|
||||
if (curr_rpc != rpc) {
|
||||
w->pushToReconvergenceStack(rpc, curr_rpc, curr_mask);
|
||||
}
|
||||
|
||||
// taken branch
|
||||
const uint32_t true_pc = getTargetPc();
|
||||
VectorMask true_mask;
|
||||
for (unsigned int lane = 0; lane < VSZ; ++lane) {
|
||||
true_mask[lane] = cond.get<bool>(w, lane) & curr_mask[lane];
|
||||
}
|
||||
|
||||
// not taken branch
|
||||
const uint32_t false_pc = curr_pc + 1;
|
||||
assert(true_pc != false_pc);
|
||||
if (false_pc != rpc && true_mask.count() < curr_mask.count()) {
|
||||
VectorMask false_mask = curr_mask & ~true_mask;
|
||||
w->pushToReconvergenceStack(false_pc, rpc, false_mask);
|
||||
}
|
||||
|
||||
if (true_pc != rpc && true_mask.count()) {
|
||||
w->pushToReconvergenceStack(true_pc, rpc, true_mask);
|
||||
}
|
||||
assert(w->pc() != curr_pc);
|
||||
w->discardFetch();
|
||||
}
|
||||
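// Editor's note: a minimal, self-contained sketch of the divergence-mask
// arithmetic used in CbrInstBase::execute() above; it is not part of this
// commit. VectorMask is assumed to behave like a std::bitset over the VSZ
// lanes of a wavefront (64 below is only an example width), and <bitset>
// is assumed to be included.
inline void
splitBranchMasks(const std::bitset<64> &curr_mask, const std::bitset<64> &cond,
                 std::bitset<64> &true_mask, std::bitset<64> &false_mask)
{
    true_mask = cond & curr_mask;         // lanes that take the branch
    false_mask = curr_mask & ~true_mask;  // lanes that fall through
    // The not-taken entry is pushed onto the reconvergence stack before the
    // taken entry, so the taken path executes first; both entries carry the
    // immediate post-dominator PC as their reconvergence point.
}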
|
||||
|
||||
class CbrDirectInst : public CbrInstBase<LabelOperand>
|
||||
{
|
||||
public:
|
||||
CbrDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
|
||||
: CbrInstBase<LabelOperand>(ib, obj)
|
||||
{
|
||||
}
|
||||
// the source operand of a conditional branch is a Condition
|
||||
// Register which is not stored in the VRF
|
||||
// so we do not count it as a source-register operand
|
||||
// even though, formally, it is one.
|
||||
int numSrcRegOperands() { return 0; }
|
||||
int numDstRegOperands() { return 0; }
|
||||
};
|
||||
|
||||
class CbrIndirectInst : public CbrInstBase<SRegOperand>
|
||||
{
|
||||
public:
|
||||
CbrIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
|
||||
: CbrInstBase<SRegOperand>(ib, obj)
|
||||
{
|
||||
}
|
||||
// one source operand of the conditional indirect branch is a Condition
|
||||
// register which is not stored in the VRF so we do not count it
|
||||
// as a source-register operand even though, formally, it is one.
|
||||
int numSrcRegOperands() { return target.isVectorRegister(); }
|
||||
int numDstRegOperands() { return 0; }
|
||||
};
|
||||
|
||||
GPUStaticInst* decodeCbr(const Brig::BrigInstBase *ib,
|
||||
const BrigObject *obj);
|
||||
|
||||
template<typename TargetType>
|
||||
class BrInstBase : public HsailGPUStaticInst
|
||||
{
|
||||
public:
|
||||
void generateDisassembly();
|
||||
|
||||
ImmOperand<uint32_t> width;
|
||||
TargetType target;
|
||||
|
||||
BrInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj)
|
||||
: HsailGPUStaticInst(obj, "br")
|
||||
{
|
||||
o_type = Enums::OT_BRANCH;
|
||||
width.init(((Brig::BrigInstBr *)ib)->width, obj);
|
||||
unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
|
||||
target.init(op_offs, obj);
}
|
||||
|
||||
uint32_t getTargetPc() override { return target.getTarget(0, 0); }
|
||||
|
||||
bool unconditionalJumpInstruction() override { return true; }
|
||||
|
||||
void execute(GPUDynInstPtr gpuDynInst);
|
||||
bool isVectorRegister(int operandIndex) {
|
||||
assert(operandIndex >= 0 && operandIndex < getNumOperands());
|
||||
return target.isVectorRegister();
|
||||
}
|
||||
bool isCondRegister(int operandIndex) {
|
||||
assert(operandIndex >= 0 && operandIndex < getNumOperands());
|
||||
return target.isCondRegister();
|
||||
}
|
||||
bool isScalarRegister(int operandIndex) {
|
||||
assert(operandIndex >= 0 && operandIndex < getNumOperands());
|
||||
return target.isScalarRegister();
|
||||
}
|
||||
bool isSrcOperand(int operandIndex) {
|
||||
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
|
||||
return true;
|
||||
}
|
||||
bool isDstOperand(int operandIndex) { return false; }
|
||||
int getOperandSize(int operandIndex) {
|
||||
assert(operandIndex >= 0 && operandIndex < getNumOperands());
|
||||
return target.opSize();
|
||||
}
|
||||
int getRegisterIndex(int operandIndex) {
|
||||
assert(operandIndex >= 0 && operandIndex < getNumOperands());
|
||||
return target.regIndex();
|
||||
}
|
||||
int getNumOperands() { return 1; }
|
||||
};
|
||||
|
||||
template<typename TargetType>
|
||||
void
|
||||
BrInstBase<TargetType>::generateDisassembly()
|
||||
{
|
||||
std::string widthClause;
|
||||
|
||||
if (width.bits != 1) {
|
||||
widthClause = csprintf("_width(%d)", width.bits);
|
||||
}
|
||||
|
||||
disassembly = csprintf("%s%s %s", opcode, widthClause,
|
||||
target.disassemble());
|
||||
}
|
||||
|
||||
template<typename TargetType>
|
||||
void
|
||||
BrInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
Wavefront *w = gpuDynInst->wavefront();
|
||||
|
||||
if (getTargetPc() == w->rpc()) {
|
||||
w->popFromReconvergenceStack();
|
||||
} else {
|
||||
// Rpc and execution mask remain the same
|
||||
w->pc(getTargetPc());
|
||||
}
|
||||
w->discardFetch();
|
||||
}
|
||||
|
||||
class BrDirectInst : public BrInstBase<LabelOperand>
|
||||
{
|
||||
public:
|
||||
BrDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
|
||||
: BrInstBase<LabelOperand>(ib, obj)
|
||||
{
|
||||
}
|
||||
|
||||
int numSrcRegOperands() { return 0; }
|
||||
int numDstRegOperands() { return 0; }
|
||||
};
|
||||
|
||||
class BrIndirectInst : public BrInstBase<SRegOperand>
|
||||
{
|
||||
public:
|
||||
BrIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
|
||||
: BrInstBase<SRegOperand>(ib, obj)
|
||||
{
|
||||
}
|
||||
int numSrcRegOperands() { return target.isVectorRegister(); }
|
||||
int numDstRegOperands() { return 0; }
|
||||
};
|
||||
|
||||
GPUStaticInst* decodeBr(const Brig::BrigInstBase *ib,
|
||||
const BrigObject *obj);
|
||||
} // namespace HsailISA
|
||||
|
||||
#endif // __ARCH_HSAIL_INSTS_BRANCH_HH__
1106 src/arch/hsail/insts/decl.hh Normal file
File diff suppressed because it is too large

64 src/arch/hsail/insts/gpu_static_inst.cc Normal file
@@ -0,0 +1,64 @@
/*
|
||||
* Copyright (c) 2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Anthony Gutierrez
|
||||
*/
|
||||
|
||||
#include "arch/hsail/insts/gpu_static_inst.hh"
|
||||
|
||||
#include "gpu-compute/brig_object.hh"
|
||||
|
||||
namespace HsailISA
|
||||
{
|
||||
HsailGPUStaticInst::HsailGPUStaticInst(const BrigObject *obj,
|
||||
const std::string &opcode)
|
||||
: GPUStaticInst(opcode), hsailCode(obj->currentCode)
|
||||
{
|
||||
}
|
||||
|
||||
void
|
||||
HsailGPUStaticInst::generateDisassembly()
|
||||
{
|
||||
disassembly = opcode;
|
||||
}
|
||||
|
||||
const std::string&
|
||||
HsailGPUStaticInst::disassemble()
|
||||
{
|
||||
if (disassembly.empty()) {
|
||||
generateDisassembly();
|
||||
assert(!disassembly.empty());
|
||||
}
|
||||
|
||||
return disassembly;
|
||||
}
|
||||
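// Editor's note: illustrative usage only, not part of this commit. Because
// the disassembly string is generated lazily and cached above, repeated
// calls on the same static instruction pay the formatting cost once; the
// 'inst' pointer below is hypothetical.
//
//     const std::string &text = inst->disassemble(); // formatted on first call
//     DPRINTF(GPUExec, "executing %s\n", text);      // later calls reuse the cache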
} // namespace HsailISA
65 src/arch/hsail/insts/gpu_static_inst.hh Normal file
@@ -0,0 +1,65 @@
/*
|
||||
* Copyright (c) 2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Anthony Gutierrez
|
||||
*/
|
||||
|
||||
#ifndef __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__
|
||||
#define __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__
|
||||
|
||||
/*
|
||||
* @file gpu_static_inst.hh
|
||||
*
|
||||
* Defines the base class representing HSAIL GPU static instructions.
|
||||
*/
|
||||
|
||||
#include "gpu-compute/gpu_static_inst.hh"
|
||||
|
||||
class BrigObject;
|
||||
class HsailCode;
|
||||
|
||||
namespace HsailISA
|
||||
{
|
||||
class HsailGPUStaticInst : public GPUStaticInst
|
||||
{
|
||||
public:
|
||||
HsailGPUStaticInst(const BrigObject *obj, const std::string &opcode);
|
||||
void generateDisassembly();
|
||||
const std::string &disassemble();
|
||||
uint32_t instSize() { return 4; }
|
||||
|
||||
protected:
|
||||
HsailCode *hsailCode;
|
||||
};
|
||||
} // namespace HsailISA
|
||||
|
||||
#endif // __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__
208 src/arch/hsail/insts/main.cc Normal file
@@ -0,0 +1,208 @@
/*
|
||||
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Steve Reinhardt
|
||||
*/
|
||||
|
||||
#include "arch/hsail/insts/decl.hh"
|
||||
#include "debug/GPUExec.hh"
|
||||
#include "gpu-compute/dispatcher.hh"
|
||||
#include "gpu-compute/simple_pool_manager.hh"
|
||||
|
||||
namespace HsailISA
|
||||
{
|
||||
template<> const char *B1::label = "b1";
|
||||
template<> const char *B8::label = "b8";
|
||||
template<> const char *B16::label = "b16";
|
||||
template<> const char *B32::label = "b32";
|
||||
template<> const char *B64::label = "b64";
|
||||
|
||||
template<> const char *S8::label = "s8";
|
||||
template<> const char *S16::label = "s16";
|
||||
template<> const char *S32::label = "s32";
|
||||
template<> const char *S64::label = "s64";
|
||||
|
||||
template<> const char *U8::label = "u8";
|
||||
template<> const char *U16::label = "u16";
|
||||
template<> const char *U32::label = "u32";
|
||||
template<> const char *U64::label = "u64";
|
||||
|
||||
template<> const char *F32::label = "f32";
|
||||
template<> const char *F64::label = "f64";
|
||||
|
||||
const char*
|
||||
cmpOpToString(Brig::BrigCompareOperation cmpOp)
|
||||
{
|
||||
using namespace Brig;
|
||||
|
||||
switch (cmpOp) {
|
||||
case BRIG_COMPARE_EQ:
|
||||
return "eq";
|
||||
case BRIG_COMPARE_NE:
|
||||
return "ne";
|
||||
case BRIG_COMPARE_LT:
|
||||
return "lt";
|
||||
case BRIG_COMPARE_LE:
|
||||
return "le";
|
||||
case BRIG_COMPARE_GT:
|
||||
return "gt";
|
||||
case BRIG_COMPARE_GE:
|
||||
return "ge";
|
||||
case BRIG_COMPARE_EQU:
|
||||
return "equ";
|
||||
case BRIG_COMPARE_NEU:
|
||||
return "neu";
|
||||
case BRIG_COMPARE_LTU:
|
||||
return "ltu";
|
||||
case BRIG_COMPARE_LEU:
|
||||
return "leu";
|
||||
case BRIG_COMPARE_GTU:
|
||||
return "gtu";
|
||||
case BRIG_COMPARE_GEU:
|
||||
return "geu";
|
||||
case BRIG_COMPARE_NUM:
|
||||
return "num";
|
||||
case BRIG_COMPARE_NAN:
|
||||
return "nan";
|
||||
case BRIG_COMPARE_SEQ:
|
||||
return "seq";
|
||||
case BRIG_COMPARE_SNE:
|
||||
return "sne";
|
||||
case BRIG_COMPARE_SLT:
|
||||
return "slt";
|
||||
case BRIG_COMPARE_SLE:
|
||||
return "sle";
|
||||
case BRIG_COMPARE_SGT:
|
||||
return "sgt";
|
||||
case BRIG_COMPARE_SGE:
|
||||
return "sge";
|
||||
case BRIG_COMPARE_SGEU:
|
||||
return "sgeu";
|
||||
case BRIG_COMPARE_SEQU:
|
||||
return "sequ";
|
||||
case BRIG_COMPARE_SNEU:
|
||||
return "sneu";
|
||||
case BRIG_COMPARE_SLTU:
|
||||
return "sltu";
|
||||
case BRIG_COMPARE_SLEU:
|
||||
return "sleu";
|
||||
case BRIG_COMPARE_SNUM:
|
||||
return "snum";
|
||||
case BRIG_COMPARE_SNAN:
|
||||
return "snan";
|
||||
case BRIG_COMPARE_SGTU:
|
||||
return "sgtu";
|
||||
default:
|
||||
return "unknown";
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
Ret::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
Wavefront *w = gpuDynInst->wavefront();
|
||||
|
||||
const VectorMask &mask = w->get_pred();
|
||||
|
||||
// mask off completed work-items
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
w->init_mask[lane] = 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// delete extra instructions fetched for completed work-items
|
||||
w->instructionBuffer.erase(w->instructionBuffer.begin() + 1,
|
||||
w->instructionBuffer.end());
|
||||
if (w->pendingFetch) {
|
||||
w->dropFetch = true;
|
||||
}
|
||||
|
||||
// if all work-items have completed, then wave-front is done
|
||||
if (w->init_mask.none()) {
|
||||
w->status = Wavefront::S_STOPPED;
|
||||
|
||||
int32_t refCount = w->computeUnit->getLds().
|
||||
decreaseRefCounter(w->dispatchid, w->wg_id);
|
||||
|
||||
DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n",
|
||||
w->computeUnit->cu_id, w->wg_id, refCount);
|
||||
|
||||
// free the vector registers of the completed wavefront
|
||||
w->computeUnit->vectorRegsReserved[w->simdId] -=
|
||||
w->reservedVectorRegs;
|
||||
|
||||
assert(w->computeUnit->vectorRegsReserved[w->simdId] >= 0);
|
||||
|
||||
uint32_t endIndex = (w->startVgprIndex +
|
||||
w->reservedVectorRegs - 1) %
|
||||
w->computeUnit->vrf[w->simdId]->numRegs();
|
||||
|
||||
w->computeUnit->vrf[w->simdId]->manager->
|
||||
freeRegion(w->startVgprIndex, endIndex);
|
||||
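// Editor's note: worked example (not part of this commit) of the circular
// region computation above. With illustrative values numRegs() == 256,
// startVgprIndex == 250 and reservedVectorRegs == 10:
//   endIndex = (250 + 10 - 1) % 256 = 3
// i.e. the freed region [250, 3] wraps around the end of the register file.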
|
||||
w->reservedVectorRegs = 0;
|
||||
w->startVgprIndex = 0;
|
||||
w->computeUnit->completedWfs++;
|
||||
|
||||
DPRINTF(GPUExec, "Doing return for CU%d: WF[%d][%d][%d]\n",
|
||||
w->computeUnit->cu_id, w->simdId, w->wfSlotId, w->wfDynId);
|
||||
|
||||
if (!refCount) {
|
||||
// Notify Memory System of Kernel Completion
|
||||
// Kernel End = isKernel + isRelease
|
||||
w->status = Wavefront::S_RETURNING;
|
||||
GPUDynInstPtr local_mempacket = gpuDynInst;
|
||||
local_mempacket->memoryOrder = Enums::MEMORY_ORDER_SC_RELEASE;
|
||||
local_mempacket->scope = Enums::MEMORY_SCOPE_SYSTEM;
|
||||
local_mempacket->useContinuation = false;
|
||||
local_mempacket->simdId = w->simdId;
|
||||
local_mempacket->wfSlotId = w->wfSlotId;
|
||||
local_mempacket->wfDynId = w->wfDynId;
|
||||
w->computeUnit->injectGlobalMemFence(local_mempacket, true);
|
||||
} else {
|
||||
w->computeUnit->shader->dispatcher->scheduleDispatch();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
Barrier::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
Wavefront *w = gpuDynInst->wavefront();
|
||||
|
||||
assert(w->barrier_cnt == w->old_barrier_cnt);
|
||||
w->barrier_cnt = w->old_barrier_cnt + 1;
|
||||
w->stalledAtBarrier = true;
|
||||
}
|
||||
} // namespace HsailISA
139 src/arch/hsail/insts/mem.cc Normal file
@@ -0,0 +1,139 @@
/*
|
||||
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Steve Reinhardt
|
||||
*/
|
||||
|
||||
#include "arch/hsail/insts/mem.hh"
|
||||
|
||||
#include "arch/hsail/Brig.h"
|
||||
#include "enums/OpType.hh"
|
||||
|
||||
using namespace Brig;
|
||||
|
||||
namespace HsailISA
|
||||
{
|
||||
const char* atomicOpToString(BrigAtomicOperation brigOp);
|
||||
|
||||
Enums::MemOpType
|
||||
brigAtomicToMemOpType(BrigOpcode brigOpCode, BrigAtomicOperation brigOp)
|
||||
{
|
||||
if (brigOpCode == Brig::BRIG_OPCODE_ATOMIC) {
|
||||
switch (brigOp) {
|
||||
case BRIG_ATOMIC_AND:
|
||||
return Enums::MO_AAND;
|
||||
case BRIG_ATOMIC_OR:
|
||||
return Enums::MO_AOR;
|
||||
case BRIG_ATOMIC_XOR:
|
||||
return Enums::MO_AXOR;
|
||||
case BRIG_ATOMIC_CAS:
|
||||
return Enums::MO_ACAS;
|
||||
case BRIG_ATOMIC_EXCH:
|
||||
return Enums::MO_AEXCH;
|
||||
case BRIG_ATOMIC_ADD:
|
||||
return Enums::MO_AADD;
|
||||
case BRIG_ATOMIC_WRAPINC:
|
||||
return Enums::MO_AINC;
|
||||
case BRIG_ATOMIC_WRAPDEC:
|
||||
return Enums::MO_ADEC;
|
||||
case BRIG_ATOMIC_MIN:
|
||||
return Enums::MO_AMIN;
|
||||
case BRIG_ATOMIC_MAX:
|
||||
return Enums::MO_AMAX;
|
||||
case BRIG_ATOMIC_SUB:
|
||||
return Enums::MO_ASUB;
|
||||
default:
|
||||
fatal("Bad BrigAtomicOperation code %d\n", brigOp);
|
||||
}
|
||||
} else if (brigOpCode == Brig::BRIG_OPCODE_ATOMICNORET) {
|
||||
switch (brigOp) {
|
||||
case BRIG_ATOMIC_AND:
|
||||
return Enums::MO_ANRAND;
|
||||
case BRIG_ATOMIC_OR:
|
||||
return Enums::MO_ANROR;
|
||||
case BRIG_ATOMIC_XOR:
|
||||
return Enums::MO_ANRXOR;
|
||||
case BRIG_ATOMIC_CAS:
|
||||
return Enums::MO_ANRCAS;
|
||||
case BRIG_ATOMIC_EXCH:
|
||||
return Enums::MO_ANREXCH;
|
||||
case BRIG_ATOMIC_ADD:
|
||||
return Enums::MO_ANRADD;
|
||||
case BRIG_ATOMIC_WRAPINC:
|
||||
return Enums::MO_ANRINC;
|
||||
case BRIG_ATOMIC_WRAPDEC:
|
||||
return Enums::MO_ANRDEC;
|
||||
case BRIG_ATOMIC_MIN:
|
||||
return Enums::MO_ANRMIN;
|
||||
case BRIG_ATOMIC_MAX:
|
||||
return Enums::MO_ANRMAX;
|
||||
case BRIG_ATOMIC_SUB:
|
||||
return Enums::MO_ANRSUB;
|
||||
default:
|
||||
fatal("Bad BrigAtomicOperation code %d\n", brigOp);
|
||||
}
|
||||
} else {
|
||||
fatal("Bad BrigAtomicOpcode %d\n", brigOpCode);
|
||||
}
|
||||
}
|
||||
|
||||
const char*
|
||||
atomicOpToString(BrigAtomicOperation brigOp)
|
||||
{
|
||||
switch (brigOp) {
|
||||
case BRIG_ATOMIC_AND:
|
||||
return "and";
|
||||
case BRIG_ATOMIC_OR:
|
||||
return "or";
|
||||
case BRIG_ATOMIC_XOR:
|
||||
return "xor";
|
||||
case BRIG_ATOMIC_CAS:
|
||||
return "cas";
|
||||
case BRIG_ATOMIC_EXCH:
|
||||
return "exch";
|
||||
case BRIG_ATOMIC_ADD:
|
||||
return "add";
|
||||
case BRIG_ATOMIC_WRAPINC:
|
||||
return "inc";
|
||||
case BRIG_ATOMIC_WRAPDEC:
|
||||
return "dec";
|
||||
case BRIG_ATOMIC_MIN:
|
||||
return "min";
|
||||
case BRIG_ATOMIC_MAX:
|
||||
return "max";
|
||||
case BRIG_ATOMIC_SUB:
|
||||
return "sub";
|
||||
default:
|
||||
return "unknown";
|
||||
}
|
||||
}
|
||||
} // namespace HsailISA
1629 src/arch/hsail/insts/mem.hh Normal file
File diff suppressed because it is too large

660 src/arch/hsail/insts/mem_impl.hh Normal file
@@ -0,0 +1,660 @@
/*
|
||||
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Steve Reinhardt
|
||||
*/
|
||||
|
||||
#include "arch/hsail/generic_types.hh"
|
||||
#include "gpu-compute/hsail_code.hh"
|
||||
|
||||
// defined in code.cc, but not worth sucking in all of code.h for this
|
||||
// at this point
|
||||
extern const char *segmentNames[];
|
||||
|
||||
namespace HsailISA
|
||||
{
|
||||
template<typename DestDataType, typename AddrRegOperandType>
|
||||
void
|
||||
LdaInst<DestDataType, AddrRegOperandType>::generateDisassembly()
|
||||
{
|
||||
this->disassembly = csprintf("%s_%s %s,%s", this->opcode,
|
||||
DestDataType::label,
|
||||
this->dest.disassemble(),
|
||||
this->addr.disassemble());
|
||||
}
|
||||
|
||||
template<typename DestDataType, typename AddrRegOperandType>
|
||||
void
|
||||
LdaInst<DestDataType, AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
Wavefront *w = gpuDynInst->wavefront();
|
||||
|
||||
typedef typename DestDataType::CType CType M5_VAR_USED;
|
||||
const VectorMask &mask = w->get_pred();
|
||||
uint64_t addr_vec[VSZ];
|
||||
this->addr.calcVector(w, addr_vec);
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
this->dest.set(w, lane, addr_vec[lane]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename MemDataType, typename DestDataType,
|
||||
typename AddrRegOperandType>
|
||||
void
|
||||
LdInst<MemDataType, DestDataType, AddrRegOperandType>::generateDisassembly()
|
||||
{
|
||||
switch (num_dest_operands) {
|
||||
case 1:
|
||||
this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode,
|
||||
segmentNames[this->segment],
|
||||
MemDataType::label,
|
||||
this->dest.disassemble(),
|
||||
this->addr.disassemble());
|
||||
break;
|
||||
case 2:
|
||||
this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode,
|
||||
segmentNames[this->segment],
|
||||
MemDataType::label,
|
||||
this->dest_vect[0].disassemble(),
|
||||
this->dest_vect[1].disassemble(),
|
||||
this->addr.disassemble());
|
||||
break;
|
||||
case 4:
|
||||
this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s",
|
||||
this->opcode,
|
||||
segmentNames[this->segment],
|
||||
MemDataType::label,
|
||||
this->dest_vect[0].disassemble(),
|
||||
this->dest_vect[1].disassemble(),
|
||||
this->dest_vect[2].disassemble(),
|
||||
this->dest_vect[3].disassemble(),
|
||||
this->addr.disassemble());
|
||||
break;
|
||||
default:
|
||||
fatal("Bad ld register dest operand, num vector operands: %d \n",
|
||||
num_dest_operands);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static Addr
|
||||
calcPrivAddr(Addr addr, Wavefront *w, int lane, GPUStaticInst *i)
|
||||
{
|
||||
// what is the size of the object we are accessing??
|
||||
// NOTE: the compiler doesn't generate enough information
|
||||
// to do this yet, so we have to just line up all the private
|
||||
// work-item spaces back to back for now
|
||||
/*
|
||||
StorageElement* se =
|
||||
i->parent->findSymbol(Brig::BrigPrivateSpace, addr);
|
||||
assert(se);
|
||||
|
||||
return w->wfSlotId * w->privSizePerItem * VSZ +
|
||||
se->offset * VSZ +
|
||||
lane * se->size;
|
||||
*/
|
||||
|
||||
// addressing strategy: interleave the private spaces of
|
||||
// work-items in a wave-front on 8 byte granularity.
|
||||
// this won't be perfect coalescing like the spill space
|
||||
// strategy, but it's better than nothing. The spill space
|
||||
// strategy won't work with private because the same address
|
||||
// may be accessed by different sized loads/stores.
|
||||
|
||||
// Note: I'm assuming that the largest load/store to private
|
||||
// is 8 bytes. If it is larger, the stride will have to increase
|
||||
|
||||
Addr addr_div8 = addr / 8;
|
||||
Addr addr_mod8 = addr % 8;
|
||||
|
||||
Addr ret = addr_div8 * 8 * VSZ + lane * 8 + addr_mod8 + w->privBase;
|
||||
|
||||
assert(ret < w->privBase + (w->privSizePerItem * VSZ));
|
||||
|
||||
return ret;
|
||||
}
|
||||
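// Editor's note: a worked example (not part of this commit) of the
// interleaving performed by calcPrivAddr() above. With illustrative values
// addr == 20, lane == 5 and a wavefront width of VSZ == 64:
//   addr_div8 = 2, addr_mod8 = 4
//   ret = 2 * 8 * 64 + 5 * 8 + 4 + privBase = 1068 + privBase
// so consecutive 8-byte chunks of one work-item's private space are spread
// VSZ * 8 bytes apart, while the same chunk across neighboring lanes lands
// in adjacent 8-byte slots.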
|
||||
template<typename MemDataType, typename DestDataType,
|
||||
typename AddrRegOperandType>
|
||||
void
|
||||
LdInst<MemDataType, DestDataType,
|
||||
AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
Wavefront *w = gpuDynInst->wavefront();
|
||||
|
||||
typedef typename MemDataType::CType MemCType;
|
||||
const VectorMask &mask = w->get_pred();
|
||||
|
||||
// Kernarg references are handled uniquely for now (no Memory Request
|
||||
// is used), so special-case them up front. Someday we should
|
||||
// make this more realistic, at which point we should get rid of this
|
||||
// block and fold this case into the switch below.
|
||||
if (this->segment == Brig::BRIG_SEGMENT_KERNARG) {
|
||||
MemCType val;
|
||||
|
||||
// I assume no vector ld for kernargs
|
||||
assert(num_dest_operands == 1);
|
||||
|
||||
// assuming for the moment that we'll never do register
|
||||
// offsets into kernarg space... just to make life simpler
|
||||
uint64_t address = this->addr.calcUniform();
|
||||
|
||||
val = *(MemCType*)&w->kernelArgs[address];
|
||||
|
||||
DPRINTF(HSAIL, "ld_kernarg [%d] -> %d\n", address, val);
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
this->dest.set(w, lane, val);
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
} else if (this->segment == Brig::BRIG_SEGMENT_ARG) {
|
||||
uint64_t address = this->addr.calcUniform();
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
MemCType val = w->readCallArgMem<MemCType>(lane, address);
|
||||
|
||||
DPRINTF(HSAIL, "ld_arg [%d] -> %llu\n", address,
|
||||
(unsigned long long)val);
|
||||
|
||||
this->dest.set(w, lane, val);
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
GPUDynInstPtr m = gpuDynInst;
|
||||
|
||||
this->addr.calcVector(w, m->addr);
|
||||
|
||||
m->m_op = Enums::MO_LD;
|
||||
m->m_type = MemDataType::memType;
|
||||
m->v_type = DestDataType::vgprType;
|
||||
|
||||
m->exec_mask = w->execMask();
|
||||
m->statusBitVector = 0;
|
||||
m->equiv = this->equivClass;
|
||||
m->memoryOrder = getGenericMemoryOrder(this->memoryOrder);
|
||||
|
||||
m->scope = getGenericMemoryScope(this->memoryScope);
|
||||
|
||||
if (num_dest_operands == 1) {
|
||||
m->dst_reg = this->dest.regIndex();
|
||||
m->n_reg = 1;
|
||||
} else {
|
||||
m->n_reg = num_dest_operands;
|
||||
for (int i = 0; i < num_dest_operands; ++i) {
|
||||
m->dst_reg_vec[i] = this->dest_vect[i].regIndex();
|
||||
}
|
||||
}
|
||||
|
||||
m->simdId = w->simdId;
|
||||
m->wfSlotId = w->wfSlotId;
|
||||
m->wfDynId = w->wfDynId;
|
||||
m->kern_id = w->kern_id;
|
||||
m->cu_id = w->computeUnit->cu_id;
|
||||
m->latency.init(&w->computeUnit->shader->tick_cnt);
|
||||
|
||||
switch (this->segment) {
|
||||
case Brig::BRIG_SEGMENT_GLOBAL:
|
||||
m->s_type = SEG_GLOBAL;
|
||||
m->pipeId = GLBMEM_PIPE;
|
||||
m->latency.set(w->computeUnit->shader->ticks(1));
|
||||
|
||||
// this is a complete hack to get around a compiler bug
|
||||
// (the compiler currently generates global access for private
|
||||
// addresses (starting from 0). We need to add the private offset)
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (m->addr[lane] < w->privSizePerItem) {
|
||||
if (mask[lane]) {
|
||||
// what is the size of the object we are accessing?
|
||||
// find the base for this wavefront
|
||||
|
||||
// calcPrivAddr will fail if accesses are unaligned
|
||||
assert(!((sizeof(MemCType) - 1) & m->addr[lane]));
|
||||
|
||||
Addr privAddr = calcPrivAddr(m->addr[lane], w, lane,
|
||||
this);
|
||||
|
||||
m->addr[lane] = privAddr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
|
||||
w->outstanding_reqs_rd_gm++;
|
||||
w->rd_gm_reqs_in_pipe--;
|
||||
break;
|
||||
|
||||
case Brig::BRIG_SEGMENT_SPILL:
|
||||
assert(num_dest_operands == 1);
|
||||
m->s_type = SEG_SPILL;
|
||||
m->pipeId = GLBMEM_PIPE;
|
||||
m->latency.set(w->computeUnit->shader->ticks(1));
|
||||
{
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
// note: this calculation will NOT WORK if the compiler
|
||||
// ever generates loads/stores to the same address with
|
||||
// different widths (e.g., a ld_u32 addr and a ld_u16 addr)
|
||||
if (mask[lane]) {
|
||||
assert(m->addr[lane] < w->spillSizePerItem);
|
||||
|
||||
m->addr[lane] = m->addr[lane] * w->spillWidth +
|
||||
lane * sizeof(MemCType) + w->spillBase;
|
||||
|
||||
w->last_addr[lane] = m->addr[lane];
|
||||
}
|
||||
}
|
||||
}
|
||||
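// Editor's note: a worked example (not part of this commit) of the spill
// address mapping above, with illustrative values spillWidth == 256,
// sizeof(MemCType) == 4, lane == 2 and an incoming addr of 8:
//   addr = 8 * 256 + 2 * 4 + spillBase = 2056 + spillBase
// i.e. each original spill offset gets its own spillWidth-byte stripe and
// the wavefront's lanes occupy consecutive 4-byte entries within it, so
// simultaneous per-lane accesses to the same offset stay contiguous.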
|
||||
w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
|
||||
w->outstanding_reqs_rd_gm++;
|
||||
w->rd_gm_reqs_in_pipe--;
|
||||
break;
|
||||
|
||||
case Brig::BRIG_SEGMENT_GROUP:
|
||||
m->s_type = SEG_SHARED;
|
||||
m->pipeId = LDSMEM_PIPE;
|
||||
m->latency.set(w->computeUnit->shader->ticks(24));
|
||||
w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
|
||||
w->outstanding_reqs_rd_lm++;
|
||||
w->rd_lm_reqs_in_pipe--;
|
||||
break;
|
||||
|
||||
case Brig::BRIG_SEGMENT_READONLY:
|
||||
m->s_type = SEG_READONLY;
|
||||
m->pipeId = GLBMEM_PIPE;
|
||||
m->latency.set(w->computeUnit->shader->ticks(1));
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
assert(m->addr[lane] + sizeof(MemCType) <= w->roSize);
|
||||
m->addr[lane] += w->roBase;
|
||||
}
|
||||
}
|
||||
|
||||
w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
|
||||
w->outstanding_reqs_rd_gm++;
|
||||
w->rd_gm_reqs_in_pipe--;
|
||||
break;
|
||||
|
||||
case Brig::BRIG_SEGMENT_PRIVATE:
|
||||
m->s_type = SEG_PRIVATE;
|
||||
m->pipeId = GLBMEM_PIPE;
|
||||
m->latency.set(w->computeUnit->shader->ticks(1));
|
||||
{
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
assert(m->addr[lane] < w->privSizePerItem);
|
||||
|
||||
m->addr[lane] = m->addr[lane] +
|
||||
lane * sizeof(MemCType) + w->privBase;
|
||||
}
|
||||
}
|
||||
}
|
||||
w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
|
||||
w->outstanding_reqs_rd_gm++;
|
||||
w->rd_gm_reqs_in_pipe--;
|
||||
break;
|
||||
|
||||
default:
|
||||
fatal("Load to unsupported segment %d %llxe\n", this->segment,
|
||||
m->addr[0]);
|
||||
}
|
||||
|
||||
w->outstanding_reqs++;
|
||||
w->mem_reqs_in_pipe--;
|
||||
}
|
||||
|
||||
template<typename OperationType, typename SrcDataType,
|
||||
typename AddrRegOperandType>
|
||||
void
|
||||
StInst<OperationType, SrcDataType,
|
||||
AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
Wavefront *w = gpuDynInst->wavefront();
|
||||
|
||||
typedef typename OperationType::CType CType;
|
||||
|
||||
const VectorMask &mask = w->get_pred();
|
||||
|
||||
// arg references are handled uniquely for now (no Memory Request
|
||||
// is used), so special-case them up front. Someday we should
|
||||
// make this more realistic, at which point we should get rid of this
|
||||
// block and fold this case into the switch below.
|
||||
if (this->segment == Brig::BRIG_SEGMENT_ARG) {
|
||||
uint64_t address = this->addr.calcUniform();
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
CType data = this->src.template get<CType>(w, lane);
|
||||
DPRINTF(HSAIL, "st_arg [%d] <- %d\n", address, data);
|
||||
w->writeCallArgMem<CType>(lane, address, data);
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
GPUDynInstPtr m = gpuDynInst;
|
||||
|
||||
m->exec_mask = w->execMask();
|
||||
|
||||
this->addr.calcVector(w, m->addr);
|
||||
|
||||
if (num_src_operands == 1) {
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
((CType*)m->d_data)[lane] =
|
||||
this->src.template get<CType>(w, lane);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int k= 0; k < num_src_operands; ++k) {
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
((CType*)m->d_data)[k * VSZ + lane] =
|
||||
this->src_vect[k].template get<CType>(w, lane);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
m->m_op = Enums::MO_ST;
|
||||
m->m_type = OperationType::memType;
|
||||
m->v_type = OperationType::vgprType;
|
||||
|
||||
m->statusBitVector = 0;
|
||||
m->equiv = this->equivClass;
|
||||
|
||||
if (num_src_operands == 1) {
|
||||
m->n_reg = 1;
|
||||
} else {
|
||||
m->n_reg = num_src_operands;
|
||||
}
|
||||
|
||||
m->memoryOrder = getGenericMemoryOrder(this->memoryOrder);
|
||||
|
||||
m->scope = getGenericMemoryScope(this->memoryScope);
|
||||
|
||||
m->simdId = w->simdId;
|
||||
m->wfSlotId = w->wfSlotId;
|
||||
m->wfDynId = w->wfDynId;
|
||||
m->kern_id = w->kern_id;
|
||||
m->cu_id = w->computeUnit->cu_id;
|
||||
m->latency.init(&w->computeUnit->shader->tick_cnt);
|
||||
|
||||
switch (this->segment) {
|
||||
case Brig::BRIG_SEGMENT_GLOBAL:
|
||||
m->s_type = SEG_GLOBAL;
|
||||
m->pipeId = GLBMEM_PIPE;
|
||||
m->latency.set(w->computeUnit->shader->ticks(1));
|
||||
|
||||
// this is a complete hack to get around a compiler bug
|
||||
// (the compiler currently generates global access for private
|
||||
// addresses (starting from 0). We need to add the private offset)
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
if (m->addr[lane] < w->privSizePerItem) {
|
||||
|
||||
// calcPrivAddr will fail if accesses are unaligned
|
||||
assert(!((sizeof(CType)-1) & m->addr[lane]));
|
||||
|
||||
Addr privAddr = calcPrivAddr(m->addr[lane], w, lane,
|
||||
this);
|
||||
|
||||
m->addr[lane] = privAddr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
|
||||
w->outstanding_reqs_wr_gm++;
|
||||
w->wr_gm_reqs_in_pipe--;
|
||||
break;
|
||||
|
||||
case Brig::BRIG_SEGMENT_SPILL:
|
||||
assert(num_src_operands == 1);
|
||||
m->s_type = SEG_SPILL;
|
||||
m->pipeId = GLBMEM_PIPE;
|
||||
m->latency.set(w->computeUnit->shader->ticks(1));
|
||||
{
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
assert(m->addr[lane] < w->spillSizePerItem);
|
||||
|
||||
m->addr[lane] = m->addr[lane] * w->spillWidth +
|
||||
lane * sizeof(CType) + w->spillBase;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
|
||||
w->outstanding_reqs_wr_gm++;
|
||||
w->wr_gm_reqs_in_pipe--;
|
||||
break;
|
||||
|
||||
case Brig::BRIG_SEGMENT_GROUP:
|
||||
m->s_type = SEG_SHARED;
|
||||
m->pipeId = LDSMEM_PIPE;
|
||||
m->latency.set(w->computeUnit->shader->ticks(24));
|
||||
w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
|
||||
w->outstanding_reqs_wr_lm++;
|
||||
w->wr_lm_reqs_in_pipe--;
|
||||
break;
|
||||
|
||||
case Brig::BRIG_SEGMENT_PRIVATE:
|
||||
m->s_type = SEG_PRIVATE;
|
||||
m->pipeId = GLBMEM_PIPE;
|
||||
m->latency.set(w->computeUnit->shader->ticks(1));
|
||||
{
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
assert(m->addr[lane] < w->privSizePerItem);
|
||||
m->addr[lane] = m->addr[lane] + lane *
|
||||
sizeof(CType)+w->privBase;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
|
||||
w->outstanding_reqs_wr_gm++;
|
||||
w->wr_gm_reqs_in_pipe--;
|
||||
break;
|
||||
|
||||
default:
|
||||
fatal("Store to unsupported segment %d\n", this->segment);
|
||||
}
|
||||
|
||||
w->outstanding_reqs++;
|
||||
w->mem_reqs_in_pipe--;
|
||||
}
|
||||
|
||||
template<typename OperationType, typename SrcDataType,
|
||||
typename AddrRegOperandType>
|
||||
void
|
||||
StInst<OperationType, SrcDataType,
|
||||
AddrRegOperandType>::generateDisassembly()
|
||||
{
|
||||
switch (num_src_operands) {
|
||||
case 1:
|
||||
this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode,
|
||||
segmentNames[this->segment],
|
||||
OperationType::label,
|
||||
this->src.disassemble(),
|
||||
this->addr.disassemble());
|
||||
break;
|
||||
case 2:
|
||||
this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode,
|
||||
segmentNames[this->segment],
|
||||
OperationType::label,
|
||||
this->src_vect[0].disassemble(),
|
||||
this->src_vect[1].disassemble(),
|
||||
this->addr.disassemble());
|
||||
break;
|
||||
case 4:
|
||||
this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s",
|
||||
this->opcode,
|
||||
segmentNames[this->segment],
|
||||
OperationType::label,
|
||||
this->src_vect[0].disassemble(),
|
||||
this->src_vect[1].disassemble(),
|
||||
this->src_vect[2].disassemble(),
|
||||
this->src_vect[3].disassemble(),
|
||||
this->addr.disassemble());
|
||||
break;
|
||||
default: fatal("Bad ld register src operand, num vector operands: "
|
||||
"%d \n", num_src_operands);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename DataType, typename AddrRegOperandType, int NumSrcOperands,
|
||||
bool HasDst>
|
||||
void
|
||||
AtomicInst<DataType, AddrRegOperandType, NumSrcOperands,
|
||||
HasDst>::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
typedef typename DataType::CType CType;
|
||||
|
||||
Wavefront *w = gpuDynInst->wavefront();
|
||||
|
||||
GPUDynInstPtr m = gpuDynInst;
|
||||
|
||||
this->addr.calcVector(w, m->addr);
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
((CType *)m->a_data)[lane] =
|
||||
this->src[0].template get<CType>(w, lane);
|
||||
}
|
||||
|
||||
// load second source operand for CAS
|
||||
if (NumSrcOperands > 1) {
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
((CType*)m->x_data)[lane] =
|
||||
this->src[1].template get<CType>(w, lane);
|
||||
}
|
||||
}
|
||||
|
||||
assert(NumSrcOperands <= 2);
|
||||
|
||||
m->m_op = this->opType;
|
||||
m->m_type = DataType::memType;
|
||||
m->v_type = DataType::vgprType;
|
||||
|
||||
m->exec_mask = w->execMask();
|
||||
m->statusBitVector = 0;
|
||||
m->equiv = 0; // atomics don't have an equivalence class operand
|
||||
m->n_reg = 1;
|
||||
m->memoryOrder = getGenericMemoryOrder(this->memoryOrder);
|
||||
|
||||
m->scope = getGenericMemoryScope(this->memoryScope);
|
||||
|
||||
if (HasDst) {
|
||||
m->dst_reg = this->dest.regIndex();
|
||||
}
|
||||
|
||||
m->simdId = w->simdId;
|
||||
m->wfSlotId = w->wfSlotId;
|
||||
m->wfDynId = w->wfDynId;
|
||||
m->kern_id = w->kern_id;
|
||||
m->cu_id = w->computeUnit->cu_id;
|
||||
m->latency.init(&w->computeUnit->shader->tick_cnt);
|
||||
|
||||
switch (this->segment) {
|
||||
case Brig::BRIG_SEGMENT_GLOBAL:
|
||||
m->s_type = SEG_GLOBAL;
|
||||
m->latency.set(w->computeUnit->shader->ticks(64));
|
||||
m->pipeId = GLBMEM_PIPE;
|
||||
|
||||
w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
|
||||
w->outstanding_reqs_wr_gm++;
|
||||
w->wr_gm_reqs_in_pipe--;
|
||||
w->outstanding_reqs_rd_gm++;
|
||||
w->rd_gm_reqs_in_pipe--;
|
||||
break;
|
||||
|
||||
case Brig::BRIG_SEGMENT_GROUP:
|
||||
m->s_type = SEG_SHARED;
|
||||
m->pipeId = LDSMEM_PIPE;
|
||||
m->latency.set(w->computeUnit->shader->ticks(24));
|
||||
w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
|
||||
w->outstanding_reqs_wr_lm++;
|
||||
w->wr_lm_reqs_in_pipe--;
|
||||
w->outstanding_reqs_rd_lm++;
|
||||
w->rd_lm_reqs_in_pipe--;
|
||||
break;
|
||||
|
||||
default:
|
||||
fatal("Atomic op to unsupported segment %d\n",
|
||||
this->segment);
|
||||
}
|
||||
|
||||
w->outstanding_reqs++;
|
||||
w->mem_reqs_in_pipe--;
|
||||
}
|
||||
|
||||
const char* atomicOpToString(Brig::BrigAtomicOperation atomicOp);
|
||||
|
||||
template<typename DataType, typename AddrRegOperandType, int NumSrcOperands,
|
||||
bool HasDst>
|
||||
void
|
||||
AtomicInst<DataType, AddrRegOperandType, NumSrcOperands,
|
||||
HasDst>::generateDisassembly()
|
||||
{
|
||||
if (HasDst) {
|
||||
this->disassembly =
|
||||
csprintf("%s_%s_%s_%s %s,%s", this->opcode,
|
||||
atomicOpToString(this->atomicOperation),
|
||||
segmentNames[this->segment],
|
||||
DataType::label, this->dest.disassemble(),
|
||||
this->addr.disassemble());
|
||||
} else {
|
||||
this->disassembly =
|
||||
csprintf("%s_%s_%s_%s %s", this->opcode,
|
||||
atomicOpToString(this->atomicOperation),
|
||||
segmentNames[this->segment],
|
||||
DataType::label, this->addr.disassemble());
|
||||
}
|
||||
|
||||
for (int i = 0; i < NumSrcOperands; ++i) {
|
||||
this->disassembly += ",";
|
||||
this->disassembly += this->src[i].disassemble();
|
||||
}
|
||||
}
|
||||
} // namespace HsailISA
787 src/arch/hsail/insts/pseudo_inst.cc Normal file
@@ -0,0 +1,787 @@
/*
|
||||
* Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Marc Orr
|
||||
*/
|
||||
|
||||
#include <csignal>
|
||||
|
||||
#include "arch/hsail/insts/decl.hh"
|
||||
#include "arch/hsail/insts/mem.hh"
|
||||
|
||||
namespace HsailISA
|
||||
{
|
||||
// Pseudo (or magic) instructions are overloaded on the hsail call
|
||||
// instruction, because of its flexible parameter signature.
|
||||
|
||||
// To add a new magic instruction:
|
||||
// 1. Add an entry to the enum.
|
||||
// 2. Implement it in the switch statement below (Call::exec).
|
||||
// 3. Add a utility function to hsa/hsail-gpu-compute/util/magicinst.h,
|
||||
// so it's easy to call from an OpenCL kernel (see the sketch after the enum below).
|
||||
|
||||
// This enum should be identical to the enum in
|
||||
// hsa/hsail-gpu-compute/util/magicinst.h
|
||||
enum
|
||||
{
|
||||
MAGIC_PRINT_WF_32 = 0,
|
||||
MAGIC_PRINT_WF_64,
|
||||
MAGIC_PRINT_LANE,
|
||||
MAGIC_PRINT_LANE_64,
|
||||
MAGIC_PRINT_WF_FLOAT,
|
||||
MAGIC_SIM_BREAK,
|
||||
MAGIC_PREF_SUM,
|
||||
MAGIC_REDUCTION,
|
||||
MAGIC_MASKLANE_LOWER,
|
||||
MAGIC_MASKLANE_UPPER,
|
||||
MAGIC_JOIN_WF_BAR,
|
||||
MAGIC_WAIT_WF_BAR,
|
||||
MAGIC_PANIC,
|
||||
MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG,
|
||||
MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG,
|
||||
MAGIC_LOAD_GLOBAL_U32_REG,
|
||||
MAGIC_XACT_CAS_LD,
|
||||
MAGIC_MOST_SIG_THD,
|
||||
MAGIC_MOST_SIG_BROADCAST,
|
||||
MAGIC_PRINT_WFID_32,
|
||||
MAGIC_PRINT_WFID_64
|
||||
};
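The enum entry, switch case, and kernel-side wrapper described in the three-step recipe above can be sketched in a self-contained way as follows. This is purely illustrative: none of these names exist in the commit, and the real kernel-side utility lives in hsa/hsail-gpu-compute/util/magicinst.h rather than calling the dispatcher directly.

#include <cstdio>

// Step 1: the opcode enum, kept identical on the simulator and kernel sides
// (both names here are hypothetical stand-ins).
enum { MAGIC_PRINT_WF_32_SKETCH = 0, MAGIC_MY_NEW_INST_SKETCH };

// Step 2: the simulator-side dispatcher switches on the opcode passed as
// arg 0 of the overloaded call instruction (stand-in for Call::execPseudoInst).
void execPseudoInstSketch(int op)
{
    switch (op) {
    case MAGIC_MY_NEW_INST_SKETCH:
        std::puts("MagicMyNewInst would run here");
        break;
    default:
        std::puts("unrecognized magic instruction");
    }
}

// Step 3: a kernel-side wrapper hides the raw opcode so OpenCL code can just
// call my_new_inst(); here it calls the dispatcher directly for illustration.
static inline void my_new_inst() { execPseudoInstSketch(MAGIC_MY_NEW_INST_SKETCH); }

int main() { my_new_inst(); }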
|
||||
|
||||
void
|
||||
Call::execPseudoInst(Wavefront *w, GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
const VectorMask &mask = w->get_pred();
|
||||
|
||||
int op = 0;
|
||||
bool got_op = false;
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
int src_val0 = src1.get<int>(w, lane, 0);
|
||||
if (got_op) {
|
||||
if (src_val0 != op) {
|
||||
fatal("Multiple magic instructions per PC not "
|
||||
"supported\n");
|
||||
}
|
||||
} else {
|
||||
op = src_val0;
|
||||
got_op = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
switch(op) {
|
||||
case MAGIC_PRINT_WF_32:
|
||||
MagicPrintWF32(w);
|
||||
break;
|
||||
case MAGIC_PRINT_WF_64:
|
||||
MagicPrintWF64(w);
|
||||
break;
|
||||
case MAGIC_PRINT_LANE:
|
||||
MagicPrintLane(w);
|
||||
break;
|
||||
case MAGIC_PRINT_LANE_64:
|
||||
MagicPrintLane64(w);
|
||||
break;
|
||||
case MAGIC_PRINT_WF_FLOAT:
|
||||
MagicPrintWFFloat(w);
|
||||
break;
|
||||
case MAGIC_SIM_BREAK:
|
||||
MagicSimBreak(w);
|
||||
break;
|
||||
case MAGIC_PREF_SUM:
|
||||
MagicPrefixSum(w);
|
||||
break;
|
||||
case MAGIC_REDUCTION:
|
||||
MagicReduction(w);
|
||||
break;
|
||||
case MAGIC_MASKLANE_LOWER:
|
||||
MagicMaskLower(w);
|
||||
break;
|
||||
case MAGIC_MASKLANE_UPPER:
|
||||
MagicMaskUpper(w);
|
||||
break;
|
||||
case MAGIC_JOIN_WF_BAR:
|
||||
MagicJoinWFBar(w);
|
||||
break;
|
||||
case MAGIC_WAIT_WF_BAR:
|
||||
MagicWaitWFBar(w);
|
||||
break;
|
||||
case MAGIC_PANIC:
|
||||
MagicPanic(w);
|
||||
break;
|
||||
|
||||
// atomic instructions
|
||||
case MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG:
|
||||
MagicAtomicNRAddGlobalU32Reg(w, gpuDynInst);
|
||||
break;
|
||||
|
||||
case MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG:
|
||||
MagicAtomicNRAddGroupU32Reg(w, gpuDynInst);
|
||||
break;
|
||||
|
||||
case MAGIC_LOAD_GLOBAL_U32_REG:
|
||||
MagicLoadGlobalU32Reg(w, gpuDynInst);
|
||||
break;
|
||||
|
||||
case MAGIC_XACT_CAS_LD:
|
||||
MagicXactCasLd(w);
|
||||
break;
|
||||
|
||||
case MAGIC_MOST_SIG_THD:
|
||||
MagicMostSigThread(w);
|
||||
break;
|
||||
|
||||
case MAGIC_MOST_SIG_BROADCAST:
|
||||
MagicMostSigBroadcast(w);
|
||||
break;
|
||||
|
||||
case MAGIC_PRINT_WFID_32:
|
||||
MagicPrintWF32ID(w);
|
||||
break;
|
||||
|
||||
case MAGIC_PRINT_WFID_64:
|
||||
MagicPrintWFID64(w);
|
||||
break;
|
||||
|
||||
default: fatal("unrecognized magic instruction: %d\n", op);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
Call::MagicPrintLane(Wavefront *w)
|
||||
{
|
||||
#if TRACING_ON
|
||||
const VectorMask &mask = w->get_pred();
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
int src_val1 = src1.get<int>(w, lane, 1);
|
||||
int src_val2 = src1.get<int>(w, lane, 2);
|
||||
if (src_val2) {
|
||||
DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n",
|
||||
disassemble(), w->computeUnit->cu_id, w->simdId,
|
||||
w->wfSlotId, lane, src_val1);
|
||||
} else {
|
||||
DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n",
|
||||
disassemble(), w->computeUnit->cu_id, w->simdId,
|
||||
w->wfSlotId, lane, src_val1);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
Call::MagicPrintLane64(Wavefront *w)
|
||||
{
|
||||
#if TRACING_ON
|
||||
const VectorMask &mask = w->get_pred();
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
|
||||
int src_val2 = src1.get<int>(w, lane, 2);
|
||||
if (src_val2) {
|
||||
DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n",
|
||||
disassemble(), w->computeUnit->cu_id, w->simdId,
|
||||
w->wfSlotId, lane, src_val1);
|
||||
} else {
|
||||
DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n",
|
||||
disassemble(), w->computeUnit->cu_id, w->simdId,
|
||||
w->wfSlotId, lane, src_val1);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
Call::MagicPrintWF32(Wavefront *w)
|
||||
{
|
||||
#if TRACING_ON
|
||||
const VectorMask &mask = w->get_pred();
|
||||
std::string res_str;
|
||||
res_str = csprintf("krl_prt (%s)\n", disassemble());
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (!(lane & 7)) {
|
||||
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
|
||||
}
|
||||
|
||||
if (mask[lane]) {
|
||||
int src_val1 = src1.get<int>(w, lane, 1);
|
||||
int src_val2 = src1.get<int>(w, lane, 2);
|
||||
|
||||
if (src_val2) {
|
||||
res_str += csprintf("%08x", src_val1);
|
||||
} else {
|
||||
res_str += csprintf("%08d", src_val1);
|
||||
}
|
||||
} else {
|
||||
res_str += csprintf("xxxxxxxx");
|
||||
}
|
||||
|
||||
if ((lane & 7) == 7) {
|
||||
res_str += csprintf("\n");
|
||||
} else {
|
||||
res_str += csprintf(" ");
|
||||
}
|
||||
}
|
||||
|
||||
res_str += "\n\n";
|
||||
DPRINTFN(res_str.c_str());
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
Call::MagicPrintWF32ID(Wavefront *w)
|
||||
{
|
||||
#if TRACING_ON
|
||||
const VectorMask &mask = w->get_pred();
|
||||
std::string res_str;
|
||||
int src_val3 = -1;
|
||||
res_str = csprintf("krl_prt (%s)\n", disassemble());
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (!(lane & 7)) {
|
||||
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
|
||||
}
|
||||
|
||||
if (mask[lane]) {
|
||||
int src_val1 = src1.get<int>(w, lane, 1);
|
||||
int src_val2 = src1.get<int>(w, lane, 2);
|
||||
src_val3 = src1.get<int>(w, lane, 3);
|
||||
|
||||
if (src_val2) {
|
||||
res_str += csprintf("%08x", src_val1);
|
||||
} else {
|
||||
res_str += csprintf("%08d", src_val1);
|
||||
}
|
||||
} else {
|
||||
res_str += csprintf("xxxxxxxx");
|
||||
}
|
||||
|
||||
if ((lane & 7) == 7) {
|
||||
res_str += csprintf("\n");
|
||||
} else {
|
||||
res_str += csprintf(" ");
|
||||
}
|
||||
}
|
||||
|
||||
res_str += "\n\n";
|
||||
if (w->wfDynId == src_val3) {
|
||||
DPRINTFN(res_str.c_str());
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
Call::MagicPrintWF64(Wavefront *w)
|
||||
{
|
||||
#if TRACING_ON
|
||||
const VectorMask &mask = w->get_pred();
|
||||
std::string res_str;
|
||||
res_str = csprintf("krl_prt (%s)\n", disassemble());
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (!(lane & 3)) {
|
||||
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
|
||||
}
|
||||
|
||||
if (mask[lane]) {
|
||||
int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
|
||||
int src_val2 = src1.get<int>(w, lane, 2);
|
||||
|
||||
if (src_val2) {
|
||||
res_str += csprintf("%016x", src_val1);
|
||||
} else {
|
||||
res_str += csprintf("%016d", src_val1);
|
||||
}
|
||||
} else {
|
||||
res_str += csprintf("xxxxxxxxxxxxxxxx");
|
||||
}
|
||||
|
||||
if ((lane & 3) == 3) {
|
||||
res_str += csprintf("\n");
|
||||
} else {
|
||||
res_str += csprintf(" ");
|
||||
}
|
||||
}
|
||||
|
||||
res_str += "\n\n";
|
||||
DPRINTFN(res_str.c_str());
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
Call::MagicPrintWFID64(Wavefront *w)
|
||||
{
|
||||
#if TRACING_ON
|
||||
const VectorMask &mask = w->get_pred();
|
||||
std::string res_str;
|
||||
int src_val3 = -1;
|
||||
res_str = csprintf("krl_prt (%s)\n", disassemble());
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (!(lane & 3)) {
|
||||
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
|
||||
}
|
||||
|
||||
if (mask[lane]) {
|
||||
int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
|
||||
int src_val2 = src1.get<int>(w, lane, 2);
|
||||
src_val3 = src1.get<int>(w, lane, 3);
|
||||
|
||||
if (src_val2) {
|
||||
res_str += csprintf("%016x", src_val1);
|
||||
} else {
|
||||
res_str += csprintf("%016d", src_val1);
|
||||
}
|
||||
} else {
|
||||
res_str += csprintf("xxxxxxxxxxxxxxxx");
|
||||
}
|
||||
|
||||
if ((lane & 3) == 3) {
|
||||
res_str += csprintf("\n");
|
||||
} else {
|
||||
res_str += csprintf(" ");
|
||||
}
|
||||
}
|
||||
|
||||
res_str += "\n\n";
|
||||
if (w->wfDynId == src_val3) {
|
||||
DPRINTFN(res_str.c_str());
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
Call::MagicPrintWFFloat(Wavefront *w)
|
||||
{
|
||||
#if TRACING_ON
|
||||
const VectorMask &mask = w->get_pred();
|
||||
std::string res_str;
|
||||
res_str = csprintf("krl_prt (%s)\n", disassemble());
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (!(lane & 7)) {
|
||||
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
|
||||
}
|
||||
|
||||
if (mask[lane]) {
|
||||
float src_val1 = src1.get<float>(w, lane, 1);
|
||||
res_str += csprintf("%08f", src_val1);
|
||||
} else {
|
||||
res_str += csprintf("xxxxxxxx");
|
||||
}
|
||||
|
||||
if ((lane & 7) == 7) {
|
||||
res_str += csprintf("\n");
|
||||
} else {
|
||||
res_str += csprintf(" ");
|
||||
}
|
||||
}
|
||||
|
||||
res_str += "\n\n";
|
||||
DPRINTFN(res_str.c_str());
|
||||
#endif
|
||||
}
|
||||
|
||||
// raises a signal that GDB will catch
|
||||
// when done with the break, type "signal 0" in gdb to continue
|
||||
void
|
||||
Call::MagicSimBreak(Wavefront *w)
|
||||
{
|
||||
std::string res_str;
|
||||
// print out state for this wavefront and then break
|
||||
res_str = csprintf("Breakpoint encountered for wavefront %i\n",
|
||||
w->wfSlotId);
|
||||
|
||||
res_str += csprintf(" Kern ID: %i\n", w->kern_id);
|
||||
res_str += csprintf(" Phase ID: %i\n", w->simdId);
|
||||
res_str += csprintf(" Executing on CU #%i\n", w->computeUnit->cu_id);
|
||||
res_str += csprintf(" Exec mask: ");
|
||||
|
||||
for (int i = VSZ - 1; i >= 0; --i) {
|
||||
if (w->execMask(i))
|
||||
res_str += "1";
|
||||
else
|
||||
res_str += "0";
|
||||
|
||||
if ((i & 7) == 7)
|
||||
res_str += " ";
|
||||
}
|
||||
|
||||
res_str += csprintf("(0x%016llx)\n", w->execMask().to_ullong());
|
||||
|
||||
res_str += "\nHelpful debugging hints:\n";
|
||||
res_str += " Check out w->s_reg / w->d_reg for register state\n";
|
||||
|
||||
res_str += "\n\n";
|
||||
DPRINTFN(res_str.c_str());
|
||||
fflush(stdout);
|
||||
|
||||
raise(SIGTRAP);
|
||||
}
|
||||
|
||||
void
|
||||
Call::MagicPrefixSum(Wavefront *w)
|
||||
{
|
||||
const VectorMask &mask = w->get_pred();
|
||||
int res = 0;
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
int src_val1 = src1.get<int>(w, lane, 1);
|
||||
dest.set<int>(w, lane, res);
|
||||
res += src_val1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
Call::MagicReduction(Wavefront *w)
|
||||
{
|
||||
// reduction magic instruction
|
||||
// The reduction instruction takes up to 64 inputs (one from
|
||||
// each thread in a WF) and sums them. It returns the sum to
|
||||
// each thread in the WF.
|
||||
const VectorMask &mask = w->get_pred();
|
||||
int res = 0;
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
int src_val1 = src1.get<int>(w, lane, 1);
|
||||
res += src_val1;
|
||||
}
|
||||
}
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
dest.set<int>(w, lane, res);
|
||||
}
|
||||
}
|
||||
}
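For concreteness, here is a tiny host-side sketch of the MagicPrefixSum and MagicReduction semantics described above. It is purely illustrative: it uses 4 lanes instead of VSZ, assumes all lanes are active, and uses none of the model's types.

#include <cassert>

int main()
{
    const int in[4] = {1, 2, 3, 4};   // per-lane inputs
    int prefix[4];
    int sum = 0;

    for (int lane = 0; lane < 4; ++lane) {
        prefix[lane] = sum;           // exclusive prefix sum, as in MagicPrefixSum
        sum += in[lane];
    }

    // MagicReduction hands the total back to every active lane, while
    // MagicPrefixSum leaves the running sums {0, 1, 3, 6} across the lanes.
    assert(sum == 10 && prefix[3] == 6);
}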
|
||||
|
||||
void
|
||||
Call::MagicMaskLower(Wavefront *w)
|
||||
{
|
||||
const VectorMask &mask = w->get_pred();
|
||||
int res = 0;
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
int src_val1 = src1.get<int>(w, lane, 1);
|
||||
|
||||
if (src_val1) {
|
||||
if (lane < (VSZ/2)) {
|
||||
res = res | ((uint32_t)(1) << lane);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
dest.set<int>(w, lane, res);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
Call::MagicMaskUpper(Wavefront *w)
|
||||
{
|
||||
const VectorMask &mask = w->get_pred();
|
||||
int res = 0;
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
int src_val1 = src1.get<int>(w, lane, 1);
|
||||
|
||||
if (src_val1) {
|
||||
if (lane >= (VSZ/2)) {
|
||||
res = res | ((uint32_t)(1) << (lane - (VSZ/2)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
dest.set<int>(w, lane, res);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
Call::MagicJoinWFBar(Wavefront *w)
|
||||
{
|
||||
const VectorMask &mask = w->get_pred();
|
||||
int max_cnt = 0;
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
w->bar_cnt[lane]++;
|
||||
|
||||
if (w->bar_cnt[lane] > max_cnt) {
|
||||
max_cnt = w->bar_cnt[lane];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (max_cnt > w->max_bar_cnt) {
|
||||
w->max_bar_cnt = max_cnt;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
Call::MagicWaitWFBar(Wavefront *w)
|
||||
{
|
||||
const VectorMask &mask = w->get_pred();
|
||||
int max_cnt = 0;
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
w->bar_cnt[lane]--;
|
||||
}
|
||||
|
||||
if (w->bar_cnt[lane] > max_cnt) {
|
||||
max_cnt = w->bar_cnt[lane];
|
||||
}
|
||||
}
|
||||
|
||||
if (max_cnt < w->max_bar_cnt) {
|
||||
w->max_bar_cnt = max_cnt;
|
||||
}
|
||||
|
||||
w->instructionBuffer.erase(w->instructionBuffer.begin() + 1,
|
||||
w->instructionBuffer.end());
|
||||
if (w->pendingFetch)
|
||||
w->dropFetch = true;
|
||||
}
|
||||
|
||||
void
|
||||
Call::MagicPanic(Wavefront *w)
|
||||
{
|
||||
const VectorMask &mask = w->get_pred();
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
int src_val1 = src1.get<int>(w, lane, 1);
|
||||
panic("OpenCL Code failed assertion #%d. Triggered by lane %s",
|
||||
src_val1, lane);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
Call::calcAddr(Wavefront *w, GPUDynInstPtr m)
|
||||
{
|
||||
// the 64-bit address is split across two call args: upper 32 bits in arg 1, lower 32 bits in arg 2
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
int src_val1 = src1.get<int>(w, lane, 1);
|
||||
int src_val2 = src1.get<int>(w, lane, 2);
|
||||
Addr addr = (((Addr) src_val1) << 32) | ((Addr) src_val2);
|
||||
|
||||
m->addr[lane] = addr;
|
||||
}
|
||||
|
||||
}
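A quick self-contained check of the reassembly calcAddr performs: a 64-bit pointer split into two 32-bit call arguments round-trips through the same shift/OR. The constant is arbitrary and nothing below comes from the model.

#include <cassert>
#include <cstdint>

int main()
{
    const uint64_t addr = 0x0000123456789abcULL;  // arbitrary example pointer
    const uint32_t hi = uint32_t(addr >> 32);     // would be passed as call arg 1
    const uint32_t lo = uint32_t(addr);           // would be passed as call arg 2

    const uint64_t rebuilt = (uint64_t(hi) << 32) | uint64_t(lo);
    assert(rebuilt == addr);
}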
|
||||
|
||||
void
|
||||
Call::MagicAtomicNRAddGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
GPUDynInstPtr m = gpuDynInst;
|
||||
|
||||
calcAddr(w, m);
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
((int*)m->a_data)[lane] = src1.get<int>(w, lane, 3);
|
||||
}
|
||||
|
||||
m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET,
|
||||
Brig::BRIG_ATOMIC_ADD);
|
||||
m->m_type = U32::memType;
|
||||
m->v_type = U32::vgprType;
|
||||
|
||||
m->exec_mask = w->execMask();
|
||||
m->statusBitVector = 0;
|
||||
m->equiv = 0; // atomics don't have an equivalence class operand
|
||||
m->n_reg = 1;
|
||||
m->memoryOrder = Enums::MEMORY_ORDER_NONE;
|
||||
m->scope = Enums::MEMORY_SCOPE_NONE;
|
||||
|
||||
m->simdId = w->simdId;
|
||||
m->wfSlotId = w->wfSlotId;
|
||||
m->wfDynId = w->wfDynId;
|
||||
m->latency.init(&w->computeUnit->shader->tick_cnt);
|
||||
|
||||
m->s_type = SEG_GLOBAL;
|
||||
m->pipeId = GLBMEM_PIPE;
|
||||
m->latency.set(w->computeUnit->shader->ticks(64));
|
||||
w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
|
||||
w->outstanding_reqs_wr_gm++;
|
||||
w->wr_gm_reqs_in_pipe--;
|
||||
w->outstanding_reqs_rd_gm++;
|
||||
w->rd_gm_reqs_in_pipe--;
|
||||
w->outstanding_reqs++;
|
||||
w->mem_reqs_in_pipe--;
|
||||
}
|
||||
|
||||
void
|
||||
Call::MagicAtomicNRAddGroupU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
GPUDynInstPtr m = gpuDynInst;
|
||||
calcAddr(w, m);
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
((int*)m->a_data)[lane] = src1.get<int>(w, lane, 1);
|
||||
}
|
||||
|
||||
m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET,
|
||||
Brig::BRIG_ATOMIC_ADD);
|
||||
m->m_type = U32::memType;
|
||||
m->v_type = U32::vgprType;
|
||||
|
||||
m->exec_mask = w->execMask();
|
||||
m->statusBitVector = 0;
|
||||
m->equiv = 0; // atomics don't have an equivalence class operand
|
||||
m->n_reg = 1;
|
||||
m->memoryOrder = Enums::MEMORY_ORDER_NONE;
|
||||
m->scope = Enums::MEMORY_SCOPE_NONE;
|
||||
|
||||
m->simdId = w->simdId;
|
||||
m->wfSlotId = w->wfSlotId;
|
||||
m->wfDynId = w->wfDynId;
|
||||
m->latency.init(&w->computeUnit->shader->tick_cnt);
|
||||
|
||||
m->s_type = SEG_GLOBAL;
|
||||
m->pipeId = GLBMEM_PIPE;
|
||||
m->latency.set(w->computeUnit->shader->ticks(64));
|
||||
w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
|
||||
w->outstanding_reqs_wr_gm++;
|
||||
w->wr_gm_reqs_in_pipe--;
|
||||
w->outstanding_reqs_rd_gm++;
|
||||
w->rd_gm_reqs_in_pipe--;
|
||||
w->outstanding_reqs++;
|
||||
w->mem_reqs_in_pipe--;
|
||||
}
|
||||
|
||||
void
|
||||
Call::MagicLoadGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
GPUDynInstPtr m = gpuDynInst;
|
||||
// calculate the address
|
||||
calcAddr(w, m);
|
||||
|
||||
m->m_op = Enums::MO_LD;
|
||||
m->m_type = U32::memType; //MemDataType::memType;
|
||||
m->v_type = U32::vgprType; //DestDataType::vgprType;
|
||||
|
||||
m->exec_mask = w->execMask();
|
||||
m->statusBitVector = 0;
|
||||
m->equiv = 0;
|
||||
m->n_reg = 1;
|
||||
m->memoryOrder = Enums::MEMORY_ORDER_NONE;
|
||||
m->scope = Enums::MEMORY_SCOPE_NONE;
|
||||
|
||||
// FIXME
|
||||
//m->dst_reg = this->dest.regIndex();
|
||||
|
||||
m->simdId = w->simdId;
|
||||
m->wfSlotId = w->wfSlotId;
|
||||
m->wfDynId = w->wfDynId;
|
||||
m->latency.init(&w->computeUnit->shader->tick_cnt);
|
||||
|
||||
m->s_type = SEG_GLOBAL;
|
||||
m->pipeId = GLBMEM_PIPE;
|
||||
m->latency.set(w->computeUnit->shader->ticks(1));
|
||||
w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
|
||||
w->outstanding_reqs_rd_gm++;
|
||||
w->rd_gm_reqs_in_pipe--;
|
||||
w->outstanding_reqs++;
|
||||
w->mem_reqs_in_pipe--;
|
||||
}
|
||||
|
||||
void
|
||||
Call::MagicXactCasLd(Wavefront *w)
|
||||
{
|
||||
const VectorMask &mask = w->get_pred();
|
||||
int src_val1 = 0;
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (mask[lane]) {
|
||||
src_val1 = src1.get<int>(w, lane, 1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!w->computeUnit->xactCasLoadMap.count(src_val1)) {
|
||||
w->computeUnit->xactCasLoadMap[src_val1] = ComputeUnit::waveQueue();
|
||||
w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue.clear();
|
||||
}
|
||||
|
||||
w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue
|
||||
.push_back(ComputeUnit::waveIdentifier(w->simdId, w->wfSlotId));
|
||||
}
|
||||
|
||||
void
|
||||
Call::MagicMostSigThread(Wavefront *w)
|
||||
{
|
||||
const VectorMask &mask = w->get_pred();
|
||||
unsigned mst = true;
|
||||
|
||||
for (int lane = VSZ - 1; lane >= 0; --lane) {
|
||||
if (mask[lane]) {
|
||||
dest.set<int>(w, lane, mst);
|
||||
mst = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
Call::MagicMostSigBroadcast(Wavefront *w)
|
||||
{
|
||||
const VectorMask &mask = w->get_pred();
|
||||
int res = 0;
|
||||
bool got_res = false;
|
||||
|
||||
for (int lane = VSZ - 1; lane >= 0; --lane) {
|
||||
if (mask[lane]) {
|
||||
if (!got_res) {
|
||||
res = src1.get<int>(w, lane, 1);
|
||||
got_res = true;
|
||||
}
|
||||
dest.set<int>(w, lane, res);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace HsailISA
src/arch/hsail/operand.cc (new file, 449 lines)
@@ -0,0 +1,449 @@
/*
|
||||
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Steve Reinhardt
|
||||
*/
|
||||
|
||||
#include "arch/hsail/operand.hh"
|
||||
|
||||
using namespace Brig;
|
||||
|
||||
bool
|
||||
BaseRegOperand::init(unsigned opOffset, const BrigObject *obj,
|
||||
unsigned &maxRegIdx, char _regFileChar)
|
||||
{
|
||||
regFileChar = _regFileChar;
|
||||
const BrigOperand *brigOp = obj->getOperand(opOffset);
|
||||
|
||||
if (brigOp->kind != BRIG_KIND_OPERAND_REGISTER)
|
||||
return false;
|
||||
|
||||
const BrigOperandRegister *brigRegOp = (const BrigOperandRegister*)brigOp;
|
||||
|
||||
regIdx = brigRegOp->regNum;
|
||||
|
||||
DPRINTF(GPUReg, "Operand: regNum: %d, kind: %d\n", regIdx,
|
||||
brigRegOp->regKind);
|
||||
|
||||
maxRegIdx = std::max(maxRegIdx, regIdx);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
ListOperand::init(unsigned opOffset, const BrigObject *obj)
|
||||
{
|
||||
const BrigOperand *brigOp = (const BrigOperand*)obj->getOperand(opOffset);
|
||||
|
||||
switch (brigOp->kind) {
|
||||
case BRIG_KIND_OPERAND_CODE_LIST:
|
||||
{
|
||||
const BrigOperandCodeList *opList =
|
||||
(const BrigOperandCodeList*)brigOp;
|
||||
|
||||
const Brig::BrigData *oprnd_data =
|
||||
obj->getBrigBaseData(opList->elements);
|
||||
|
||||
// Note: for calls, the dest operand list can have size 0.
|
||||
elementCount = oprnd_data->byteCount / 4;
|
||||
|
||||
DPRINTF(GPUReg, "Operand Code List: # elements: %d\n",
|
||||
elementCount);
|
||||
|
||||
for (int i = 0; i < elementCount; ++i) {
|
||||
unsigned *data_offset =
|
||||
(unsigned*)obj->getData(opList->elements + 4 * (i + 1));
|
||||
|
||||
const BrigDirectiveVariable *p =
|
||||
(const BrigDirectiveVariable*)obj->
|
||||
getCodeSectionEntry(*data_offset);
|
||||
|
||||
StorageElement *se = obj->currentCode->storageMap->
|
||||
findSymbol(BRIG_SEGMENT_ARG, p);
|
||||
|
||||
assert(se);
|
||||
callArgs.push_back(se);
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
fatal("ListOperand: bad operand kind %d\n", brigOp->kind);
|
||||
}
|
||||
}
|
||||
|
||||
std::string
|
||||
ListOperand::disassemble()
|
||||
{
|
||||
std::string res_str("");
|
||||
|
||||
for (auto it : callArgs) {
|
||||
res_str += csprintf("%s ", it->name.c_str());
|
||||
}
|
||||
|
||||
return res_str;
|
||||
}
|
||||
|
||||
void
|
||||
FunctionRefOperand::init(unsigned opOffset, const BrigObject *obj)
|
||||
{
|
||||
const BrigOperand *baseOp = obj->getOperand(opOffset);
|
||||
|
||||
if (baseOp->kind != BRIG_KIND_OPERAND_CODE_REF) {
|
||||
fatal("FunctionRefOperand: bad operand kind %d\n", baseOp->kind);
|
||||
}
|
||||
|
||||
const BrigOperandCodeRef *brigOp = (const BrigOperandCodeRef*)baseOp;
|
||||
|
||||
const BrigDirectiveExecutable *p =
|
||||
(const BrigDirectiveExecutable*)obj->getCodeSectionEntry(brigOp->ref);
|
||||
|
||||
func_name = obj->getString(p->name);
|
||||
}
|
||||
|
||||
std::string
|
||||
FunctionRefOperand::disassemble()
|
||||
{
|
||||
DPRINTF(GPUReg, "Operand Func-ref name: %s\n", func_name);
|
||||
|
||||
return csprintf("%s", func_name);
|
||||
}
|
||||
|
||||
bool
|
||||
BaseRegOperand::init_from_vect(unsigned opOffset, const BrigObject *obj,
|
||||
int at, unsigned &maxRegIdx, char _regFileChar)
|
||||
{
|
||||
regFileChar = _regFileChar;
|
||||
const BrigOperand *brigOp = obj->getOperand(opOffset);
|
||||
|
||||
if (brigOp->kind != BRIG_KIND_OPERAND_OPERAND_LIST)
|
||||
return false;
|
||||
|
||||
|
||||
const Brig::BrigOperandOperandList *brigRegVecOp =
|
||||
(const Brig::BrigOperandOperandList*)brigOp;
|
||||
|
||||
unsigned *data_offset =
|
||||
(unsigned*)obj->getData(brigRegVecOp->elements + 4 * (at + 1));
|
||||
|
||||
const BrigOperand *p =
|
||||
(const BrigOperand*)obj->getOperand(*data_offset);
|
||||
if (p->kind != BRIG_KIND_OPERAND_REGISTER) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const BrigOperandRegister *brigRegOp =(const BrigOperandRegister*)p;
|
||||
|
||||
regIdx = brigRegOp->regNum;
|
||||
|
||||
DPRINTF(GPUReg, "Operand: regNum: %d, kind: %d \n", regIdx,
|
||||
brigRegOp->regKind);
|
||||
|
||||
maxRegIdx = std::max(maxRegIdx, regIdx);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
BaseRegOperand::initWithStrOffset(unsigned strOffset, const BrigObject *obj,
|
||||
unsigned &maxRegIdx, char _regFileChar)
|
||||
{
|
||||
const char *name = obj->getString(strOffset);
|
||||
char *endptr;
|
||||
regIdx = strtoul(name + 2, &endptr, 10);
|
||||
|
||||
if (name[0] != '$' || name[1] != _regFileChar) {
|
||||
fatal("register operand parse error on \"%s\"\n", name);
|
||||
}
|
||||
|
||||
maxRegIdx = std::max(maxRegIdx, regIdx);
|
||||
}
|
||||
|
||||
unsigned SRegOperand::maxRegIdx;
|
||||
unsigned DRegOperand::maxRegIdx;
|
||||
unsigned CRegOperand::maxRegIdx;
|
||||
|
||||
std::string
|
||||
SRegOperand::disassemble()
|
||||
{
|
||||
return csprintf("$s%d", regIdx);
|
||||
}
|
||||
|
||||
std::string
|
||||
DRegOperand::disassemble()
|
||||
{
|
||||
return csprintf("$d%d", regIdx);
|
||||
}
|
||||
|
||||
std::string
|
||||
CRegOperand::disassemble()
|
||||
{
|
||||
return csprintf("$c%d", regIdx);
|
||||
}
|
||||
|
||||
BrigRegOperandInfo
|
||||
findRegDataType(unsigned opOffset, const BrigObject *obj)
|
||||
{
|
||||
const BrigOperand *baseOp = obj->getOperand(opOffset);
|
||||
|
||||
switch (baseOp->kind) {
|
||||
case BRIG_KIND_OPERAND_REGISTER:
|
||||
{
|
||||
const BrigOperandRegister *op = (BrigOperandRegister*)baseOp;
|
||||
|
||||
return BrigRegOperandInfo((BrigKind16_t)baseOp->kind,
|
||||
(BrigRegisterKind)op->regKind);
|
||||
}
|
||||
break;
|
||||
|
||||
case BRIG_KIND_OPERAND_OPERAND_LIST:
|
||||
{
|
||||
const BrigOperandOperandList *op =
|
||||
(BrigOperandOperandList*)baseOp;
|
||||
const BrigData *data_p = (BrigData*)obj->getData(op->elements);
|
||||
|
||||
|
||||
int num_operands = 0;
|
||||
BrigRegisterKind reg_kind = (BrigRegisterKind)0;
|
||||
for (int offset = 0; offset < data_p->byteCount; offset += 4) {
|
||||
const BrigOperand *op_p = (const BrigOperand *)
|
||||
obj->getOperand(((int *)data_p->bytes)[offset/4]);
|
||||
|
||||
if (op_p->kind == BRIG_KIND_OPERAND_REGISTER) {
|
||||
const BrigOperandRegister *brigRegOp =
|
||||
(const BrigOperandRegister*)op_p;
|
||||
reg_kind = (BrigRegisterKind)brigRegOp->regKind;
|
||||
} else if (op_p->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) {
|
||||
uint16_t num_bytes =
|
||||
((Brig::BrigOperandConstantBytes*)op_p)->base.byteCount
|
||||
- sizeof(BrigBase);
|
||||
if (num_bytes == sizeof(uint32_t)) {
|
||||
reg_kind = BRIG_REGISTER_KIND_SINGLE;
|
||||
} else if (num_bytes == sizeof(uint64_t)) {
|
||||
reg_kind = BRIG_REGISTER_KIND_DOUBLE;
|
||||
} else {
|
||||
fatal("OperandList: bad operand size %d\n", num_bytes);
|
||||
}
|
||||
} else {
|
||||
fatal("OperandList: bad operand kind %d\n", op_p->kind);
|
||||
}
|
||||
|
||||
num_operands++;
|
||||
}
|
||||
assert(baseOp->kind == BRIG_KIND_OPERAND_OPERAND_LIST);
|
||||
|
||||
return BrigRegOperandInfo((BrigKind16_t)baseOp->kind, reg_kind);
|
||||
}
|
||||
break;
|
||||
|
||||
case BRIG_KIND_OPERAND_ADDRESS:
|
||||
{
|
||||
const BrigOperandAddress *op = (BrigOperandAddress*)baseOp;
|
||||
|
||||
if (!op->reg) {
|
||||
BrigType type = BRIG_TYPE_NONE;
|
||||
|
||||
if (op->symbol) {
|
||||
const BrigDirective *dir = (BrigDirective*)
|
||||
obj->getCodeSectionEntry(op->symbol);
|
||||
|
||||
assert(dir->kind == BRIG_KIND_DIRECTIVE_VARIABLE);
|
||||
|
||||
const BrigDirectiveVariable *sym =
|
||||
(const BrigDirectiveVariable*)dir;
|
||||
|
||||
type = (BrigType)sym->type;
|
||||
}
|
||||
return BrigRegOperandInfo(BRIG_KIND_OPERAND_ADDRESS,
|
||||
(BrigType)type);
|
||||
} else {
|
||||
const BrigOperandAddress *b = (const BrigOperandAddress*)baseOp;
|
||||
const BrigOperand *reg = obj->getOperand(b->reg);
|
||||
const BrigOperandRegister *rop = (BrigOperandRegister*)reg;
|
||||
|
||||
return BrigRegOperandInfo(BRIG_KIND_OPERAND_REGISTER,
|
||||
(BrigRegisterKind)rop->regKind);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
fatal("AddrOperand: bad operand kind %d\n", baseOp->kind);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
AddrOperandBase::parseAddr(const BrigOperandAddress *op, const BrigObject *obj)
|
||||
{
|
||||
assert(op->base.kind == BRIG_KIND_OPERAND_ADDRESS);
|
||||
|
||||
const BrigDirective *d =
|
||||
(BrigDirective*)obj->getCodeSectionEntry(op->symbol);
|
||||
|
||||
assert(d->kind == BRIG_KIND_DIRECTIVE_VARIABLE);
|
||||
const BrigDirectiveVariable *sym = (BrigDirectiveVariable*)d;
|
||||
name = obj->getString(sym->name);
|
||||
|
||||
if (sym->segment != BRIG_SEGMENT_ARG) {
|
||||
storageElement =
|
||||
obj->currentCode->storageMap->findSymbol(sym->segment, name);
|
||||
assert(storageElement);
|
||||
offset = 0;
|
||||
} else {
|
||||
// sym->name does not work for BRIG_SEGMENT_ARG for the following case:
|
||||
//
|
||||
// void foo(int a);
|
||||
// void bar(double a);
|
||||
//
|
||||
// foo(...) --> arg_u32 %param_p0;
|
||||
// st_arg_u32 $s0, [%param_p0];
|
||||
// call &foo (%param_p0);
|
||||
// bar(...) --> arg_f64 %param_p0;
|
||||
// st_arg_u64 $d0, [%param_p0];
|
||||
// call &bar (%param_p0);
|
||||
//
|
||||
// Both functions use the same variable name (param_p0)!!!
|
||||
//
|
||||
// Maybe this is a bug in the compiler (I don't know).
|
||||
//
|
||||
// Solution:
|
||||
// Use directive pointer (BrigDirectiveVariable) to differentiate 2
|
||||
// versions of param_p0.
|
||||
//
|
||||
// Note this solution is kind of stupid, because we are pulling stuff
|
||||
// out of the brig binary via the directive pointer and putting it into
|
||||
// the symbol table, but now we are indexing the symbol table by the
|
||||
// brig directive pointer! It makes the symbol table sort of pointless.
|
||||
// But I don't want to mess with the rest of the infrastructure, so
|
||||
// let's go with this for now.
|
||||
//
|
||||
// When we update the compiler again, we should see if this problem goes
|
||||
// away. If so, we can fold some of this functionality into the code for
|
||||
// kernel arguments. If not, maybe we can index the symbol name on a
|
||||
// hash of the variable AND function name
|
||||
storageElement = obj->currentCode->
|
||||
storageMap->findSymbol((Brig::BrigSegment)sym->segment, sym);
|
||||
|
||||
assert(storageElement);
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t
|
||||
AddrOperandBase::calcUniformBase()
|
||||
{
|
||||
// start with offset, will be 0 if not specified
|
||||
uint64_t address = offset;
|
||||
|
||||
// add in symbol value if specified
|
||||
if (storageElement) {
|
||||
address += storageElement->offset;
|
||||
}
|
||||
|
||||
return address;
|
||||
}
|
||||
|
||||
std::string
|
||||
AddrOperandBase::disassemble(std::string reg_disassembly)
|
||||
{
|
||||
std::string disasm;
|
||||
|
||||
if (offset || reg_disassembly != "") {
|
||||
disasm += "[";
|
||||
|
||||
if (reg_disassembly != "") {
|
||||
disasm += reg_disassembly;
|
||||
|
||||
if (offset > 0) {
|
||||
disasm += "+";
|
||||
}
|
||||
}
|
||||
|
||||
if (offset) {
|
||||
disasm += csprintf("%d", offset);
|
||||
}
|
||||
|
||||
disasm += "]";
|
||||
} else if (name) {
|
||||
disasm += csprintf("[%s]", name);
|
||||
}
|
||||
|
||||
return disasm;
|
||||
}
|
||||
|
||||
void
|
||||
NoRegAddrOperand::init(unsigned opOffset, const BrigObject *obj)
|
||||
{
|
||||
const BrigOperand *baseOp = obj->getOperand(opOffset);
|
||||
|
||||
if (baseOp->kind == BRIG_KIND_OPERAND_ADDRESS) {
|
||||
BrigOperandAddress *addrOp = (BrigOperandAddress*)baseOp;
|
||||
parseAddr(addrOp, obj);
|
||||
offset = (uint64_t(addrOp->offset.hi) << 32) |
|
||||
uint64_t(addrOp->offset.lo);
|
||||
} else {
|
||||
fatal("NoRegAddrOperand: bad operand kind %d\n", baseOp->kind);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
std::string
|
||||
NoRegAddrOperand::disassemble()
|
||||
{
|
||||
return AddrOperandBase::disassemble(std::string(""));
|
||||
}
|
||||
|
||||
void
|
||||
LabelOperand::init(unsigned opOffset, const BrigObject *obj)
|
||||
{
|
||||
const BrigOperandCodeRef *op =
|
||||
(const BrigOperandCodeRef*)obj->getOperand(opOffset);
|
||||
|
||||
assert(op->base.kind == BRIG_KIND_OPERAND_CODE_REF);
|
||||
|
||||
const BrigDirective *dir =
|
||||
(const BrigDirective*)obj->getCodeSectionEntry(op->ref);
|
||||
|
||||
assert(dir->kind == BRIG_KIND_DIRECTIVE_LABEL);
|
||||
label = obj->currentCode->refLabel((BrigDirectiveLabel*)dir, obj);
|
||||
}
|
||||
|
||||
uint32_t
|
||||
LabelOperand::getTarget(Wavefront *w, int lane)
|
||||
{
|
||||
return label->get();
|
||||
}
|
||||
|
||||
std::string
|
||||
LabelOperand::disassemble()
|
||||
{
|
||||
return label->name;
|
||||
}
src/arch/hsail/operand.hh (new file, 768 lines)
@@ -0,0 +1,768 @@
/*
|
||||
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Steve Reinhardt
|
||||
*/
|
||||
|
||||
#ifndef __ARCH_HSAIL_OPERAND_HH__
|
||||
#define __ARCH_HSAIL_OPERAND_HH__
|
||||
|
||||
/**
|
||||
* @file operand.hh
|
||||
*
|
||||
* Defines classes encapsulating HSAIL instruction operands.
|
||||
*/
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "arch/hsail/Brig.h"
|
||||
#include "base/trace.hh"
|
||||
#include "base/types.hh"
|
||||
#include "debug/GPUReg.hh"
|
||||
#include "enums/RegisterType.hh"
|
||||
#include "gpu-compute/brig_object.hh"
|
||||
#include "gpu-compute/compute_unit.hh"
|
||||
#include "gpu-compute/hsail_code.hh"
|
||||
#include "gpu-compute/shader.hh"
|
||||
#include "gpu-compute/vector_register_file.hh"
|
||||
#include "gpu-compute/wavefront.hh"
|
||||
|
||||
class Label;
|
||||
class StorageElement;
|
||||
|
||||
class BaseOperand
|
||||
{
|
||||
public:
|
||||
Enums::RegisterType registerType;
|
||||
uint32_t regOperandSize;
|
||||
BaseOperand() { registerType = Enums::RT_NONE; regOperandSize = 0; }
|
||||
bool isVectorRegister() { return registerType == Enums::RT_VECTOR; }
|
||||
bool isScalarRegister() { return registerType == Enums::RT_SCALAR; }
|
||||
bool isCondRegister() { return registerType == Enums::RT_CONDITION; }
|
||||
unsigned int regIndex() { return 0; }
|
||||
uint32_t opSize() { return regOperandSize; }
|
||||
virtual ~BaseOperand() { }
|
||||
};
|
||||
|
||||
class BrigRegOperandInfo
|
||||
{
|
||||
public:
|
||||
Brig::BrigKind16_t kind;
|
||||
Brig::BrigType type;
|
||||
Brig::BrigRegisterKind regKind;
|
||||
|
||||
BrigRegOperandInfo(Brig::BrigKind16_t _kind,
|
||||
Brig::BrigRegisterKind _regKind)
|
||||
: kind(_kind), regKind(_regKind)
|
||||
{
|
||||
}
|
||||
|
||||
BrigRegOperandInfo(Brig::BrigKind16_t _kind, Brig::BrigType _type)
|
||||
: kind(_kind), type(_type)
|
||||
{
|
||||
}
|
||||
|
||||
BrigRegOperandInfo() : kind(Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES),
|
||||
type(Brig::BRIG_TYPE_NONE)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
BrigRegOperandInfo findRegDataType(unsigned opOffset, const BrigObject *obj);
|
||||
|
||||
class BaseRegOperand : public BaseOperand
|
||||
{
|
||||
public:
|
||||
unsigned regIdx;
|
||||
char regFileChar;
|
||||
|
||||
bool init(unsigned opOffset, const BrigObject *obj,
|
||||
unsigned &maxRegIdx, char _regFileChar);
|
||||
|
||||
bool init_from_vect(unsigned opOffset, const BrigObject *obj, int at,
|
||||
unsigned &maxRegIdx, char _regFileChar);
|
||||
|
||||
void initWithStrOffset(unsigned strOffset, const BrigObject *obj,
|
||||
unsigned &maxRegIdx, char _regFileChar);
|
||||
unsigned int regIndex() { return regIdx; }
|
||||
};
|
||||
|
||||
class SRegOperand : public BaseRegOperand
|
||||
{
|
||||
public:
|
||||
static unsigned maxRegIdx;
|
||||
|
||||
bool
|
||||
init(unsigned opOffset, const BrigObject *obj)
|
||||
{
|
||||
regOperandSize = sizeof(uint32_t);
|
||||
registerType = Enums::RT_VECTOR;
|
||||
|
||||
return BaseRegOperand::init(opOffset, obj, maxRegIdx, 's');
|
||||
}
|
||||
|
||||
bool
|
||||
init_from_vect(unsigned opOffset, const BrigObject *obj, int at)
|
||||
{
|
||||
regOperandSize = sizeof(uint32_t);
|
||||
registerType = Enums::RT_VECTOR;
|
||||
|
||||
return BaseRegOperand::init_from_vect(opOffset, obj, at, maxRegIdx,
|
||||
's');
|
||||
}
|
||||
|
||||
void
|
||||
initWithStrOffset(unsigned strOffset, const BrigObject *obj)
|
||||
{
|
||||
regOperandSize = sizeof(uint32_t);
|
||||
registerType = Enums::RT_VECTOR;
|
||||
|
||||
return BaseRegOperand::initWithStrOffset(strOffset, obj, maxRegIdx,
|
||||
's');
|
||||
}
|
||||
|
||||
template<typename OperandType>
|
||||
OperandType
|
||||
get(Wavefront *w, int lane)
|
||||
{
|
||||
assert(sizeof(OperandType) <= sizeof(uint32_t));
|
||||
assert(regIdx < w->maxSpVgprs);
|
||||
// if OperandType is smaller than 32-bit, we truncate the value
|
||||
OperandType ret;
|
||||
uint32_t vgprIdx;
|
||||
|
||||
switch (sizeof(OperandType)) {
|
||||
case 1: // 1 byte operand
|
||||
vgprIdx = w->remap(regIdx, 1, 1);
|
||||
ret = (w->computeUnit->vrf[w->simdId]->
|
||||
read<uint32_t>(vgprIdx, lane)) & 0xff;
|
||||
break;
|
||||
case 2: // 2 byte operand
|
||||
vgprIdx = w->remap(regIdx, 2, 1);
|
||||
ret = (w->computeUnit->vrf[w->simdId]->
|
||||
read<uint32_t>(vgprIdx, lane)) & 0xffff;
|
||||
break;
|
||||
case 4: // 4 byte operand
|
||||
vgprIdx = w->remap(regIdx,sizeof(OperandType), 1);
|
||||
ret = w->computeUnit->vrf[w->simdId]->
|
||||
read<OperandType>(vgprIdx, lane);
|
||||
break;
|
||||
default:
|
||||
panic("Bad OperandType\n");
|
||||
break;
|
||||
}
|
||||
|
||||
return (OperandType)ret;
|
||||
}
|
||||
|
||||
// special get method for compatibility with LabelOperand
|
||||
uint32_t
|
||||
getTarget(Wavefront *w, int lane)
|
||||
{
|
||||
return get<uint32_t>(w, lane);
|
||||
}
|
||||
|
||||
template<typename OperandType>
|
||||
void set(Wavefront *w, int lane, OperandType &val);
|
||||
std::string disassemble();
|
||||
};
|
||||
|
||||
template<typename OperandType>
|
||||
void
|
||||
SRegOperand::set(Wavefront *w, int lane, OperandType &val)
|
||||
{
|
||||
DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $s%d <- %d\n",
|
||||
w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx, val);
|
||||
|
||||
assert(sizeof(OperandType) == sizeof(uint32_t));
|
||||
assert(regIdx < w->maxSpVgprs);
|
||||
uint32_t vgprIdx = w->remap(regIdx, sizeof(OperandType), 1);
|
||||
w->computeUnit->vrf[w->simdId]->write<OperandType>(vgprIdx,val,lane);
|
||||
}
|
||||
|
||||
template<>
|
||||
inline void
|
||||
SRegOperand::set(Wavefront *w, int lane, uint64_t &val)
|
||||
{
|
||||
DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $s%d <- %d\n",
|
||||
w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx, val);
|
||||
|
||||
assert(regIdx < w->maxSpVgprs);
|
||||
uint32_t vgprIdx = w->remap(regIdx, sizeof(uint32_t), 1);
|
||||
w->computeUnit->vrf[w->simdId]->write<uint32_t>(vgprIdx, val, lane);
|
||||
}
|
||||
|
||||
class DRegOperand : public BaseRegOperand
|
||||
{
|
||||
public:
|
||||
static unsigned maxRegIdx;
|
||||
|
||||
bool
|
||||
init(unsigned opOffset, const BrigObject *obj)
|
||||
{
|
||||
regOperandSize = sizeof(uint64_t);
|
||||
registerType = Enums::RT_VECTOR;
|
||||
|
||||
return BaseRegOperand::init(opOffset, obj, maxRegIdx, 'd');
|
||||
}
|
||||
|
||||
bool
|
||||
init_from_vect(unsigned opOffset, const BrigObject *obj, int at)
|
||||
{
|
||||
regOperandSize = sizeof(uint64_t);
|
||||
registerType = Enums::RT_VECTOR;
|
||||
|
||||
return BaseRegOperand::init_from_vect(opOffset, obj, at, maxRegIdx,
|
||||
'd');
|
||||
}
|
||||
|
||||
void
|
||||
initWithStrOffset(unsigned strOffset, const BrigObject *obj)
|
||||
{
|
||||
regOperandSize = sizeof(uint64_t);
|
||||
registerType = Enums::RT_VECTOR;
|
||||
|
||||
return BaseRegOperand::initWithStrOffset(strOffset, obj, maxRegIdx,
|
||||
'd');
|
||||
}
|
||||
|
||||
template<typename OperandType>
|
||||
OperandType
|
||||
get(Wavefront *w, int lane)
|
||||
{
|
||||
assert(sizeof(OperandType) <= sizeof(uint64_t));
|
||||
// TODO: this check is valid only for HSAIL
|
||||
assert(regIdx < w->maxDpVgprs);
|
||||
uint32_t vgprIdx = w->remap(regIdx, sizeof(OperandType), 1);
|
||||
|
||||
return w->computeUnit->vrf[w->simdId]->read<OperandType>(vgprIdx,lane);
|
||||
}
|
||||
|
||||
template<typename OperandType>
|
||||
void
|
||||
set(Wavefront *w, int lane, OperandType &val)
|
||||
{
|
||||
DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $d%d <- %d\n",
|
||||
w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx,
|
||||
val);
|
||||
|
||||
assert(sizeof(OperandType) <= sizeof(uint64_t));
|
||||
// TODO: this check is valid only for HSAIL
|
||||
assert(regIdx < w->maxDpVgprs);
|
||||
uint32_t vgprIdx = w->remap(regIdx, sizeof(OperandType), 1);
|
||||
w->computeUnit->vrf[w->simdId]->write<OperandType>(vgprIdx,val,lane);
|
||||
}
|
||||
|
||||
std::string disassemble();
|
||||
};
|
||||
|
||||
class CRegOperand : public BaseRegOperand
|
||||
{
|
||||
public:
|
||||
static unsigned maxRegIdx;
|
||||
|
||||
bool
|
||||
init(unsigned opOffset, const BrigObject *obj)
|
||||
{
|
||||
regOperandSize = sizeof(uint8_t);
|
||||
registerType = Enums::RT_CONDITION;
|
||||
|
||||
return BaseRegOperand::init(opOffset, obj, maxRegIdx, 'c');
|
||||
}
|
||||
|
||||
bool
|
||||
init_from_vect(unsigned opOffset, const BrigObject *obj, int at)
|
||||
{
|
||||
regOperandSize = sizeof(uint8_t);
|
||||
registerType = Enums::RT_CONDITION;
|
||||
|
||||
return BaseRegOperand::init_from_vect(opOffset, obj, at, maxRegIdx,
|
||||
'c');
|
||||
}
|
||||
|
||||
void
|
||||
initWithStrOffset(unsigned strOffset, const BrigObject *obj)
|
||||
{
|
||||
regOperandSize = sizeof(uint8_t);
|
||||
registerType = Enums::RT_CONDITION;
|
||||
|
||||
return BaseRegOperand::initWithStrOffset(strOffset, obj, maxRegIdx,
|
||||
'c');
|
||||
}
|
||||
|
||||
template<typename OperandType>
|
||||
OperandType
|
||||
get(Wavefront *w, int lane)
|
||||
{
|
||||
assert(regIdx < w->condRegState->numRegs());
|
||||
|
||||
return w->condRegState->read<OperandType>((int)regIdx, lane);
|
||||
}
|
||||
|
||||
template<typename OperandType>
|
||||
void
|
||||
set(Wavefront *w, int lane, OperandType &val)
|
||||
{
|
||||
DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $c%d <- %d\n",
|
||||
w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx,
|
||||
val);
|
||||
|
||||
assert(regIdx < w->condRegState->numRegs());
|
||||
w->condRegState->write<OperandType>(regIdx,lane,val);
|
||||
}
|
||||
|
||||
std::string disassemble();
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
class ImmOperand : public BaseOperand
|
||||
{
|
||||
public:
|
||||
T bits;
|
||||
|
||||
bool init(unsigned opOffset, const BrigObject *obj);
|
||||
bool init_from_vect(unsigned opOffset, const BrigObject *obj, int at);
|
||||
std::string disassemble();
|
||||
|
||||
template<typename OperandType>
|
||||
OperandType
|
||||
get()
|
||||
{
|
||||
assert(sizeof(OperandType) <= sizeof(T));
|
||||
|
||||
return *(OperandType*)&bits;
|
||||
}
|
||||
|
||||
// This version of get() takes a WF* and a lane id for
|
||||
// compatibility with the register-based get() methods.
|
||||
template<typename OperandType>
|
||||
OperandType
|
||||
get(Wavefront *w, int lane)
|
||||
{
|
||||
return get<OperandType>();
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
bool
|
||||
ImmOperand<T>::init(unsigned opOffset, const BrigObject *obj)
|
||||
{
|
||||
const Brig::BrigOperand *brigOp = obj->getOperand(opOffset);
|
||||
|
||||
switch (brigOp->kind) {
|
||||
// this is an immediate operand
|
||||
case Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES:
|
||||
{
|
||||
DPRINTF(GPUReg, "sizeof(T): %lu, byteCount: %d\n", sizeof(T),
|
||||
brigOp->byteCount);
|
||||
|
||||
auto cbptr = (Brig::BrigOperandConstantBytes*)brigOp;
|
||||
|
||||
bits = *((T*)(obj->getData(cbptr->bytes + 4)));
|
||||
|
||||
return true;
|
||||
}
|
||||
break;
|
||||
|
||||
case Brig::BRIG_KIND_OPERAND_WAVESIZE:
|
||||
bits = VSZ;
|
||||
return true;
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool
|
||||
ImmOperand<T>::init_from_vect(unsigned opOffset, const BrigObject *obj, int at)
|
||||
{
|
||||
const Brig::BrigOperand *brigOp = obj->getOperand(opOffset);
|
||||
|
||||
if (brigOp->kind != Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
const Brig::BrigOperandOperandList *brigVecOp =
|
||||
(const Brig::BrigOperandOperandList *)brigOp;
|
||||
|
||||
unsigned *data_offset =
|
||||
(unsigned *)obj->getData(brigVecOp->elements + 4 * (at + 1));
|
||||
|
||||
const Brig::BrigOperand *p =
|
||||
(const Brig::BrigOperand *)obj->getOperand(*data_offset);
|
||||
|
||||
if (p->kind != Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return init(*data_offset, obj);
|
||||
}
|
||||
template<typename T>
|
||||
std::string
|
||||
ImmOperand<T>::disassemble()
|
||||
{
|
||||
return csprintf("0x%08x", bits);
|
||||
}
|
||||
|
||||
template<typename RegOperand, typename T>
|
||||
class RegOrImmOperand : public BaseOperand
|
||||
{
|
||||
private:
|
||||
bool is_imm;
|
||||
|
||||
public:
|
||||
void setImm(const bool value) { is_imm = value; }
|
||||
|
||||
ImmOperand<T> imm_op;
|
||||
RegOperand reg_op;
|
||||
|
||||
RegOrImmOperand() { is_imm = false; }
|
||||
void init(unsigned opOffset, const BrigObject *obj);
|
||||
void init_from_vect(unsigned opOffset, const BrigObject *obj, int at);
|
||||
std::string disassemble();
|
||||
|
||||
template<typename OperandType>
|
||||
OperandType
|
||||
get(Wavefront *w, int lane)
|
||||
{
|
||||
return is_imm ? imm_op.template get<OperandType>() :
|
||||
reg_op.template get<OperandType>(w, lane);
|
||||
}
|
||||
|
||||
uint32_t
|
||||
opSize()
|
||||
{
|
||||
if (!is_imm) {
|
||||
return reg_op.opSize();
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool
|
||||
isVectorRegister()
|
||||
{
|
||||
if (!is_imm) {
|
||||
return reg_op.registerType == Enums::RT_VECTOR;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool
|
||||
isCondRegister()
|
||||
{
|
||||
if (!is_imm) {
|
||||
return reg_op.registerType == Enums::RT_CONDITION;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
bool
|
||||
isScalarRegister()
|
||||
{
|
||||
if (!is_imm) {
|
||||
return reg_op.registerType == Enums::RT_SCALAR;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
unsigned int
|
||||
regIndex()
|
||||
{
|
||||
if (!is_imm) {
|
||||
return reg_op.regIndex();
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename RegOperand, typename T>
|
||||
void
|
||||
RegOrImmOperand<RegOperand, T>::init(unsigned opOffset, const BrigObject *obj)
|
||||
{
|
||||
is_imm = false;
|
||||
|
||||
if (reg_op.init(opOffset, obj)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (imm_op.init(opOffset, obj)) {
|
||||
is_imm = true;
|
||||
return;
|
||||
}
|
||||
|
||||
fatal("RegOrImmOperand::init(): bad operand kind %d\n",
|
||||
obj->getOperand(opOffset)->kind);
|
||||
}
|
||||
|
||||
template<typename RegOperand, typename T>
|
||||
void
|
||||
RegOrImmOperand<RegOperand, T>::init_from_vect(unsigned opOffset,
|
||||
const BrigObject *obj, int at)
|
||||
{
|
||||
if (reg_op.init_from_vect(opOffset, obj, at)) {
|
||||
is_imm = false;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if (imm_op.init_from_vect(opOffset, obj, at)) {
|
||||
is_imm = true;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
fatal("RegOrImmOperand::init(): bad operand kind %d\n",
|
||||
obj->getOperand(opOffset)->kind);
|
||||
}
|
||||
|
||||
template<typename RegOperand, typename T>
|
||||
std::string
|
||||
RegOrImmOperand<RegOperand, T>::disassemble()
|
||||
{
|
||||
return is_imm ? imm_op.disassemble() : reg_op.disassemble();
|
||||
}
|
||||
|
||||
typedef RegOrImmOperand<SRegOperand, uint32_t> SRegOrImmOperand;
|
||||
typedef RegOrImmOperand<DRegOperand, uint64_t> DRegOrImmOperand;
|
||||
typedef RegOrImmOperand<CRegOperand, bool> CRegOrImmOperand;
|
||||
|
||||
class AddrOperandBase : public BaseOperand
|
||||
{
|
||||
protected:
|
||||
// helper function for init()
|
||||
void parseAddr(const Brig::BrigOperandAddress *op, const BrigObject *obj);
|
||||
|
||||
// helper function for disassemble()
|
||||
std::string disassemble(std::string reg_disassembly);
|
||||
uint64_t calcUniformBase();
|
||||
|
||||
public:
|
||||
virtual void calcVector(Wavefront *w, uint64_t *addrVec) = 0;
|
||||
virtual uint64_t calcLane(Wavefront *w, int lane=0) = 0;
|
||||
|
||||
uint64_t offset;
|
||||
const char *name = nullptr;
|
||||
StorageElement *storageElement;
|
||||
};
|
||||
|
||||
template<typename RegOperandType>
|
||||
class RegAddrOperand : public AddrOperandBase
|
||||
{
|
||||
public:
|
||||
RegOperandType reg;
|
||||
void init(unsigned opOffset, const BrigObject *obj);
|
||||
uint64_t calcUniform();
|
||||
void calcVector(Wavefront *w, uint64_t *addrVec);
|
||||
uint64_t calcLane(Wavefront *w, int lane=0);
|
||||
uint32_t opSize() { return reg.opSize(); }
|
||||
bool isVectorRegister() { return reg.registerType == Enums::RT_VECTOR; }
|
||||
bool isCondRegister() { return reg.registerType == Enums::RT_CONDITION; }
|
||||
bool isScalarRegister() { return reg.registerType == Enums::RT_SCALAR; }
|
||||
unsigned int regIndex() { return reg.regIndex(); }
|
||||
std::string disassemble();
|
||||
};
|
||||
|
||||
template<typename RegOperandType>
|
||||
void
|
||||
RegAddrOperand<RegOperandType>::init(unsigned opOffset, const BrigObject *obj)
|
||||
{
|
||||
using namespace Brig;
|
||||
|
||||
const BrigOperand *baseOp = obj->getOperand(opOffset);
|
||||
|
||||
switch (baseOp->kind) {
|
||||
case BRIG_KIND_OPERAND_ADDRESS:
|
||||
{
|
||||
const BrigOperandAddress *op = (BrigOperandAddress*)baseOp;
|
||||
storageElement = nullptr;
|
||||
|
||||
offset = (uint64_t(op->offset.hi) << 32) | uint64_t(op->offset.lo);
|
||||
reg.init(op->reg, obj);
|
||||
|
||||
if (reg.regFileChar == 's') {
|
||||
reg.regOperandSize = sizeof(uint32_t);
|
||||
registerType = Enums::RT_VECTOR;
|
||||
}
|
||||
else if (reg.regFileChar == 'd') {
|
||||
reg.regOperandSize = sizeof(uint64_t);
|
||||
registerType = Enums::RT_VECTOR;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
fatal("RegAddrOperand: bad operand kind %d\n", baseOp->kind);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename RegOperandType>
|
||||
uint64_t
|
||||
RegAddrOperand<RegOperandType>::calcUniform()
|
||||
{
|
||||
fatal("can't do calcUniform() on register-based address\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
template<typename RegOperandType>
|
||||
void
|
||||
RegAddrOperand<RegOperandType>::calcVector(Wavefront *w, uint64_t *addrVec)
|
||||
{
|
||||
Addr address = calcUniformBase();
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane) {
|
||||
if (w->execMask(lane)) {
|
||||
if (reg.regFileChar == 's') {
|
||||
addrVec[lane] = address + reg.template get<uint32_t>(w, lane);
|
||||
} else {
|
||||
addrVec[lane] = address + reg.template get<Addr>(w, lane);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename RegOperandType>
|
||||
uint64_t
|
||||
RegAddrOperand<RegOperandType>::calcLane(Wavefront *w, int lane)
|
||||
{
|
||||
Addr address = calcUniformBase();
|
||||
|
||||
return address + reg.template get<Addr>(w, lane);
|
||||
}
|
||||
|
||||
template<typename RegOperandType>
|
||||
std::string
|
||||
RegAddrOperand<RegOperandType>::disassemble()
|
||||
{
|
||||
return AddrOperandBase::disassemble(reg.disassemble());
|
||||
}
|
||||
|
||||
typedef RegAddrOperand<SRegOperand> SRegAddrOperand;
|
||||
typedef RegAddrOperand<DRegOperand> DRegAddrOperand;
|
||||
|
||||
class NoRegAddrOperand : public AddrOperandBase
|
||||
{
|
||||
public:
|
||||
void init(unsigned opOffset, const BrigObject *obj);
|
||||
uint64_t calcUniform();
|
||||
void calcVector(Wavefront *w, uint64_t *addrVec);
|
||||
uint64_t calcLane(Wavefront *w, int lane=0);
|
||||
std::string disassemble();
|
||||
};
|
||||
|
||||
inline uint64_t
|
||||
NoRegAddrOperand::calcUniform()
|
||||
{
|
||||
return AddrOperandBase::calcUniformBase();
|
||||
}
|
||||
|
||||
inline uint64_t
|
||||
NoRegAddrOperand::calcLane(Wavefront *w, int lane)
|
||||
{
|
||||
return calcUniform();
|
||||
}
|
||||
|
||||
inline void
|
||||
NoRegAddrOperand::calcVector(Wavefront *w, uint64_t *addrVec)
|
||||
{
|
||||
uint64_t address = calcUniformBase();
|
||||
|
||||
for (int lane = 0; lane < VSZ; ++lane)
|
||||
addrVec[lane] = address;
|
||||
}
|
||||
|
||||
class LabelOperand : public BaseOperand
|
||||
{
|
||||
public:
|
||||
Label *label;
|
||||
|
||||
void init(unsigned opOffset, const BrigObject *obj);
|
||||
std::string disassemble();
|
||||
|
||||
// special get method for compatibility with SRegOperand
|
||||
uint32_t getTarget(Wavefront *w, int lane);
|
||||
|
||||
};
|
||||
|
||||
class ListOperand : public BaseOperand
|
||||
{
|
||||
public:
|
||||
int elementCount;
|
||||
std::vector<StorageElement*> callArgs;
|
||||
|
||||
int
|
||||
getSrcOperand(int idx)
|
||||
{
|
||||
DPRINTF(GPUReg, "getSrcOperand, idx: %d, sz_args: %d\n", idx,
|
||||
callArgs.size());
|
||||
|
||||
return callArgs.at(idx)->offset;
|
||||
}
|
||||
|
||||
void init(unsigned opOffset, const BrigObject *obj);
|
||||
|
||||
std::string disassemble();
|
||||
|
||||
template<typename OperandType>
|
||||
OperandType
|
||||
get(Wavefront *w, int lane, int arg_idx)
|
||||
{
|
||||
return w->readCallArgMem<OperandType>(lane, getSrcOperand(arg_idx));
|
||||
}
|
||||
|
||||
template<typename OperandType>
|
||||
void
|
||||
set(Wavefront *w, int lane, OperandType val)
|
||||
{
|
||||
w->writeCallArgMem<OperandType>(lane, getSrcOperand(0), val);
|
||||
DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: arg[%d] <- %d\n",
|
||||
w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane,
|
||||
getSrcOperand(0), val);
|
||||
}
|
||||
};
|
||||
|
||||
class FunctionRefOperand : public BaseOperand
|
||||
{
|
||||
public:
|
||||
const char *func_name;
|
||||
|
||||
void init(unsigned opOffset, const BrigObject *obj);
|
||||
std::string disassemble();
|
||||
};
|
||||
|
||||
#endif // __ARCH_HSAIL_OPERAND_HH__
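The address-operand classes above split a memory address into a uniform part (the symbol base plus the immediate offset carried by the BrigOperandAddress) and an optional per-lane register part. As a minimal illustrative sketch only (not part of this commit), the per-lane calculation done by RegAddrOperand::calcVector() amounts to the following; the names and the Python form here are ours.

# Illustrative sketch: per-lane address generation in the spirit of
# RegAddrOperand::calcVector(). 'base' stands in for calcUniformBase(),
# 'reg_vals' for the per-lane register contents, and 'exec_mask' for the
# wavefront's execution mask.
def calc_vector(base, reg_vals, exec_mask):
    addr_vec = [None] * len(reg_vals)
    for lane, (val, active) in enumerate(zip(reg_vals, exec_mask)):
        if active:
            addr_vec[lane] = base + val
    return addr_vec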
310  src/gpu-compute/GPU.py  Normal file
@@ -0,0 +1,310 @@
#
|
||||
# Copyright (c) 2015 Advanced Micro Devices, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# For use for simulation and test purposes only
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
# may be used to endorse or promote products derived from this software
|
||||
# without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# Author: Steve Reinhardt
|
||||
#
|
||||
|
||||
from ClockedObject import ClockedObject
|
||||
from Device import DmaDevice
|
||||
from m5.defines import buildEnv
|
||||
from m5.params import *
|
||||
from m5.proxy import *
|
||||
from m5.SimObject import SimObject
|
||||
from MemObject import MemObject
|
||||
from Process import EmulatedDriver
|
||||
from Bridge import Bridge
|
||||
from LdsState import LdsState
|
||||
|
||||
class PrefetchType(Enum): vals = [
|
||||
'PF_CU',
|
||||
'PF_PHASE',
|
||||
'PF_WF',
|
||||
'PF_STRIDE',
|
||||
'PF_END',
|
||||
]
|
||||
|
||||
class VectorRegisterFile(SimObject):
|
||||
type = 'VectorRegisterFile'
|
||||
cxx_class = 'VectorRegisterFile'
|
||||
cxx_header = 'gpu-compute/vector_register_file.hh'
|
||||
|
||||
simd_id = Param.Int(0, 'SIMD ID associated with this VRF')
|
||||
num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD')
|
||||
min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF')
|
||||
|
||||
class Wavefront(SimObject):
|
||||
type = 'Wavefront'
|
||||
cxx_class = 'Wavefront'
|
||||
cxx_header = 'gpu-compute/wavefront.hh'
|
||||
|
||||
simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)')
|
||||
wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)')
|
||||
|
||||
class ComputeUnit(MemObject):
|
||||
type = 'ComputeUnit'
|
||||
cxx_class = 'ComputeUnit'
|
||||
cxx_header = 'gpu-compute/compute_unit.hh'
|
||||
|
||||
wavefronts = VectorParam.Wavefront('Number of wavefronts')
|
||||
wfSize = Param.Int(64, 'Wavefront size (in work items)')
|
||||
num_SIMDs = Param.Int(4, 'number of SIMD units per CU')
|
||||
|
||||
spbypass_pipe_length = Param.Int(4, 'vector ALU Single Precision bypass '\
|
||||
'latency')
|
||||
|
||||
dpbypass_pipe_length = Param.Int(8, 'vector ALU Double Precision bypass '\
|
||||
'latency')
|
||||
|
||||
issue_period = Param.Int(4, 'number of cycles per issue period')
|
||||
num_global_mem_pipes = Param.Int(1, 'number of global memory pipes per CU')
|
||||
num_shared_mem_pipes = Param.Int(1, 'number of shared memory pipes per CU')
|
||||
n_wf = Param.Int(1, 'Number of wavefront slots per SIMD')
|
||||
mem_req_latency = Param.Int(9, "Latency for request from the cu to ruby. "\
|
||||
"Represents the pipeline to reach the TCP and "\
|
||||
"specified in GPU clock cycles")
|
||||
mem_resp_latency = Param.Int(9, "Latency for responses from ruby to the "\
|
||||
"cu. Represents the pipeline between the TCP "\
|
||||
"and cu as well as TCP data array access. "\
|
||||
"Specified in GPU clock cycles")
|
||||
system = Param.System(Parent.any, "system object")
|
||||
cu_id = Param.Int('CU id')
|
||||
vrf_to_coalescer_bus_width = Param.Int(32, "VRF->Coalescer data bus width "\
|
||||
"in bytes")
|
||||
coalescer_to_vrf_bus_width = Param.Int(32, "Coalescer->VRF data bus width "\
|
||||
"in bytes")
|
||||
|
||||
memory_port = VectorMasterPort("Port to the memory system")
|
||||
translation_port = VectorMasterPort('Port to the TLB hierarchy')
|
||||
sqc_port = MasterPort("Port to the SQC (I-cache)")
|
||||
sqc_tlb_port = MasterPort("Port to the TLB for the SQC (I-cache)")
|
||||
perLaneTLB = Param.Bool(False, "enable per-lane TLB")
|
||||
prefetch_depth = Param.Int(0, "Number of prefetches triggered at a time "\
|
||||
"(0 turns off prefetching)")
|
||||
prefetch_stride = Param.Int(1, "Fixed Prefetch Stride (1 means next-page)")
|
||||
prefetch_prev_type = Param.PrefetchType('PF_PHASE', "Prefetch the stride "\
|
||||
"from last mem req in lane of "\
|
||||
"CU|Phase|Wavefront")
|
||||
execPolicy = Param.String("OLDEST-FIRST", "WF execution selection policy")
|
||||
xactCasMode = Param.Bool(False, "Behavior of xact_cas_load magic instr.")
|
||||
debugSegFault = Param.Bool(False, "enable debugging GPU seg faults")
|
||||
functionalTLB = Param.Bool(False, "Assume TLB causes no delay")
|
||||
|
||||
localMemBarrier = Param.Bool(False, "Assume Barriers do not wait on "\
|
||||
"kernel end")
|
||||
|
||||
countPages = Param.Bool(False, "Generate per-CU file of all pages touched "\
|
||||
"and how many times")
|
||||
global_mem_queue_size = Param.Int(256, "Number of entries in the global "
|
||||
"memory pipeline's queues")
|
||||
local_mem_queue_size = Param.Int(256, "Number of entries in the local "
|
||||
"memory pipeline's queues")
|
||||
ldsBus = Bridge() # the bridge between the CU and its LDS
|
||||
ldsPort = MasterPort("The port that goes to the LDS")
|
||||
localDataStore = Param.LdsState("the LDS for this CU")
|
||||
|
||||
vector_register_file = VectorParam.VectorRegisterFile("Vector register "\
|
||||
"file")
|
||||
|
||||
class Shader(ClockedObject):
|
||||
type = 'Shader'
|
||||
cxx_class = 'Shader'
|
||||
cxx_header = 'gpu-compute/shader.hh'
|
||||
|
||||
CUs = VectorParam.ComputeUnit('Number of compute units')
|
||||
n_wf = Param.Int(1, 'Number of wavefront slots per SIMD')
|
||||
impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets into
|
||||
ruby at kernel boundaries""")
|
||||
separate_acquire_release = Param.Bool(False,
|
||||
"""Do ld_acquire/st_release generate separate requests for the
|
||||
acquire and release?""")
|
||||
globalmem = Param.MemorySize('64kB', 'Memory size')
|
||||
timing = Param.Bool(False, 'timing memory accesses')
|
||||
|
||||
cpu_pointer = Param.BaseCPU(NULL, "pointer to base CPU")
|
||||
translation = Param.Bool(False, "address translation")
|
||||
|
||||
class ClDriver(EmulatedDriver):
|
||||
type = 'ClDriver'
|
||||
cxx_header = 'gpu-compute/cl_driver.hh'
|
||||
codefile = VectorParam.String('code file name(s)')
|
||||
|
||||
class GpuDispatcher(DmaDevice):
|
||||
type = 'GpuDispatcher'
|
||||
cxx_header = 'gpu-compute/dispatcher.hh'
|
||||
# put at 8GB line for now
|
||||
pio_addr = Param.Addr(0x200000000, "Device Address")
|
||||
pio_latency = Param.Latency('1ns', "Programmed IO latency")
|
||||
shader_pointer = Param.Shader('pointer to shader')
|
||||
translation_port = MasterPort('Port to the dispatcher TLB')
|
||||
cpu = Param.BaseCPU("CPU to wake up on kernel completion")
|
||||
|
||||
cl_driver = Param.ClDriver('pointer to driver')
|
||||
|
||||
class OpType(Enum): vals = [
|
||||
'OT_NULL',
|
||||
'OT_ALU',
|
||||
'OT_SPECIAL',
|
||||
'OT_GLOBAL_READ',
|
||||
'OT_GLOBAL_WRITE',
|
||||
'OT_GLOBAL_ATOMIC',
|
||||
'OT_GLOBAL_HIST',
|
||||
'OT_GLOBAL_LDAS',
|
||||
'OT_SHARED_READ',
|
||||
'OT_SHARED_WRITE',
|
||||
'OT_SHARED_ATOMIC',
|
||||
'OT_SHARED_HIST',
|
||||
'OT_SHARED_LDAS',
|
||||
'OT_PRIVATE_READ',
|
||||
'OT_PRIVATE_WRITE',
|
||||
'OT_PRIVATE_ATOMIC',
|
||||
'OT_PRIVATE_HIST',
|
||||
'OT_PRIVATE_LDAS',
|
||||
'OT_SPILL_READ',
|
||||
'OT_SPILL_WRITE',
|
||||
'OT_SPILL_ATOMIC',
|
||||
'OT_SPILL_HIST',
|
||||
'OT_SPILL_LDAS',
|
||||
'OT_READONLY_READ',
|
||||
'OT_READONLY_WRITE',
|
||||
'OT_READONLY_ATOMIC',
|
||||
'OT_READONLY_HIST',
|
||||
'OT_READONLY_LDAS',
|
||||
'OT_FLAT_READ',
|
||||
'OT_FLAT_WRITE',
|
||||
'OT_FLAT_ATOMIC',
|
||||
'OT_FLAT_HIST',
|
||||
'OT_FLAT_LDAS',
|
||||
'OT_KERN_READ',
|
||||
'OT_BRANCH',
|
||||
|
||||
# note: Only the OT_BOTH_MEMFENCE seems to be supported in the 1.0F version
|
||||
# of the compiler.
|
||||
'OT_SHARED_MEMFENCE',
|
||||
'OT_GLOBAL_MEMFENCE',
|
||||
'OT_BOTH_MEMFENCE',
|
||||
|
||||
'OT_BARRIER',
|
||||
'OT_PRINT',
|
||||
'OT_RET',
|
||||
'OT_NOP',
|
||||
'OT_ARG'
|
||||
]
|
||||
|
||||
class MemType(Enum): vals = [
|
||||
'M_U8',
|
||||
'M_U16',
|
||||
'M_U32',
|
||||
'M_U64',
|
||||
'M_S8',
|
||||
'M_S16',
|
||||
'M_S32',
|
||||
'M_S64',
|
||||
'M_F16',
|
||||
'M_F32',
|
||||
'M_F64',
|
||||
]
|
||||
|
||||
class MemOpType(Enum): vals = [
|
||||
'MO_LD',
|
||||
'MO_ST',
|
||||
'MO_LDAS',
|
||||
'MO_LDA',
|
||||
'MO_AAND',
|
||||
'MO_AOR',
|
||||
'MO_AXOR',
|
||||
'MO_ACAS',
|
||||
'MO_AEXCH',
|
||||
'MO_AADD',
|
||||
'MO_ASUB',
|
||||
'MO_AINC',
|
||||
'MO_ADEC',
|
||||
'MO_AMAX',
|
||||
'MO_AMIN',
|
||||
'MO_ANRAND',
|
||||
'MO_ANROR',
|
||||
'MO_ANRXOR',
|
||||
'MO_ANRCAS',
|
||||
'MO_ANREXCH',
|
||||
'MO_ANRADD',
|
||||
'MO_ANRSUB',
|
||||
'MO_ANRINC',
|
||||
'MO_ANRDEC',
|
||||
'MO_ANRMAX',
|
||||
'MO_ANRMIN',
|
||||
'MO_HAND',
|
||||
'MO_HOR',
|
||||
'MO_HXOR',
|
||||
'MO_HCAS',
|
||||
'MO_HEXCH',
|
||||
'MO_HADD',
|
||||
'MO_HSUB',
|
||||
'MO_HINC',
|
||||
'MO_HDEC',
|
||||
'MO_HMAX',
|
||||
'MO_HMIN',
|
||||
'MO_UNDEF'
|
||||
]
|
||||
|
||||
class StorageClassType(Enum): vals = [
|
||||
'SC_SPILL',
|
||||
'SC_GLOBAL',
|
||||
'SC_SHARED',
|
||||
'SC_PRIVATE',
|
||||
'SC_READONLY',
|
||||
'SC_KERNARG',
|
||||
'SC_NONE',
|
||||
]
|
||||
|
||||
class RegisterType(Enum): vals = [
|
||||
'RT_VECTOR',
|
||||
'RT_SCALAR',
|
||||
'RT_CONDITION',
|
||||
'RT_HARDWARE',
|
||||
'RT_NONE',
|
||||
]
|
||||
|
||||
class GenericMemoryOrder(Enum): vals = [
|
||||
'MEMORY_ORDER_NONE',
|
||||
'MEMORY_ORDER_RELAXED',
|
||||
'MEMORY_ORDER_SC_ACQUIRE',
|
||||
'MEMORY_ORDER_SC_RELEASE',
|
||||
'MEMORY_ORDER_SC_ACQUIRE_RELEASE',
|
||||
]
|
||||
|
||||
class GenericMemoryScope(Enum): vals = [
|
||||
'MEMORY_SCOPE_NONE',
|
||||
'MEMORY_SCOPE_WORKITEM',
|
||||
'MEMORY_SCOPE_WAVEFRONT',
|
||||
'MEMORY_SCOPE_WORKGROUP',
|
||||
'MEMORY_SCOPE_DEVICE',
|
||||
'MEMORY_SCOPE_SYSTEM',
|
||||
]
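GPU.py only declares these SimObjects and their parameters; a configuration script is expected to instantiate and connect them. The following is an illustrative sketch (not part of this commit) of how a single compute unit might be assembled from the objects declared above in a gem5 Python config; all concrete values are placeholders.

# Illustrative sketch, assuming the SimObjects above are available in a gem5
# configuration environment. Parameter values are assumptions.
from m5.objects import *  # assumed gem5 config import

num_simds, n_wf, wf_size = 4, 1, 64

wavefronts = [Wavefront(simdId=i, wf_slot_id=j)
              for i in range(num_simds) for j in range(n_wf)]
vrfs = [VectorRegisterFile(simd_id=i, num_regs_per_simd=2048)
        for i in range(num_simds)]

cu = ComputeUnit(cu_id=0, num_SIMDs=num_simds, n_wf=n_wf, wfSize=wf_size,
                 wavefronts=wavefronts, vector_register_file=vrfs,
                 localDataStore=LdsState(size=65536, banks=32))

shader = Shader(n_wf=n_wf, CUs=[cu], timing=True)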
51  src/gpu-compute/LdsState.py  Normal file
@@ -0,0 +1,51 @@
#
|
||||
# Copyright (c) 2015 Advanced Micro Devices, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# For use for simulation and test purposes only
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
# may be used to endorse or promote products derived from this software
|
||||
# without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# Author: Joe Gross
|
||||
#
|
||||
|
||||
from m5.defines import buildEnv
|
||||
from m5.params import *
|
||||
from m5.proxy import *
|
||||
|
||||
from MemObject import MemObject
|
||||
|
||||
class LdsState(MemObject):
|
||||
type = 'LdsState'
|
||||
cxx_class = 'LdsState'
|
||||
cxx_header = 'gpu-compute/lds_state.hh'
|
||||
size = Param.Int(65536, 'the size of the LDS')
|
||||
range = Param.AddrRange('64kB', "address space of the LDS")
|
||||
bankConflictPenalty = Param.Int(1, 'penalty per LDS bank conflict when '\
|
||||
'accessing data')
|
||||
banks = Param.Int(32, 'Number of LDS banks')
|
||||
cuPort = SlavePort("port that goes to the compute unit")
99  src/gpu-compute/SConscript  Normal file
@@ -0,0 +1,99 @@
# -*- mode:python -*-
|
||||
|
||||
#
|
||||
# Copyright (c) 2015 Advanced Micro Devices, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# For use for simulation and test purposes only
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
# may be used to endorse or promote products derived from this software
|
||||
# without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# Author: Anthony Gutierrez
|
||||
#
|
||||
|
||||
Import('*')
|
||||
|
||||
if not env['BUILD_GPU']:
|
||||
Return()
|
||||
|
||||
SimObject('GPU.py')
|
||||
SimObject('LdsState.py')
|
||||
SimObject('X86GPUTLB.py')
|
||||
|
||||
if env['TARGET_GPU_ISA'] == 'hsail':
|
||||
Source('brig_object.cc')
|
||||
Source('hsail_code.cc')
|
||||
|
||||
Source('cl_driver.cc')
|
||||
Source('compute_unit.cc')
|
||||
Source('condition_register_state.cc')
|
||||
Source('dispatcher.cc')
|
||||
Source('exec_stage.cc')
|
||||
Source('fetch_stage.cc')
|
||||
Source('fetch_unit.cc')
|
||||
Source('global_memory_pipeline.cc')
|
||||
Source('gpu_dyn_inst.cc')
|
||||
Source('gpu_exec_context.cc')
|
||||
Source('gpu_static_inst.cc')
|
||||
Source('gpu_tlb.cc')
|
||||
Source('hsa_object.cc')
|
||||
Source('kernel_cfg.cc')
|
||||
Source('lds_state.cc')
|
||||
Source('local_memory_pipeline.cc')
|
||||
Source('of_scheduling_policy.cc')
|
||||
Source('pool_manager.cc')
|
||||
Source('rr_scheduling_policy.cc')
|
||||
Source('schedule_stage.cc')
|
||||
Source('scheduler.cc')
|
||||
Source('scoreboard_check_stage.cc')
|
||||
Source('shader.cc')
|
||||
Source('simple_pool_manager.cc')
|
||||
Source('tlb_coalescer.cc')
|
||||
Source('vector_register_file.cc')
|
||||
Source('vector_register_state.cc')
|
||||
Source('wavefront.cc')
|
||||
|
||||
DebugFlag('BRIG')
|
||||
DebugFlag('GPUCoalescer')
|
||||
DebugFlag('GPUDisp')
|
||||
DebugFlag('GPUExec')
|
||||
DebugFlag('GPUFetch')
|
||||
DebugFlag('GPUHsailCFInfo')
|
||||
DebugFlag('GPUMem')
|
||||
DebugFlag('GPUPort')
|
||||
DebugFlag('GPUPrefetch')
|
||||
DebugFlag('GPUReg')
|
||||
DebugFlag('GPUSync')
|
||||
DebugFlag('GPUTLB')
|
||||
DebugFlag('HSALoader')
|
||||
DebugFlag('HSAIL')
|
||||
DebugFlag('HSAILObject')
|
||||
DebugFlag('Predictor')
|
||||
DebugFlag('WavefrontStack')
|
||||
|
||||
CompoundFlag('GPUALL', ['GPUCoalescer', 'GPUDisp', 'GPUExec', 'GPUFetch',
|
||||
'GPUMem', 'GPUPort', 'GPUSync', 'GPUTLB', 'HSAIL'])
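Because of the guards above, none of these sources are compiled unless the build is configured with BUILD_GPU, and the HSAIL front end is only pulled in when TARGET_GPU_ISA is hsail. In practice that means enabling the model with something like scons BUILD_GPU=True TARGET_GPU_ISA=hsail build/<ISA>/gem5.opt; the exact build target path depends on the host ISA configuration and is an assumption here, not something fixed by this SConscript.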
77  src/gpu-compute/X86GPUTLB.py  Normal file
@@ -0,0 +1,77 @@
#
|
||||
# Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# For use for simulation and test purposes only
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
# may be used to endorse or promote products derived from this software
|
||||
# without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# Author: Lisa Hsu
|
||||
#
|
||||
|
||||
from m5.defines import buildEnv
|
||||
from m5.params import *
|
||||
from m5.proxy import *
|
||||
|
||||
from m5.objects.MemObject import MemObject
|
||||
|
||||
if buildEnv['FULL_SYSTEM']:
|
||||
class X86PagetableWalker(MemObject):
|
||||
type = 'X86PagetableWalker'
|
||||
cxx_class = 'X86ISA::Walker'
|
||||
port = SlavePort("Port for the hardware table walker")
|
||||
system = Param.System(Parent.any, "system object")
|
||||
|
||||
class X86GPUTLB(MemObject):
|
||||
type = 'X86GPUTLB'
|
||||
cxx_class = 'X86ISA::GpuTLB'
|
||||
cxx_header = 'gpu-compute/gpu_tlb.hh'
|
||||
size = Param.Int(64, "TLB size (number of entries)")
|
||||
assoc = Param.Int(64, "TLB associativity")
|
||||
|
||||
if buildEnv['FULL_SYSTEM']:
|
||||
walker = Param.X86PagetableWalker(X86PagetableWalker(),
|
||||
"page table walker")
|
||||
|
||||
hitLatency = Param.Int(2, "Latency of a TLB hit")
|
||||
missLatency1 = Param.Int(5, "Latency #1 of a TLB miss")
|
||||
missLatency2 = Param.Int(100, "Latency #2 of a TLB miss")
|
||||
maxOutstandingReqs = Param.Int(64, "# of maximum outstanding requests")
|
||||
slave = VectorSlavePort("Port on side closer to CPU/CU")
|
||||
master = VectorMasterPort("Port on side closer to memory")
|
||||
allocationPolicy = Param.Bool(True, "Allocate on an access")
|
||||
accessDistance = Param.Bool(False, "print accessDistance stats")
|
||||
|
||||
class TLBCoalescer(MemObject):
|
||||
type = 'TLBCoalescer'
|
||||
cxx_class = 'TLBCoalescer'
|
||||
cxx_header = 'gpu-compute/tlb_coalescer.hh'
|
||||
probesPerCycle = Param.Int(2, "Number of TLB probes per cycle")
|
||||
coalescingWindow = Param.Int(1, "Permit coalescing across that many ticks")
|
||||
slave = VectorSlavePort("Port on side closer to CPU/CU")
|
||||
master = VectorMasterPort("Port on side closer to memory")
|
||||
disableCoalescing = Param.Bool(False, "Disable coalescing")
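The TLB and coalescer declared above are meant to be stacked into a translation hierarchy by the configuration script. A minimal illustrative sketch (not part of this commit; port connections are simplified and parameter values are assumptions):

# Illustrative sketch: a single-level GPU TLB fronted by a coalescer.
# Real configs connect one coalescer slave port per CU translation port.
from m5.objects import *  # assumed gem5 config import

gpu_tlb = X86GPUTLB(size=64, assoc=64, hitLatency=2)
coalescer = TLBCoalescer(probesPerCycle=2, coalescingWindow=1)

# Requests flow CU -> coalescer -> TLB: the coalescer's master side
# probes the TLB's slave side.
coalescer.master = gpu_tlb.slave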
474  src/gpu-compute/brig_object.cc  Normal file
@@ -0,0 +1,474 @@
/*
|
||||
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Steve Reinhardt, Anthony Gutierrez
|
||||
*/
|
||||
|
||||
#include "gpu-compute/brig_object.hh"
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "arch/hsail/Brig.h"
|
||||
#include "base/misc.hh"
|
||||
#include "base/trace.hh"
|
||||
#include "debug/BRIG.hh"
|
||||
#include "debug/HSAILObject.hh"
|
||||
#include "debug/HSALoader.hh"
|
||||
|
||||
using namespace Brig;
|
||||
|
||||
std::vector<std::function<HsaObject*(const std::string&, int, uint8_t*)>>
|
||||
HsaObject::tryFileFuncs = { BrigObject::tryFile };
|
||||
|
||||
extern int getBrigDataTypeBytes(BrigType16_t t);
|
||||
|
||||
const char *BrigObject::sectionNames[] =
|
||||
{
|
||||
"hsa_data",
|
||||
"hsa_code",
|
||||
"hsa_operand",
|
||||
".shstrtab"
|
||||
};
|
||||
|
||||
const char *segmentNames[] =
|
||||
{
|
||||
"none",
|
||||
"flat",
|
||||
"global",
|
||||
"readonly",
|
||||
"kernarg",
|
||||
"group",
|
||||
"private",
|
||||
"spill",
|
||||
"args"
|
||||
};
|
||||
|
||||
const uint8_t*
|
||||
BrigObject::getSectionOffset(enum SectionIndex sec, int offs) const
|
||||
{
|
||||
// allow offs == size for dummy end pointers
|
||||
assert(offs <= sectionInfo[sec].size);
|
||||
|
||||
return sectionInfo[sec].ptr + offs;
|
||||
}
|
||||
|
||||
const char*
|
||||
BrigObject::getString(int offs) const
|
||||
{
|
||||
return (const char*)(getSectionOffset(DataSectionIndex, offs) + 4);
|
||||
}
|
||||
|
||||
const BrigBase*
|
||||
BrigObject::getCodeSectionEntry(int offs) const
|
||||
{
|
||||
return (const BrigBase*)getSectionOffset(CodeSectionIndex, offs);
|
||||
}
|
||||
|
||||
const BrigData*
|
||||
BrigObject::getBrigBaseData(int offs) const
|
||||
{
|
||||
return (Brig::BrigData*)(getSectionOffset(DataSectionIndex, offs));
|
||||
}
|
||||
|
||||
const uint8_t*
|
||||
BrigObject::getData(int offs) const
|
||||
{
|
||||
return getSectionOffset(DataSectionIndex, offs);
|
||||
}
|
||||
|
||||
const BrigOperand*
|
||||
BrigObject::getOperand(int offs) const
|
||||
{
|
||||
return (const BrigOperand*)getSectionOffset(OperandsSectionIndex, offs);
|
||||
}
|
||||
|
||||
unsigned
|
||||
BrigObject::getOperandPtr(int offs, int index) const
|
||||
{
|
||||
unsigned *op_offs = (unsigned*)(getData(offs + 4 * (index + 1)));
|
||||
|
||||
return *op_offs;
|
||||
}
|
||||
|
||||
const BrigInstBase*
|
||||
BrigObject::getInst(int offs) const
|
||||
{
|
||||
return (const BrigInstBase*)getSectionOffset(CodeSectionIndex, offs);
|
||||
}
|
||||
|
||||
HsaCode*
|
||||
BrigObject::getKernel(const std::string &name) const
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
HsaCode*
|
||||
BrigObject::getFunction(const std::string &name) const
|
||||
{
|
||||
for (int i = 0; i < functions.size(); ++i) {
|
||||
if (functions[i]->name() == name) {
|
||||
return functions[i];
|
||||
}
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void
|
||||
BrigObject::processDirectives(const BrigBase *dirPtr, const BrigBase *endPtr,
|
||||
StorageMap *storageMap)
|
||||
{
|
||||
while (dirPtr < endPtr) {
|
||||
if (!dirPtr->byteCount) {
|
||||
fatal("Bad directive size 0\n");
|
||||
}
|
||||
|
||||
// calculate next pointer now so we can override it if needed
|
||||
const BrigBase *nextDirPtr = brigNext(dirPtr);
|
||||
|
||||
DPRINTF(HSAILObject, "Code section entry kind: #%x, byte count: %d\n",
|
||||
dirPtr->kind, dirPtr->byteCount);
|
||||
|
||||
switch (dirPtr->kind) {
|
||||
case BRIG_KIND_DIRECTIVE_FUNCTION:
|
||||
{
|
||||
const BrigDirectiveExecutable *p M5_VAR_USED =
|
||||
reinterpret_cast<const BrigDirectiveExecutable*>(dirPtr);
|
||||
|
||||
DPRINTF(HSAILObject,"DIRECTIVE_FUNCTION: %s offset: "
|
||||
"%d next: %d\n", getString(p->name),
|
||||
p->firstCodeBlockEntry, p->nextModuleEntry);
|
||||
|
||||
if (p->firstCodeBlockEntry != p->nextModuleEntry) {
|
||||
panic("Function calls are not fully supported yet!!: %s\n",
|
||||
getString(p->name));
|
||||
|
||||
const char *name = getString(p->name);
|
||||
|
||||
HsailCode *code_obj = nullptr;
|
||||
|
||||
for (int i = 0; i < functions.size(); ++i) {
|
||||
if (functions[i]->name() == name) {
|
||||
code_obj = functions[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!code_obj) {
|
||||
// create new local storage map for kernel-local symbols
|
||||
code_obj = new HsailCode(name, p, this,
|
||||
new StorageMap(storageMap));
|
||||
functions.push_back(code_obj);
|
||||
} else {
|
||||
panic("Multiple definition of Function!!: %s\n",
|
||||
getString(p->name));
|
||||
}
|
||||
|
||||
}
|
||||
nextDirPtr = getCodeSectionEntry(p->nextModuleEntry);
|
||||
}
|
||||
break;
|
||||
|
||||
case BRIG_KIND_DIRECTIVE_KERNEL:
|
||||
{
|
||||
const BrigDirectiveExecutable *p =
|
||||
reinterpret_cast<const BrigDirectiveExecutable*>(dirPtr);
|
||||
|
||||
DPRINTF(HSAILObject,"DIRECTIVE_KERNEL: %s offset: %d count: "
|
||||
"next: %d\n", getString(p->name),
|
||||
p->firstCodeBlockEntry, p->nextModuleEntry);
|
||||
|
||||
const char *name = getString(p->name);
|
||||
|
||||
if (name[0] == '&')
|
||||
name++;
|
||||
|
||||
std::string str = name;
|
||||
char *temp;
|
||||
int len = str.length();
|
||||
|
||||
if (str[len - 1] >= 'a' && str[len - 1] <= 'z') {
|
||||
temp = new char[str.size() + 1];
|
||||
std::copy(str.begin(), str.end(), temp);
|
||||
temp[str.size()] = '\0';
|
||||
} else {
|
||||
temp = new char[str.size()];
|
||||
std::copy(str.begin(), str.end() - 1, temp);
|
||||
temp[str.size() - 1] = '\0';
|
||||
}
|
||||
|
||||
std::string kernel_name = temp;
|
||||
delete[] temp;
|
||||
|
||||
HsailCode *code_obj = nullptr;
|
||||
|
||||
for (const auto &kernel : kernels) {
|
||||
if (kernel->name() == kernel_name) {
|
||||
code_obj = kernel;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!code_obj) {
|
||||
// create new local storage map for kernel-local symbols
|
||||
code_obj = new HsailCode(kernel_name, p, this,
|
||||
new StorageMap(storageMap));
|
||||
|
||||
kernels.push_back(code_obj);
|
||||
}
|
||||
|
||||
nextDirPtr = getCodeSectionEntry(p->nextModuleEntry);
|
||||
}
|
||||
break;
|
||||
|
||||
case BRIG_KIND_DIRECTIVE_VARIABLE:
|
||||
{
|
||||
const BrigDirectiveVariable *p =
|
||||
reinterpret_cast<const BrigDirectiveVariable*>(dirPtr);
|
||||
|
||||
uint64_t readonlySize_old =
|
||||
storageMap->getSize(BRIG_SEGMENT_READONLY);
|
||||
|
||||
StorageElement* se = storageMap->addSymbol(p, this);
|
||||
|
||||
DPRINTF(HSAILObject, "DIRECTIVE_VARIABLE, symbol %s\n",
|
||||
getString(p->name));
|
||||
|
||||
if (p->segment == BRIG_SEGMENT_READONLY) {
|
||||
// readonly memory has initialization data
|
||||
uint8_t* readonlyData_old = readonlyData;
|
||||
|
||||
readonlyData =
|
||||
new uint8_t[storageMap->getSize(BRIG_SEGMENT_READONLY)];
|
||||
|
||||
if (p->init) {
|
||||
if ((p->type == BRIG_TYPE_ROIMG) ||
|
||||
(p->type == BRIG_TYPE_WOIMG) ||
|
||||
(p->type == BRIG_TYPE_SAMP) ||
|
||||
(p->type == BRIG_TYPE_SIG32) ||
|
||||
(p->type == BRIG_TYPE_SIG64)) {
|
||||
panic("Read only data type not supported: %s\n",
|
||||
getString(p->name));
|
||||
}
|
||||
|
||||
const BrigOperand *brigOp = getOperand(p->init);
|
||||
assert(brigOp->kind ==
|
||||
BRIG_KIND_OPERAND_CONSTANT_BYTES);
|
||||
|
||||
const Brig::BrigData *operand_data M5_VAR_USED =
|
||||
getBrigBaseData(((BrigOperandConstantBytes*)
|
||||
brigOp)->bytes);
|
||||
|
||||
assert((operand_data->byteCount / 4) > 0);
|
||||
|
||||
uint8_t *symbol_data =
|
||||
(uint8_t*)getData(((BrigOperandConstantBytes*)
|
||||
brigOp)->bytes + 4);
|
||||
|
||||
// copy the old data and add the new data
|
||||
if (readonlySize_old > 0) {
|
||||
memcpy(readonlyData, readonlyData_old,
|
||||
readonlySize_old);
|
||||
}
|
||||
|
||||
memcpy(readonlyData + se->offset, symbol_data,
|
||||
se->size);
|
||||
|
||||
delete[] readonlyData_old;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case BRIG_KIND_DIRECTIVE_LABEL:
|
||||
{
|
||||
const BrigDirectiveLabel M5_VAR_USED *p =
|
||||
reinterpret_cast<const BrigDirectiveLabel*>(dirPtr);
|
||||
|
||||
panic("Label directives cannot be at the module level: %s\n",
|
||||
getString(p->name));
|
||||
|
||||
}
|
||||
break;
|
||||
|
||||
case BRIG_KIND_DIRECTIVE_COMMENT:
|
||||
{
|
||||
const BrigDirectiveComment M5_VAR_USED *p =
|
||||
reinterpret_cast<const BrigDirectiveComment*>(dirPtr);
|
||||
|
||||
DPRINTF(HSAILObject, "DIRECTIVE_COMMENT: %s\n",
|
||||
getString(p->name));
|
||||
}
|
||||
break;
|
||||
|
||||
case BRIG_KIND_DIRECTIVE_LOC:
|
||||
{
|
||||
DPRINTF(HSAILObject, "BRIG_DIRECTIVE_LOC\n");
|
||||
}
|
||||
break;
|
||||
|
||||
case BRIG_KIND_DIRECTIVE_MODULE:
|
||||
{
|
||||
const BrigDirectiveModule M5_VAR_USED *p =
|
||||
reinterpret_cast<const BrigDirectiveModule*>(dirPtr);
|
||||
|
||||
DPRINTF(HSAILObject, "BRIG_DIRECTIVE_MODULE: %s\n",
|
||||
getString(p->name));
|
||||
}
|
||||
break;
|
||||
|
||||
case BRIG_KIND_DIRECTIVE_CONTROL:
|
||||
{
|
||||
DPRINTF(HSAILObject, "DIRECTIVE_CONTROL\n");
|
||||
}
|
||||
break;
|
||||
|
||||
case BRIG_KIND_DIRECTIVE_PRAGMA:
|
||||
{
|
||||
DPRINTF(HSAILObject, "DIRECTIVE_PRAGMA\n");
|
||||
}
|
||||
break;
|
||||
|
||||
case BRIG_KIND_DIRECTIVE_EXTENSION:
|
||||
{
|
||||
DPRINTF(HSAILObject, "DIRECTIVE_EXTENSION\n");
|
||||
}
|
||||
break;
|
||||
|
||||
case BRIG_KIND_DIRECTIVE_ARG_BLOCK_START:
|
||||
{
|
||||
DPRINTF(HSAILObject, "DIRECTIVE_ARG_BLOCK_START\n");
|
||||
}
|
||||
break;
|
||||
|
||||
case BRIG_KIND_DIRECTIVE_ARG_BLOCK_END:
|
||||
{
|
||||
DPRINTF(HSAILObject, "DIRECTIVE_ARG_BLOCK_END\n");
|
||||
}
|
||||
break;
|
||||
default:
|
||||
if (dirPtr->kind >= BRIG_KIND_INST_BEGIN &&
|
||||
dirPtr->kind <= BRIG_KIND_INST_END)
|
||||
break;
|
||||
|
||||
if (dirPtr->kind >= BRIG_KIND_OPERAND_BEGIN &&
|
||||
dirPtr->kind <= BRIG_KIND_OPERAND_END)
|
||||
break;
|
||||
|
||||
warn("Unknown Brig directive kind: %d\n", dirPtr->kind);
|
||||
break;
|
||||
}
|
||||
|
||||
dirPtr = nextDirPtr;
|
||||
}
|
||||
}
|
||||
|
||||
HsaObject*
|
||||
BrigObject::tryFile(const std::string &fname, int len, uint8_t *fileData)
|
||||
{
|
||||
const char *brig_ident = "HSA BRIG";
|
||||
|
||||
if (memcmp(brig_ident, fileData, MODULE_IDENTIFICATION_LENGTH))
|
||||
return nullptr;
|
||||
|
||||
return new BrigObject(fname, len, fileData);
|
||||
}
|
||||
|
||||
BrigObject::BrigObject(const std::string &fname, int len, uint8_t *fileData)
|
||||
: HsaObject(fname), storageMap(new StorageMap())
|
||||
{
|
||||
const char *brig_ident = "HSA BRIG";
|
||||
BrigModuleHeader *mod_hdr = (BrigModuleHeader*)fileData;
|
||||
|
||||
fatal_if(memcmp(brig_ident, mod_hdr, MODULE_IDENTIFICATION_LENGTH),
|
||||
"%s is not a BRIG file\n", fname);
|
||||
|
||||
if (mod_hdr->brigMajor != BRIG_VERSION_BRIG_MAJOR ||
|
||||
mod_hdr->brigMinor != BRIG_VERSION_BRIG_MINOR) {
|
||||
fatal("%s: BRIG version mismatch, %d.%d != %d.%d\n",
|
||||
fname, mod_hdr->brigMajor, mod_hdr->brigMinor,
|
||||
BRIG_VERSION_BRIG_MAJOR, BRIG_VERSION_BRIG_MINOR);
|
||||
}
|
||||
|
||||
fatal_if(mod_hdr->sectionCount != NumSectionIndices, "%s: BRIG section "
|
||||
"count (%d) != expected value (%d)\n", fname,
|
||||
mod_hdr->sectionCount, NumSectionIndices);
|
||||
|
||||
for (int i = 0; i < NumSectionIndices; ++i) {
|
||||
sectionInfo[i].ptr = nullptr;
|
||||
}
|
||||
|
||||
uint64_t *sec_idx_table = (uint64_t*)(fileData + mod_hdr->sectionIndex);
|
||||
for (int sec_idx = 0; sec_idx < mod_hdr->sectionCount; ++sec_idx) {
|
||||
uint8_t *sec_hdr_byte_ptr = fileData + sec_idx_table[sec_idx];
|
||||
BrigSectionHeader *sec_hdr = (BrigSectionHeader*)sec_hdr_byte_ptr;
|
||||
|
||||
// It doesn't look like cprintf supports string precision values,
|
||||
// but if this breaks, the right answer is to fix that
|
||||
DPRINTF(HSAILObject, "found section %.*s\n", sec_hdr->nameLength,
|
||||
sec_hdr->name);
|
||||
|
||||
sectionInfo[sec_idx].ptr = new uint8_t[sec_hdr->byteCount];
|
||||
memcpy(sectionInfo[sec_idx].ptr, sec_hdr_byte_ptr, sec_hdr->byteCount);
|
||||
sectionInfo[sec_idx].size = sec_hdr->byteCount;
|
||||
}
|
||||
|
||||
BrigSectionHeader *code_hdr =
|
||||
(BrigSectionHeader*)sectionInfo[CodeSectionIndex].ptr;
|
||||
|
||||
DPRINTF(HSAILObject, "Code section hdr, count: %d, hdr count: %d, "
|
||||
"name len: %d\n", code_hdr->byteCount, code_hdr->headerByteCount,
|
||||
code_hdr->nameLength);
|
||||
|
||||
// start at offset 4 to skip initial null entry (see Brig spec)
|
||||
processDirectives(getCodeSectionEntry(code_hdr->headerByteCount),
|
||||
getCodeSectionEntry(sectionInfo[CodeSectionIndex].size),
|
||||
storageMap);
|
||||
|
||||
delete[] fileData;
|
||||
|
||||
DPRINTF(HSALoader, "BRIG object %s loaded.\n", fname);
|
||||
}
|
||||
|
||||
BrigObject::~BrigObject()
|
||||
{
|
||||
for (int i = 0; i < NumSectionIndices; ++i)
|
||||
if (sectionInfo[i].ptr)
|
||||
delete[] sectionInfo[i].ptr;
|
||||
}
134  src/gpu-compute/brig_object.hh  Normal file
@@ -0,0 +1,134 @@
/*
|
||||
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Steve Reinhardt, Anthony Gutierrez
|
||||
*/
|
||||
|
||||
#ifndef __BRIG_OBJECT_HH__
|
||||
#define __BRIG_OBJECT_HH__
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arch/hsail/Brig.h"
|
||||
#include "gpu-compute/hsa_object.hh"
|
||||
#include "gpu-compute/hsail_code.hh"
|
||||
|
||||
class LabelMap;
|
||||
class StorageMap;
|
||||
|
||||
/**
 * @class BrigObject
 *
 * This class implements the BRIG loader object and is used when the
 * simulator directly executes HSAIL. It is responsible for extracting all
 * information about the kernels contained in a BRIG file and for
 * converting them to HsailCode objects that are usable by the simulator
 * and the emulated runtime.
 */
|
||||
|
||||
class BrigObject final : public HsaObject
|
||||
{
|
||||
public:
|
||||
enum SectionIndex
|
||||
{
|
||||
DataSectionIndex,
|
||||
CodeSectionIndex,
|
||||
OperandsSectionIndex,
|
||||
NumSectionIndices
|
||||
};
|
||||
|
||||
static const char *sectionNames[];
|
||||
|
||||
struct SectionInfo
|
||||
{
|
||||
uint8_t *ptr;
|
||||
int size;
|
||||
};
|
||||
|
||||
static HsaObject* tryFile(const std::string &fname, int len,
|
||||
uint8_t *fileData);
|
||||
|
||||
SectionInfo sectionInfo[NumSectionIndices];
|
||||
const uint8_t *getSectionOffset(enum SectionIndex sec, int offs) const;
|
||||
|
||||
std::vector<HsailCode*> kernels;
|
||||
std::vector<HsailCode*> functions;
|
||||
std::string kern_block_name;
|
||||
|
||||
void processDirectives(const Brig::BrigBase *dirPtr,
|
||||
const Brig::BrigBase *endPtr,
|
||||
StorageMap *storageMap);
|
||||
|
||||
BrigObject(const std::string &fname, int len, uint8_t *fileData);
|
||||
~BrigObject();
|
||||
|
||||
// eventually these will need to be per-kernel not per-object-file
|
||||
StorageMap *storageMap;
|
||||
LabelMap *labelMap;
|
||||
|
||||
const char* getString(int offs) const;
|
||||
const Brig::BrigData* getBrigBaseData(int offs) const;
|
||||
const uint8_t* getData(int offs) const;
|
||||
const Brig::BrigBase* getCodeSectionEntry(int offs) const;
|
||||
const Brig::BrigOperand* getOperand(int offs) const;
|
||||
unsigned getOperandPtr(int offs, int index) const;
|
||||
const Brig::BrigInstBase* getInst(int offs) const;
|
||||
|
||||
HsaCode* getKernel(const std::string &name) const override;
|
||||
HsaCode* getFunction(const std::string &name) const override;
|
||||
|
||||
int numKernels() const override { return kernels.size(); }
|
||||
|
||||
HsaCode* getKernel(int i) const override { return kernels[i]; }
|
||||
|
||||
// pointer to the current kernel/function we're processing, so elements
|
||||
// under construction can reference it. kinda ugly, but easier
|
||||
// than passing it all over for the few places it's needed.
|
||||
mutable HsailCode *currentCode;
|
||||
};
|
||||
|
||||
// Utility function to bump Brig item pointer to next element given
|
||||
// item size in bytes. Really just an add but with lots of casting.
|
||||
template<typename T>
|
||||
T*
|
||||
brigNext(T *ptr)
|
||||
{
|
||||
Brig::BrigBase *base_ptr = (Brig::BrigBase*)ptr;
|
||||
int size = base_ptr->byteCount;
|
||||
assert(size);
|
||||
|
||||
return (T*)((uint8_t*)ptr + size);
|
||||
}
|
||||
|
||||
#endif // __BRIG_OBJECT_HH__
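brigNext() above advances through a BRIG section purely by each entry's byteCount, which is the traversal that processDirectives() in brig_object.cc relies on. A hedged sketch of the same pattern over a section held as raw bytes; the 16-bit little-endian byteCount layout is assumed from the BRIG headers rather than spelled out in this commit.

# Hedged sketch of a brigNext()-style walk over a code section. Assumes each
# entry begins with a 16-bit byteCount, as in Brig::BrigBase; offsets are
# relative to the start of 'section'.
def walk_code_section(section, start, end):
    offs = start
    while offs < end:
        byte_count = int.from_bytes(section[offs:offs + 2], 'little')
        if byte_count == 0:
            raise ValueError("bad directive size 0")
        yield offs, byte_count
        offs += byte_count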
272  src/gpu-compute/cl_driver.cc  Normal file
@@ -0,0 +1,272 @@
/*
|
||||
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Anthony Gutierrez
|
||||
*/
|
||||
|
||||
#include "gpu-compute/cl_driver.hh"
|
||||
|
||||
#include "base/intmath.hh"
|
||||
#include "cpu/thread_context.hh"
|
||||
#include "gpu-compute/dispatcher.hh"
|
||||
#include "gpu-compute/hsa_code.hh"
|
||||
#include "gpu-compute/hsa_kernel_info.hh"
|
||||
#include "gpu-compute/hsa_object.hh"
|
||||
#include "params/ClDriver.hh"
|
||||
#include "sim/process.hh"
|
||||
#include "sim/syscall_emul_buf.hh"
|
||||
|
||||
ClDriver::ClDriver(ClDriverParams *p)
|
||||
: EmulatedDriver(p), hsaCode(0)
|
||||
{
|
||||
for (const auto &codeFile : p->codefile)
|
||||
codeFiles.push_back(&codeFile);
|
||||
|
||||
maxFuncArgsSize = 0;
|
||||
|
||||
for (int i = 0; i < codeFiles.size(); ++i) {
|
||||
HsaObject *obj = HsaObject::createHsaObject(*codeFiles[i]);
|
||||
|
||||
for (int k = 0; k < obj->numKernels(); ++k) {
|
||||
assert(obj->getKernel(k));
|
||||
kernels.push_back(obj->getKernel(k));
|
||||
kernels.back()->setReadonlyData((uint8_t*)obj->readonlyData);
|
||||
int kern_funcargs_size = kernels.back()->funcarg_size;
|
||||
maxFuncArgsSize = maxFuncArgsSize < kern_funcargs_size ?
|
||||
kern_funcargs_size : maxFuncArgsSize;
|
||||
}
|
||||
}
|
||||
|
||||
int name_offs = 0;
|
||||
int code_offs = 0;
|
||||
|
||||
for (int i = 0; i < kernels.size(); ++i) {
|
||||
kernelInfo.push_back(HsaKernelInfo());
|
||||
HsaCode *k = kernels[i];
|
||||
|
||||
k->generateHsaKernelInfo(&kernelInfo[i]);
|
||||
|
||||
kernelInfo[i].name_offs = name_offs;
|
||||
kernelInfo[i].code_offs = code_offs;
|
||||
|
||||
name_offs += k->name().size() + 1;
|
||||
code_offs += k->numInsts() * sizeof(GPUStaticInst*);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ClDriver::handshake(GpuDispatcher *_dispatcher)
|
||||
{
|
||||
dispatcher = _dispatcher;
|
||||
dispatcher->setFuncargsSize(maxFuncArgsSize);
|
||||
}
|
||||
|
||||
int
|
||||
ClDriver::open(LiveProcess *p, ThreadContext *tc, int mode, int flags)
|
||||
{
|
||||
int fd = p->allocFD(-1, filename, 0, 0, false);
|
||||
FDEntry *fde = p->getFDEntry(fd);
|
||||
fde->driver = this;
|
||||
|
||||
return fd;
|
||||
}
|
||||
|
||||
int
|
||||
ClDriver::ioctl(LiveProcess *process, ThreadContext *tc, unsigned req)
|
||||
{
|
||||
int index = 2;
|
||||
Addr buf_addr = process->getSyscallArg(tc, index);
|
||||
|
||||
switch (req) {
|
||||
case HSA_GET_SIZES:
|
||||
{
|
||||
TypedBufferArg<HsaDriverSizes> sizes(buf_addr);
|
||||
sizes->num_kernels = kernels.size();
|
||||
sizes->string_table_size = 0;
|
||||
sizes->code_size = 0;
|
||||
sizes->readonly_size = 0;
|
||||
|
||||
if (kernels.size() > 0) {
|
||||
// all kernels will share the same read-only memory
|
||||
sizes->readonly_size =
|
||||
kernels[0]->getSize(HsaCode::MemorySegment::READONLY);
|
||||
// check our assumption
|
||||
for (int i = 1; i < kernels.size(); ++i) {
|
||||
assert(sizes->readonly_size ==
|
||||
kernels[i]->getSize(HsaCode::MemorySegment::READONLY));
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < kernels.size(); ++i) {
|
||||
HsaCode *k = kernels[i];
|
||||
// add one for terminating '\0'
|
||||
sizes->string_table_size += k->name().size() + 1;
|
||||
sizes->code_size += k->numInsts() * sizeof(GPUStaticInst*);
|
||||
}
|
||||
|
||||
sizes.copyOut(tc->getMemProxy());
|
||||
}
|
||||
break;
|
||||
|
||||
case HSA_GET_KINFO:
|
||||
{
|
||||
TypedBufferArg<HsaKernelInfo>
|
||||
kinfo(buf_addr, sizeof(HsaKernelInfo) * kernels.size());
|
||||
|
||||
for (int i = 0; i < kernels.size(); ++i) {
|
||||
HsaKernelInfo *ki = &kinfo[i];
|
||||
ki->name_offs = kernelInfo[i].name_offs;
|
||||
ki->code_offs = kernelInfo[i].code_offs;
|
||||
ki->sRegCount = kernelInfo[i].sRegCount;
|
||||
ki->dRegCount = kernelInfo[i].dRegCount;
|
||||
ki->cRegCount = kernelInfo[i].cRegCount;
|
||||
ki->static_lds_size = kernelInfo[i].static_lds_size;
|
||||
ki->private_mem_size = kernelInfo[i].private_mem_size;
|
||||
ki->spill_mem_size = kernelInfo[i].spill_mem_size;
|
||||
}
|
||||
|
||||
kinfo.copyOut(tc->getMemProxy());
|
||||
}
|
||||
break;
|
||||
|
||||
case HSA_GET_STRINGS:
|
||||
{
|
||||
int string_table_size = 0;
|
||||
for (int i = 0; i < kernels.size(); ++i) {
|
||||
HsaCode *k = kernels[i];
|
||||
string_table_size += k->name().size() + 1;
|
||||
}
|
||||
|
||||
BufferArg buf(buf_addr, string_table_size);
|
||||
char *bufp = (char*)buf.bufferPtr();
|
||||
|
||||
for (int i = 0; i < kernels.size(); ++i) {
|
||||
HsaCode *k = kernels[i];
|
||||
const char *n = k->name().c_str();
|
||||
|
||||
// idiomatic string copy
|
||||
while ((*bufp++ = *n++));
|
||||
}
|
||||
|
||||
assert(bufp - (char *)buf.bufferPtr() == string_table_size);
|
||||
|
||||
buf.copyOut(tc->getMemProxy());
|
||||
}
|
||||
break;
|
||||
|
||||
case HSA_GET_READONLY_DATA:
|
||||
{
|
||||
// we can pick any kernel --- they share the same
|
||||
// readonly segment (this assumption is checked in GET_SIZES)
|
||||
uint64_t size =
|
||||
kernels.back()->getSize(HsaCode::MemorySegment::READONLY);
|
||||
BufferArg data(buf_addr, size);
|
||||
char *datap = (char *)data.bufferPtr();
|
||||
memcpy(datap,
|
||||
kernels.back()->readonly_data,
|
||||
size);
|
||||
data.copyOut(tc->getMemProxy());
|
||||
}
|
||||
break;
|
||||
|
||||
case HSA_GET_CODE:
|
||||
{
|
||||
// set hsaCode pointer
|
||||
hsaCode = buf_addr;
|
||||
int code_size = 0;
|
||||
|
||||
for (int i = 0; i < kernels.size(); ++i) {
|
||||
HsaCode *k = kernels[i];
|
||||
code_size += k->numInsts() * sizeof(TheGpuISA::RawMachInst);
|
||||
}
|
||||
|
||||
TypedBufferArg<TheGpuISA::RawMachInst> buf(buf_addr, code_size);
|
||||
TheGpuISA::RawMachInst *bufp = buf;
|
||||
|
||||
int buf_idx = 0;
|
||||
|
||||
for (int i = 0; i < kernels.size(); ++i) {
|
||||
HsaCode *k = kernels[i];
|
||||
|
||||
for (int j = 0; j < k->numInsts(); ++j) {
|
||||
bufp[buf_idx] = k->insts()->at(j);
|
||||
++buf_idx;
|
||||
}
|
||||
}
|
||||
|
||||
buf.copyOut(tc->getMemProxy());
|
||||
}
|
||||
break;
|
||||
|
||||
case HSA_GET_CU_CNT:
|
||||
{
|
||||
BufferArg buf(buf_addr, sizeof(uint32_t));
|
||||
*((uint32_t*)buf.bufferPtr()) = dispatcher->getNumCUs();
|
||||
buf.copyOut(tc->getMemProxy());
|
||||
}
|
||||
break;
|
||||
|
||||
case HSA_GET_VSZ:
|
||||
{
|
||||
BufferArg buf(buf_addr, sizeof(uint32_t));
|
||||
*((uint32_t*)buf.bufferPtr()) = VSZ;
|
||||
buf.copyOut(tc->getMemProxy());
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
fatal("ClDriver: bad ioctl %d\n", req);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
const char*
|
||||
ClDriver::codeOffToKernelName(uint64_t code_ptr)
|
||||
{
|
||||
assert(hsaCode);
|
||||
uint32_t code_offs = code_ptr - hsaCode;
|
||||
|
||||
for (int i = 0; i < kernels.size(); ++i) {
|
||||
if (code_offs == kernelInfo[i].code_offs) {
|
||||
return kernels[i]->name().c_str();
|
||||
}
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
ClDriver*
|
||||
ClDriverParams::create()
|
||||
{
|
||||
return new ClDriver(this);
|
||||
}
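ClDriver is an EmulatedDriver, so a configuration script exposes it to the emulated process and hands it to the dispatcher. An illustrative sketch only (not part of this commit): the device file name, the BRIG file, and the process/shader/CPU wiring are all assumptions taken from the surrounding script.

# Illustrative sketch: attaching the CL driver and dispatcher in a config.
from m5.objects import *  # assumed gem5 config import

driver = ClDriver(filename='hsa', codefile=['vector_add.brig'])
dispatcher = GpuDispatcher(cl_driver=driver, shader_pointer=shader,
                           cpu=cpu0)       # 'shader' and 'cpu0' assumed
process.drivers = [driver]                 # assumed LiveProcess parameter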
77  src/gpu-compute/cl_driver.hh  Normal file
@@ -0,0 +1,77 @@
/*
|
||||
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Anthony Gutierrez
|
||||
*/
|
||||
|
||||
#ifndef __CL_DRIVER_HH__
|
||||
#define __CL_DRIVER_HH__
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "gpu-compute/hsa_kernel_info.hh"
|
||||
#include "sim/emul_driver.hh"
|
||||
|
||||
class GpuDispatcher;
|
||||
class HsaCode;
|
||||
class LiveProcess;
|
||||
class ThreadContext;
|
||||
|
||||
struct ClDriverParams;
|
||||
|
||||
class ClDriver final : public EmulatedDriver
|
||||
{
|
||||
public:
|
||||
ClDriver(ClDriverParams *p);
|
||||
void handshake(GpuDispatcher *_dispatcher);
|
||||
int open(LiveProcess *p, ThreadContext *tc, int mode, int flags);
|
||||
int ioctl(LiveProcess *p, ThreadContext *tc, unsigned req);
|
||||
const char* codeOffToKernelName(uint64_t code_ptr);
|
||||
|
||||
private:
|
||||
GpuDispatcher *dispatcher;
|
||||
|
||||
std::vector<const std::string*> codeFiles;
|
||||
|
||||
// All the kernels we know about
|
||||
std::vector<HsaCode*> kernels;
|
||||
std::vector<HsaCode*> functions;
|
||||
|
||||
std::vector<HsaKernelInfo> kernelInfo;
|
||||
|
||||
// maximum size necessary for function arguments
|
||||
int maxFuncArgsSize;
|
||||
// The host virtual address for the kernel code
|
||||
uint64_t hsaCode;
|
||||
};
|
||||
|
||||
#endif // __CL_DRIVER_HH__
51  src/gpu-compute/cl_event.hh  Normal file
@@ -0,0 +1,51 @@
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * [standard BSD-style license text, identical in every new file in this change]
 *
 * Authors: Marc Orr
 */

#ifndef __GPU_CL_EVENT_HH__
#define __GPU_CL_EVENT_HH__

struct HsaQueueEntry;

class _cl_event {
  public:
    _cl_event() : done(false), hsaTaskPtr(nullptr), start(0), end(0) { }

    volatile bool done;
    HsaQueueEntry *hsaTaskPtr;
    uint64_t start;
    uint64_t end;
};

#endif // __GPU_CL_EVENT_HH__
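The dispatcher fills start when it launches a kernel and end when the last workgroup completes, both computed as curTick() / 1000, i.e. in nanoseconds. A minimal sketch of how host-side code holding one of these events could derive the kernel's duration (the helper name is illustrative, not part of this change):

    #include <cstdint>
    #include "gpu-compute/cl_event.hh"

    // Hypothetical helper: elapsed kernel time in nanoseconds, valid once the
    // dispatcher has written both timestamps (each stored as curTick() / 1000).
    static uint64_t
    kernelElapsedNs(const _cl_event &ev)
    {
        return ev.end - ev.start;
    }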
116 src/gpu-compute/code_enums.hh Normal file
@@ -0,0 +1,116 @@
/*
 * Copyright (c) 2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * [standard BSD-style license text, identical in every new file in this change]
 *
 * Author: Anthony Gutierrez
 */

#ifndef __CODE_ENUMS_HH__
#define __CODE_ENUMS_HH__

#define IS_OT_GLOBAL(a) ((a)>=Enums::OT_GLOBAL_READ \
                      && (a)<=Enums::OT_GLOBAL_LDAS)
#define IS_OT_SHARED(a) ((a)>=Enums::OT_SHARED_READ \
                      && (a)<=Enums::OT_SHARED_LDAS)
#define IS_OT_PRIVATE(a) ((a)>=Enums::OT_PRIVATE_READ \
                       && (a)<=Enums::OT_PRIVATE_LDAS)
#define IS_OT_SPILL(a) ((a)>=Enums::OT_SPILL_READ \
                     && (a)<=Enums::OT_SPILL_LDAS)
#define IS_OT_READONLY(a) ((a)>=Enums::OT_READONLY_READ \
                        && (a)<=Enums::OT_READONLY_LDAS)
#define IS_OT_FLAT(a) ((a)>=Enums::OT_FLAT_READ && (a)<=Enums::OT_FLAT_LDAS)

#define IS_OT_LDAS(a) ((a)==Enums::OT_GLOBAL_LDAS||(a)==Enums::OT_SHARED_LDAS \
                     ||(a)==Enums::OT_PRIVATE_LDAS||(a)==Enums::OT_SPILL_LDAS \
                     ||(a)==Enums::OT_READONLY_LDAS||(a)==Enums::OT_FLAT_LDAS)

#define IS_OT_READ(a) ((a)==Enums::OT_GLOBAL_READ||(a)==Enums::OT_SHARED_READ \
                     ||(a)==Enums::OT_PRIVATE_READ||(a)==Enums::OT_SPILL_READ \
                     ||(a)==Enums::OT_READONLY_READ||(a)==Enums::OT_FLAT_READ)

#define IS_OT_READ_GM(a) \
    ((a)==Enums::OT_GLOBAL_READ||(a)==Enums::OT_SPILL_READ \
   ||(a)==Enums::OT_READONLY_READ)

#define IS_OT_READ_LM(a) ((a)==Enums::OT_SHARED_READ)

#define IS_OT_READ_RM(a) ((a)==Enums::OT_READONLY_READ)

#define IS_OT_READ_PM(a) ((a)==Enums::OT_PRIVATE_READ)

#define IS_OT_WRITE(a) \
    ((a)==Enums::OT_GLOBAL_WRITE||(a)==Enums::OT_SHARED_WRITE \
   ||(a)==Enums::OT_PRIVATE_WRITE||(a)==Enums::OT_SPILL_WRITE \
   ||(a)==Enums::OT_READONLY_WRITE||(a)==Enums::OT_FLAT_WRITE)

#define IS_OT_WRITE_GM(a) \
    ((a)==Enums::OT_GLOBAL_WRITE||(a)==Enums::OT_SPILL_WRITE \
   ||(a)==Enums::OT_READONLY_WRITE)

#define IS_OT_WRITE_LM(a) ((a)==Enums::OT_SHARED_WRITE)

#define IS_OT_WRITE_PM(a) ((a)==Enums::OT_PRIVATE_WRITE)

#define IS_OT_ATOMIC(a) ((a)==Enums::OT_GLOBAL_ATOMIC \
                       ||(a)==Enums::OT_SHARED_ATOMIC \
                       ||(a)==Enums::OT_PRIVATE_ATOMIC \
                       ||(a)==Enums::OT_SPILL_ATOMIC \
                       ||(a)==Enums::OT_READONLY_ATOMIC \
                       ||(a)==Enums::OT_FLAT_ATOMIC)

#define IS_OT_ATOMIC_GM(a) ((a)==Enums::OT_GLOBAL_ATOMIC \
                          ||(a)==Enums::OT_SPILL_ATOMIC \
                          ||(a)==Enums::OT_READONLY_ATOMIC \
                          ||(a)==Enums::OT_GLOBAL_MEMFENCE \
                          ||(a)==Enums::OT_BOTH_MEMFENCE)

#define IS_OT_ATOMIC_LM(a) ((a)==Enums::OT_SHARED_ATOMIC \
                          ||(a)==Enums::OT_SHARED_MEMFENCE \
                          ||(a)==Enums::OT_BOTH_MEMFENCE)

#define IS_OT_ATOMIC_PM(a) ((a)==Enums::OT_PRIVATE_ATOMIC)

#define IS_OT_HIST(a) ((a)==Enums::OT_GLOBAL_HIST \
                     ||(a)==Enums::OT_SHARED_HIST \
                     ||(a)==Enums::OT_PRIVATE_HIST \
                     ||(a)==Enums::OT_SPILL_HIST \
                     ||(a)==Enums::OT_READONLY_HIST \
                     ||(a)==Enums::OT_FLAT_HIST)

#define IS_OT_HIST_GM(a) ((a)==Enums::OT_GLOBAL_HIST \
                        ||(a)==Enums::OT_SPILL_HIST \
                        ||(a)==Enums::OT_READONLY_HIST)

#define IS_OT_HIST_LM(a) ((a)==Enums::OT_SHARED_HIST)

#define IS_OT_HIST_PM(a) ((a)==Enums::OT_PRIVATE_HIST)

#endif // __CODE_ENUMS_HH__
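These classifiers are simple range and equality checks over the generated Enums::OpType values, so pipeline code can route an operation without enumerating every OT_* case. A minimal sketch of how such a helper might look (the function pickMemPipe and the "enums/OpType.hh" include path are assumptions for illustration, not part of this change):

    #include "enums/OpType.hh"            // generated OpType enum (assumed path)
    #include "gpu-compute/code_enums.hh"

    // Hypothetical helper: choose the memory pipeline that services an op.
    enum class MemPipe { Global, Local, Private, None };

    static MemPipe
    pickMemPipe(Enums::OpType op)
    {
        if (IS_OT_READ_GM(op) || IS_OT_WRITE_GM(op) || IS_OT_ATOMIC_GM(op))
            return MemPipe::Global;       // global/spill/readonly traffic
        if (IS_OT_READ_LM(op) || IS_OT_WRITE_LM(op) || IS_OT_ATOMIC_LM(op))
            return MemPipe::Local;        // shared-memory (LDS) traffic
        if (IS_OT_READ_PM(op) || IS_OT_WRITE_PM(op) || IS_OT_ATOMIC_PM(op))
            return MemPipe::Private;
        return MemPipe::None;             // non-memory and flat ops handled elsewhere
    }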
1817 src/gpu-compute/compute_unit.cc Normal file
File diff suppressed because it is too large
767 src/gpu-compute/compute_unit.hh Normal file
@@ -0,0 +1,767 @@
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * [standard BSD-style license text, identical in every new file in this change]
 *
 * Author: John Kalamatianos, Anthony Gutierrez
 */

#ifndef __COMPUTE_UNIT_HH__
#define __COMPUTE_UNIT_HH__

#include <deque>
#include <map>
#include <unordered_map>
#include <vector>

#include "base/callback.hh"
#include "base/statistics.hh"
#include "base/types.hh"
#include "enums/PrefetchType.hh"
#include "gpu-compute/exec_stage.hh"
#include "gpu-compute/fetch_stage.hh"
#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/local_memory_pipeline.hh"
#include "gpu-compute/qstruct.hh"
#include "gpu-compute/schedule_stage.hh"
#include "gpu-compute/scoreboard_check_stage.hh"
#include "mem/mem_object.hh"
#include "mem/port.hh"

static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1;
static const int MAX_WIDTH_FOR_MEM_INST = 32;

class NDRange;
class Shader;
class VectorRegisterFile;

struct ComputeUnitParams;

enum EXEC_POLICY
{
    OLDEST = 0,
    RR
};

// List of execution units
enum EXEC_UNIT
{
    SIMD0 = 0,
    SIMD1,
    SIMD2,
    SIMD3,
    GLBMEM_PIPE,
    LDSMEM_PIPE,
    NUM_UNITS
};

enum TLB_CACHE
{
    TLB_MISS_CACHE_MISS = 0,
    TLB_MISS_CACHE_HIT,
    TLB_HIT_CACHE_MISS,
    TLB_HIT_CACHE_HIT
};

class ComputeUnit : public MemObject
|
||||
{
|
||||
public:
|
||||
FetchStage fetchStage;
|
||||
ScoreboardCheckStage scoreboardCheckStage;
|
||||
ScheduleStage scheduleStage;
|
||||
ExecStage execStage;
|
||||
GlobalMemPipeline globalMemoryPipe;
|
||||
LocalMemPipeline localMemoryPipe;
|
||||
|
||||
// Buffers used to communicate between various pipeline stages
|
||||
|
||||
// List of waves which are ready to be scheduled.
|
||||
// Each execution resource has a ready list. readyList is
|
||||
// used to communicate between scoreboardCheck stage and
|
||||
// schedule stage
|
||||
// TODO: make enum to index readyList
|
||||
std::vector<std::vector<Wavefront*>> readyList;
|
||||
|
||||
// Stores the status of waves. A READY implies the
|
||||
// wave is ready to be scheduled this cycle and
|
||||
// is already present in the readyList. waveStatusList is
|
||||
// used to communicate between scoreboardCheck stage and
|
||||
// schedule stage
|
||||
// TODO: convert std::pair to a class to increase readability
|
||||
std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList;
|
||||
|
||||
// List of waves which will be dispatched to
|
||||
// each execution resource. A FILLED implies
|
||||
// dispatch list is non-empty and
|
||||
// execution unit has something to execute
|
||||
// this cycle. Currently, the dispatch list of
|
||||
// an execution resource can hold only one wave because
|
||||
// an execution resource can execute only one wave in a cycle.
|
||||
// dispatchList is used to communicate between schedule
|
||||
// and exec stage
|
||||
// TODO: convert std::pair to a class to increase readability
|
||||
std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList;
|
||||
|
||||
int rrNextMemID; // used by RR WF exec policy to cycle through WF's
|
||||
int rrNextALUWp;
|
||||
typedef ComputeUnitParams Params;
|
||||
std::vector<std::vector<Wavefront*>> wfList;
|
||||
int cu_id;
|
||||
|
||||
// array of vector register files, one per SIMD
|
||||
std::vector<VectorRegisterFile*> vrf;
|
||||
// Number of vector ALU units (SIMDs) in CU
|
||||
int numSIMDs;
|
||||
// number of pipe stages for bypassing data to next dependent single
|
||||
// precision vector instruction inside the vector ALU pipeline
|
||||
int spBypassPipeLength;
|
||||
// number of pipe stages for bypassing data to next dependent double
|
||||
// precision vector instruction inside the vector ALU pipeline
|
||||
int dpBypassPipeLength;
|
||||
// number of cycles per issue period
|
||||
int issuePeriod;
|
||||
|
||||
// Number of global and local memory execution resources in CU
|
||||
int numGlbMemUnits;
|
||||
int numLocMemUnits;
|
||||
// tracks the last cycle a vector instruction was executed on a SIMD
|
||||
std::vector<uint64_t> lastExecCycle;
|
||||
|
||||
// true if we allow a separate TLB per lane
|
||||
bool perLaneTLB;
|
||||
// if 0, TLB prefetching is off.
|
||||
int prefetchDepth;
|
||||
// if fixed-stride prefetching, this is the stride.
|
||||
int prefetchStride;
|
||||
|
||||
class LastVaddrWave
|
||||
{
|
||||
public:
|
||||
Addr vaddrs[VSZ];
|
||||
Addr& operator[](int idx) {
|
||||
return vaddrs[idx];
|
||||
}
|
||||
|
||||
LastVaddrWave() {
|
||||
for (int i = 0; i < VSZ; ++i)
|
||||
vaddrs[i] = 0;
|
||||
}
|
||||
};
|
||||
|
||||
LastVaddrWave lastVaddrCU;
|
||||
std::vector<LastVaddrWave> lastVaddrPhase;
|
||||
std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
|
||||
Enums::PrefetchType prefetchType;
|
||||
EXEC_POLICY exec_policy;
|
||||
|
||||
bool xact_cas_mode;
|
||||
bool debugSegFault;
|
||||
bool functionalTLB;
|
||||
bool localMemBarrier;
|
||||
|
||||
/*
|
||||
* for Counting page accesses
|
||||
*
|
||||
* cuExitCallback inherits from Callback. When you register a callback
|
||||
* function as an exit callback, it will get added to an exit callback
|
||||
* queue, such that on simulation exit, all callbacks in the callback
|
||||
* queue will have their process() function called.
|
||||
*/
|
||||
bool countPages;
|
||||
|
||||
Shader *shader;
|
||||
uint32_t barrier_id;
|
||||
// vector of Vector ALU (MACC) pipelines
|
||||
std::vector<WaitClass> aluPipe;
|
||||
// minimum issue period per SIMD unit (in cycles)
|
||||
std::vector<WaitClass> wfWait;
|
||||
|
||||
// Resource control for Vector Register File->Global Memory pipe buses
|
||||
std::vector<WaitClass> vrfToGlobalMemPipeBus;
|
||||
// Resource control for Vector Register File->Local Memory pipe buses
|
||||
std::vector<WaitClass> vrfToLocalMemPipeBus;
|
||||
int nextGlbMemBus;
|
||||
int nextLocMemBus;
|
||||
// Resource control for global memory to VRF data/address bus
|
||||
WaitClass glbMemToVrfBus;
|
||||
// Resource control for local memory to VRF data/address bus
|
||||
WaitClass locMemToVrfBus;
|
||||
|
||||
uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
|
||||
uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
|
||||
uint32_t numCyclesPerStoreTransfer; // number of cycles per vector store
|
||||
uint32_t numCyclesPerLoadTransfer; // number of cycles per vector load
|
||||
|
||||
Tick req_tick_latency;
|
||||
Tick resp_tick_latency;
|
||||
|
||||
// number of vector registers being reserved for each SIMD unit
|
||||
std::vector<int> vectorRegsReserved;
|
||||
// number of vector registers per SIMD unit
|
||||
uint32_t numVecRegsPerSimd;
|
||||
// Support for scheduling VGPR status update events
|
||||
std::vector<std::pair<uint32_t, uint32_t> > regIdxVec;
|
||||
std::vector<uint64_t> timestampVec;
|
||||
std::vector<uint8_t> statusVec;
|
||||
|
||||
void
|
||||
registerEvent(uint32_t simdId,
|
||||
uint32_t regIdx,
|
||||
uint32_t operandSize,
|
||||
uint64_t when,
|
||||
uint8_t newStatus) {
|
||||
regIdxVec.push_back(std::make_pair(simdId, regIdx));
|
||||
timestampVec.push_back(when);
|
||||
statusVec.push_back(newStatus);
|
||||
if (operandSize > 4) {
|
||||
regIdxVec.push_back(std::make_pair(simdId,
|
||||
((regIdx + 1) %
|
||||
numVecRegsPerSimd)));
|
||||
timestampVec.push_back(when);
|
||||
statusVec.push_back(newStatus);
|
||||
}
|
||||
}
|
||||
|
||||
void updateEvents();
|
||||
|
||||
// this hash map will keep track of page divergence
|
||||
// per memory instruction per wavefront. The hash map
|
||||
// is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
|
||||
std::map<Addr, int> pagesTouched;
|
||||
|
||||
ComputeUnit(const Params *p);
|
||||
~ComputeUnit();
|
||||
int spBypassLength() { return spBypassPipeLength; };
|
||||
int dpBypassLength() { return dpBypassPipeLength; };
|
||||
int storeBusLength() { return numCyclesPerStoreTransfer; };
|
||||
int loadBusLength() { return numCyclesPerLoadTransfer; };
|
||||
int wfSize() const { return wavefrontSize; };
|
||||
|
||||
void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
|
||||
void exec();
|
||||
void initiateFetch(Wavefront *wavefront);
|
||||
void fetch(PacketPtr pkt, Wavefront *wavefront);
|
||||
void FillKernelState(Wavefront *w, NDRange *ndr);
|
||||
|
||||
void StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
|
||||
int trueWgSizeTotal);
|
||||
|
||||
void InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
|
||||
int trueWgSize[], int trueWgSizeTotal,
|
||||
LdsChunk *ldsChunk, uint64_t origSpillMemStart);
|
||||
|
||||
void StartWorkgroup(NDRange *ndr);
|
||||
int ReadyWorkgroup(NDRange *ndr);
|
||||
|
||||
bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; }
|
||||
bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; }
|
||||
bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; }
|
||||
int GlbMemUnitId() { return GLBMEM_PIPE; }
|
||||
int ShrMemUnitId() { return LDSMEM_PIPE; }
|
||||
int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; }
|
||||
int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; }
|
||||
/* This function cycles through all the wavefronts in all the phases to see
|
||||
* if all of the wavefronts which should be associated with one barrier
|
||||
* (denoted with _barrier_id), are all at the same barrier in the program
|
||||
* (denoted by bcnt). When the number at the barrier matches bslots, then
|
||||
* return true.
|
||||
*/
|
||||
int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots);
|
||||
bool cedeSIMD(int simdId, int wfSlotId);
|
||||
|
||||
template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr gpuDynInst);
|
||||
virtual void init();
|
||||
void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
|
||||
void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
|
||||
void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
|
||||
bool kernelLaunch=true,
|
||||
RequestPtr req=nullptr);
|
||||
void handleMemPacket(PacketPtr pkt, int memport_index);
|
||||
bool processTimingPacket(PacketPtr pkt);
|
||||
void processFetchReturn(PacketPtr pkt);
|
||||
void updatePageDivergenceDist(Addr addr);
|
||||
|
||||
MasterID masterId() { return _masterId; }
|
||||
|
||||
bool isDone() const;
|
||||
bool isSimdDone(uint32_t) const;
|
||||
|
||||
protected:
|
||||
MasterID _masterId;
|
||||
|
||||
LdsState &lds;
|
||||
|
||||
public:
|
||||
// the following stats compute the avg. TLB access latency per
|
||||
// uncoalesced request (only for data)
|
||||
Stats::Scalar tlbRequests;
|
||||
Stats::Scalar tlbCycles;
|
||||
Stats::Formula tlbLatency;
|
||||
// hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table.
|
||||
Stats::Vector hitsPerTLBLevel;
|
||||
|
||||
Stats::Scalar ldsBankAccesses;
|
||||
Stats::Distribution ldsBankConflictDist;
|
||||
|
||||
// over all memory instructions executed over all wavefronts
|
||||
// how many touched 0-4 pages, 4-8, ..., 60-64 pages
|
||||
Stats::Distribution pageDivergenceDist;
|
||||
Stats::Scalar dynamicGMemInstrCnt;
|
||||
Stats::Scalar dynamicLMemInstrCnt;
|
||||
|
||||
Stats::Scalar wgBlockedDueLdsAllocation;
|
||||
// Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are active
|
||||
// when the instruction is committed, this number is still incremented by 1
|
||||
Stats::Scalar numInstrExecuted;
|
||||
// Number of cycles among successive instruction executions across all
|
||||
// wavefronts of the same CU
|
||||
Stats::Distribution execRateDist;
|
||||
// number of individual vector operations executed
|
||||
Stats::Scalar numVecOpsExecuted;
|
||||
// Total cycles that something is running on the GPU
|
||||
Stats::Scalar totalCycles;
|
||||
Stats::Formula vpc; // vector ops per cycle
|
||||
Stats::Formula ipc; // vector instructions per cycle
|
||||
Stats::Distribution controlFlowDivergenceDist;
|
||||
Stats::Distribution activeLanesPerGMemInstrDist;
|
||||
Stats::Distribution activeLanesPerLMemInstrDist;
|
||||
// number of vector ALU instructions received
|
||||
Stats::Formula numALUInstsExecuted;
|
||||
// number of times a WG can not start due to lack of free VGPRs in SIMDs
|
||||
Stats::Scalar numTimesWgBlockedDueVgprAlloc;
|
||||
Stats::Scalar numCASOps;
|
||||
Stats::Scalar numFailedCASOps;
|
||||
Stats::Scalar completedWfs;
|
||||
// flag per vector SIMD unit that is set when there is at least one
|
||||
// WV that has a vector ALU instruction as the oldest in its
|
||||
// Instruction Buffer: Defined in the Scoreboard stage, consumed
|
||||
// by the Execute stage.
|
||||
std::vector<bool> vectorAluInstAvail;
|
||||
// number of available (oldest) LDS instructions that could have
|
||||
// been issued to the LDS at a specific issue slot
|
||||
int shrMemInstAvail;
|
||||
// number of available Global memory instructions that could have
|
||||
// been issued to TCP at a specific issue slot
|
||||
int glbMemInstAvail;
|
||||
|
||||
void
|
||||
regStats();
|
||||
|
||||
LdsState &
|
||||
getLds() const
|
||||
{
|
||||
return lds;
|
||||
}
|
||||
|
||||
int32_t
|
||||
getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;
|
||||
|
||||
bool
|
||||
sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result));
|
||||
|
||||
typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
|
||||
pageDataStruct pageAccesses;
|
||||
|
||||
class CUExitCallback : public Callback
|
||||
{
|
||||
private:
|
||||
ComputeUnit *computeUnit;
|
||||
|
||||
public:
|
||||
virtual ~CUExitCallback() { }
|
||||
|
||||
CUExitCallback(ComputeUnit *_cu)
|
||||
{
|
||||
computeUnit = _cu;
|
||||
}
|
||||
|
||||
virtual void
|
||||
process();
|
||||
};
|
||||
|
||||
CUExitCallback *cuExitCallback;
|
||||
|
||||
/** Data access Port **/
|
||||
class DataPort : public MasterPort
|
||||
{
|
||||
public:
|
||||
DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
|
||||
: MasterPort(_name, _cu), computeUnit(_cu),
|
||||
index(_index) { }
|
||||
|
||||
bool snoopRangeSent;
|
||||
|
||||
struct SenderState : public Packet::SenderState
|
||||
{
|
||||
GPUDynInstPtr _gpuDynInst;
|
||||
int port_index;
|
||||
Packet::SenderState *saved;
|
||||
|
||||
SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
|
||||
Packet::SenderState *sender_state=nullptr)
|
||||
: _gpuDynInst(gpuDynInst),
|
||||
port_index(_port_index),
|
||||
saved(sender_state) { }
|
||||
};
|
||||
|
||||
class MemReqEvent : public Event
|
||||
{
|
||||
private:
|
||||
DataPort *dataPort;
|
||||
PacketPtr pkt;
|
||||
|
||||
public:
|
||||
MemReqEvent(DataPort *_data_port, PacketPtr _pkt)
|
||||
: Event(), dataPort(_data_port), pkt(_pkt)
|
||||
{
|
||||
setFlags(Event::AutoDelete);
|
||||
}
|
||||
|
||||
void process();
|
||||
const char *description() const;
|
||||
};
|
||||
|
||||
class MemRespEvent : public Event
|
||||
{
|
||||
private:
|
||||
DataPort *dataPort;
|
||||
PacketPtr pkt;
|
||||
|
||||
public:
|
||||
MemRespEvent(DataPort *_data_port, PacketPtr _pkt)
|
||||
: Event(), dataPort(_data_port), pkt(_pkt)
|
||||
{
|
||||
setFlags(Event::AutoDelete);
|
||||
}
|
||||
|
||||
void process();
|
||||
const char *description() const;
|
||||
};
|
||||
|
||||
std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;
|
||||
|
||||
protected:
|
||||
ComputeUnit *computeUnit;
|
||||
int index;
|
||||
|
||||
virtual bool recvTimingResp(PacketPtr pkt);
|
||||
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
|
||||
virtual void recvFunctional(PacketPtr pkt) { }
|
||||
virtual void recvRangeChange() { }
|
||||
virtual void recvReqRetry();
|
||||
|
||||
virtual void
|
||||
getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
|
||||
{
|
||||
resp.clear();
|
||||
snoop = true;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
// Instruction cache access port
|
||||
class SQCPort : public MasterPort
|
||||
{
|
||||
public:
|
||||
SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
|
||||
: MasterPort(_name, _cu), computeUnit(_cu),
|
||||
index(_index) { }
|
||||
|
||||
bool snoopRangeSent;
|
||||
|
||||
struct SenderState : public Packet::SenderState
|
||||
{
|
||||
Wavefront *wavefront;
|
||||
Packet::SenderState *saved;
|
||||
|
||||
SenderState(Wavefront *_wavefront, Packet::SenderState
|
||||
*sender_state=nullptr)
|
||||
: wavefront(_wavefront), saved(sender_state) { }
|
||||
};
|
||||
|
||||
std::deque<std::pair<PacketPtr, Wavefront*>> retries;
|
||||
|
||||
protected:
|
||||
ComputeUnit *computeUnit;
|
||||
int index;
|
||||
|
||||
virtual bool recvTimingResp(PacketPtr pkt);
|
||||
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
|
||||
virtual void recvFunctional(PacketPtr pkt) { }
|
||||
virtual void recvRangeChange() { }
|
||||
virtual void recvReqRetry();
|
||||
|
||||
virtual void
|
||||
getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
|
||||
{
|
||||
resp.clear();
|
||||
snoop = true;
|
||||
}
|
||||
};
|
||||
|
||||
/** Data TLB port **/
|
||||
class DTLBPort : public MasterPort
|
||||
{
|
||||
public:
|
||||
DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
|
||||
: MasterPort(_name, _cu), computeUnit(_cu),
|
||||
index(_index), stalled(false)
|
||||
{ }
|
||||
|
||||
bool isStalled() { return stalled; }
|
||||
void stallPort() { stalled = true; }
|
||||
void unstallPort() { stalled = false; }
|
||||
|
||||
/**
|
||||
* here we queue all the translation requests that were
|
||||
* not successfully sent.
|
||||
*/
|
||||
std::deque<PacketPtr> retries;
|
||||
|
||||
/** SenderState is information carried along with the packet
|
||||
* throughout the TLB hierarchy
|
||||
*/
|
||||
struct SenderState: public Packet::SenderState
|
||||
{
|
||||
// the memInst that this is associated with
|
||||
GPUDynInstPtr _gpuDynInst;
|
||||
|
||||
// the lane in the memInst this is associated with, so we send
|
||||
// the memory request down the right port
|
||||
int portIndex;
|
||||
|
||||
// constructor used for packets involved in timing accesses
|
||||
SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
|
||||
: _gpuDynInst(gpuDynInst), portIndex(port_index) { }
|
||||
|
||||
};
|
||||
|
||||
protected:
|
||||
ComputeUnit *computeUnit;
|
||||
int index;
|
||||
bool stalled;
|
||||
|
||||
virtual bool recvTimingResp(PacketPtr pkt);
|
||||
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
|
||||
virtual void recvFunctional(PacketPtr pkt) { }
|
||||
virtual void recvRangeChange() { }
|
||||
virtual void recvReqRetry();
|
||||
};
|
||||
|
||||
class ITLBPort : public MasterPort
|
||||
{
|
||||
public:
|
||||
ITLBPort(const std::string &_name, ComputeUnit *_cu)
|
||||
: MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { }
|
||||
|
||||
|
||||
bool isStalled() { return stalled; }
|
||||
void stallPort() { stalled = true; }
|
||||
void unstallPort() { stalled = false; }
|
||||
|
||||
/**
|
||||
* here we queue all the translation requests that were
|
||||
* not successfully sent.
|
||||
*/
|
||||
std::deque<PacketPtr> retries;
|
||||
|
||||
/** SenderState is information carried along with the packet
|
||||
* throughout the TLB hierarchy
|
||||
*/
|
||||
struct SenderState: public Packet::SenderState
|
||||
{
|
||||
// The wavefront associated with this request
|
||||
Wavefront *wavefront;
|
||||
|
||||
SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
|
||||
};
|
||||
|
||||
protected:
|
||||
ComputeUnit *computeUnit;
|
||||
bool stalled;
|
||||
|
||||
virtual bool recvTimingResp(PacketPtr pkt);
|
||||
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
|
||||
virtual void recvFunctional(PacketPtr pkt) { }
|
||||
virtual void recvRangeChange() { }
|
||||
virtual void recvReqRetry();
|
||||
};
|
||||
|
||||
/**
|
||||
* the port intended to communicate between the CU and its LDS
|
||||
*/
|
||||
class LDSPort : public MasterPort
|
||||
{
|
||||
public:
|
||||
LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id)
|
||||
: MasterPort(_name, _cu, _id), computeUnit(_cu)
|
||||
{
|
||||
}
|
||||
|
||||
bool isStalled() const { return stalled; }
|
||||
void stallPort() { stalled = true; }
|
||||
void unstallPort() { stalled = false; }
|
||||
|
||||
/**
|
||||
* here we queue all the requests that were
|
||||
* not successfully sent.
|
||||
*/
|
||||
std::queue<PacketPtr> retries;
|
||||
|
||||
/**
|
||||
* SenderState is information carried along with the packet, esp. the
|
||||
* GPUDynInstPtr
|
||||
*/
|
||||
class SenderState: public Packet::SenderState
|
||||
{
|
||||
protected:
|
||||
// The actual read/write/atomic request that goes with this command
|
||||
GPUDynInstPtr _gpuDynInst = nullptr;
|
||||
|
||||
public:
|
||||
SenderState(GPUDynInstPtr gpuDynInst):
|
||||
_gpuDynInst(gpuDynInst)
|
||||
{
|
||||
}
|
||||
|
||||
GPUDynInstPtr
|
||||
getMemInst() const
|
||||
{
|
||||
return _gpuDynInst;
|
||||
}
|
||||
};
|
||||
|
||||
virtual bool
|
||||
sendTimingReq(PacketPtr pkt);
|
||||
|
||||
protected:
|
||||
|
||||
bool stalled = false; ///< whether or not it is stalled
|
||||
|
||||
ComputeUnit *computeUnit;
|
||||
|
||||
virtual bool
|
||||
recvTimingResp(PacketPtr pkt);
|
||||
|
||||
virtual Tick
|
||||
recvAtomic(PacketPtr pkt) { return 0; }
|
||||
|
||||
virtual void
|
||||
recvFunctional(PacketPtr pkt)
|
||||
{
|
||||
}
|
||||
|
||||
virtual void
|
||||
recvRangeChange()
|
||||
{
|
||||
}
|
||||
|
||||
virtual void
|
||||
recvReqRetry();
|
||||
};
|
||||
|
||||
/** The port to access the Local Data Store
|
||||
* Can be connected to a LDS object
|
||||
*/
|
||||
LDSPort *ldsPort = nullptr;
|
||||
|
||||
LDSPort *
|
||||
getLdsPort() const
|
||||
{
|
||||
return ldsPort;
|
||||
}
|
||||
|
||||
/** The memory port for SIMD data accesses.
|
||||
* Can be connected to PhysMem for Ruby for timing simulations
|
||||
*/
|
||||
std::vector<DataPort*> memPort;
|
||||
// port to the TLB hierarchy (i.e., the L1 TLB)
|
||||
std::vector<DTLBPort*> tlbPort;
|
||||
// port to the SQC (i.e. the I-cache)
|
||||
SQCPort *sqcPort;
|
||||
// port to the SQC TLB (there's a separate TLB for each I-cache)
|
||||
ITLBPort *sqcTLBPort;
|
||||
|
||||
virtual BaseMasterPort&
|
||||
getMasterPort(const std::string &if_name, PortID idx)
|
||||
{
|
||||
if (if_name == "memory_port") {
|
||||
memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx),
|
||||
this, idx);
|
||||
return *memPort[idx];
|
||||
} else if (if_name == "translation_port") {
|
||||
tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx),
|
||||
this, idx);
|
||||
return *tlbPort[idx];
|
||||
} else if (if_name == "sqc_port") {
|
||||
sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx),
|
||||
this, idx);
|
||||
return *sqcPort;
|
||||
} else if (if_name == "sqc_tlb_port") {
|
||||
sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this);
|
||||
return *sqcTLBPort;
|
||||
} else if (if_name == "ldsPort") {
|
||||
if (ldsPort) {
|
||||
fatal("an LDS port was already allocated");
|
||||
}
|
||||
ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx);
|
||||
return *ldsPort;
|
||||
} else {
|
||||
panic("incorrect port name");
|
||||
}
|
||||
}
|
||||
|
||||
// xact_cas_load()
|
||||
class waveIdentifier
|
||||
{
|
||||
public:
|
||||
waveIdentifier() { }
|
||||
waveIdentifier(int _simdId, int _wfSlotId)
|
||||
: simdId(_simdId), wfSlotId(_wfSlotId) { }
|
||||
|
||||
int simdId;
|
||||
int wfSlotId;
|
||||
};
|
||||
|
||||
class waveQueue
|
||||
{
|
||||
public:
|
||||
std::list<waveIdentifier> waveIDQueue;
|
||||
};
|
||||
std::map<unsigned, waveQueue> xactCasLoadMap;
|
||||
|
||||
uint64_t getAndIncSeqNum() { return globalSeqNum++; }
|
||||
|
||||
private:
|
||||
uint64_t globalSeqNum;
|
||||
int wavefrontSize;
|
||||
};
|
||||
|
||||
#endif // __COMPUTE_UNIT_HH__
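One detail worth calling out in the header above: registerEvent() records a (simdId, regIdx) pair, a timestamp, and a new status for every VGPR an instruction writes, and an operand wider than 4 bytes pushes a second entry for the next register, wrapping modulo numVecRegsPerSimd. A standalone sketch of that wrap-around bookkeeping, using only names declared in this header (the helper itself is illustrative):

    #include <cstdint>
    #include <vector>

    // Illustrative only: which VGPR indices does one operand occupy?
    // Mirrors the wrap-around used by ComputeUnit::registerEvent().
    static std::vector<uint32_t>
    vgprsTouched(uint32_t regIdx, uint32_t operandSize, uint32_t numVecRegsPerSimd)
    {
        std::vector<uint32_t> regs{regIdx};
        if (operandSize > 4)                      // 64-bit operand -> register pair
            regs.push_back((regIdx + 1) % numVecRegsPerSimd);
        return regs;
    }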
83 src/gpu-compute/condition_register_state.cc Normal file
@@ -0,0 +1,83 @@
/*
 * Copyright (c) 2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * [standard BSD-style license text, identical in every new file in this change]
 *
 * Author: John Kalamatianos
 */

#include "gpu-compute/condition_register_state.hh"

#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"

ConditionRegisterState::ConditionRegisterState()
{
    computeUnit = nullptr;
    c_reg.clear();
    busy.clear();
}

void
ConditionRegisterState::setParent(ComputeUnit *_computeUnit)
{
    computeUnit = _computeUnit;
    _name = computeUnit->name() + ".CondRegState";
}

void
ConditionRegisterState::init(uint32_t _size)
{
    c_reg.resize(_size);
    busy.resize(_size, 0);
}

void
ConditionRegisterState::exec(GPUStaticInst *ii, Wavefront *w)
{
    // iterate over all operands
    for (auto i = 0; i < ii->getNumOperands(); ++i) {
        // is this a condition register destination operand?
        if (ii->isCondRegister(i) && ii->isDstOperand(i)) {
            // mark the register as busy
            markReg(ii->getRegisterIndex(i), 1);
            uint32_t pipeLen = w->computeUnit->spBypassLength();

            // schedule an event for marking the register as ready
            w->computeUnit->
                registerEvent(w->simdId, ii->getRegisterIndex(i),
                              ii->getOperandSize(i),
                              w->computeUnit->shader->tick_cnt +
                              w->computeUnit->shader->ticks(pipeLen), 0);
        }
    }
}
101 src/gpu-compute/condition_register_state.hh Normal file
@@ -0,0 +1,101 @@
/*
 * Copyright (c) 2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * [standard BSD-style license text, identical in every new file in this change]
 *
 * Author: John Kalamatianos
 */

#ifndef __CONDITION_REGISTER_STATE_HH__
#define __CONDITION_REGISTER_STATE_HH__

#include <string>
#include <vector>

#include "gpu-compute/misc.hh"

class ComputeUnit;
class GPUStaticInst;
class Shader;
class Wavefront;

// Condition Register State (used only when executing HSAIL)
class ConditionRegisterState
{
  public:
    ConditionRegisterState();
    void init(uint32_t _size);
    const std::string name() const { return _name; }
    void setParent(ComputeUnit *_computeUnit);
    void regStats() { }

    template<typename T>
    T
    read(int regIdx, int threadId)
    {
        bool tmp = c_reg[regIdx][threadId];
        T *p0 = (T*)(&tmp);

        return *p0;
    }

    template<typename T>
    void
    write(int regIdx, int threadId, T value)
    {
        c_reg[regIdx][threadId] = (bool)(value & 0x01);
    }

    void
    markReg(int regIdx, uint8_t value)
    {
        busy.at(regIdx) = value;
    }

    uint8_t
    regBusy(int idx)
    {
        uint8_t status = busy.at(idx);
        return status;
    }

    int numRegs() { return c_reg.size(); }
    void exec(GPUStaticInst *ii, Wavefront *w);

  private:
    ComputeUnit* computeUnit;
    std::string _name;
    // Condition Register state
    std::vector<VectorMask> c_reg;
    // flag indicating if a register is busy
    std::vector<uint8_t> busy;
};

#endif
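The templated read()/write() pair keeps one bit per work-item lane, and only bit 0 of whatever integer the caller passes survives the write. A minimal usage sketch assuming only this header (the surrounding function is illustrative):

    #include "gpu-compute/condition_register_state.hh"

    void
    condRegExample(ConditionRegisterState &crs)
    {
        crs.init(8);                       // eight condition registers
        crs.write<uint32_t>(2, 0, 0x3);    // lane 0 of c2: only bit 0 is kept
        bool p = crs.read<bool>(2, 0);     // reads back true
        crs.markReg(2, 1);                 // flag c2 busy until its writeback lands
        (void)p;
    }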
394 src/gpu-compute/dispatcher.cc Normal file
@@ -0,0 +1,394 @@
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * [standard BSD-style license text, identical in every new file in this change]
 *
 * Author: Brad Beckmann, Marc Orr
 */

#include "gpu-compute/dispatcher.hh"
|
||||
|
||||
#include "cpu/base.hh"
|
||||
#include "debug/GPUDisp.hh"
|
||||
#include "gpu-compute/cl_driver.hh"
|
||||
#include "gpu-compute/cl_event.hh"
|
||||
#include "gpu-compute/shader.hh"
|
||||
#include "gpu-compute/wavefront.hh"
|
||||
#include "mem/packet_access.hh"
|
||||
|
||||
GpuDispatcher *GpuDispatcher::instance = nullptr;
|
||||
|
||||
GpuDispatcher::GpuDispatcher(const Params *p)
|
||||
: DmaDevice(p), _masterId(p->system->getMasterId(name() + ".disp")),
|
||||
pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency),
|
||||
dispatchCount(0), dispatchActive(false), cpu(p->cpu),
|
||||
shader(p->shader_pointer), driver(p->cl_driver), tickEvent(this)
|
||||
{
|
||||
shader->handshake(this);
|
||||
driver->handshake(this);
|
||||
|
||||
ndRange.wg_disp_rem = false;
|
||||
ndRange.globalWgId = 0;
|
||||
|
||||
schedule(&tickEvent, 0);
|
||||
|
||||
// translation port for the dispatcher
|
||||
tlbPort = new TLBPort(csprintf("%s-port%d", name()), this);
|
||||
|
||||
num_kernelLaunched
|
||||
.name(name() + ".num_kernel_launched")
|
||||
.desc("number of kernel launched")
|
||||
;
|
||||
}
|
||||
|
||||
GpuDispatcher *GpuDispatcherParams::create()
|
||||
{
|
||||
GpuDispatcher *dispatcher = new GpuDispatcher(this);
|
||||
GpuDispatcher::setInstance(dispatcher);
|
||||
|
||||
return GpuDispatcher::getInstance();
|
||||
}
|
||||
|
||||
void
|
||||
GpuDispatcher::serialize(CheckpointOut &cp) const
|
||||
{
|
||||
Tick event_tick = 0;
|
||||
|
||||
if (ndRange.wg_disp_rem)
|
||||
fatal("Checkpointing not supported during active workgroup execution");
|
||||
|
||||
if (tickEvent.scheduled())
|
||||
event_tick = tickEvent.when();
|
||||
|
||||
SERIALIZE_SCALAR(event_tick);
|
||||
|
||||
}
|
||||
|
||||
void
|
||||
GpuDispatcher::unserialize(CheckpointIn &cp)
|
||||
{
|
||||
Tick event_tick;
|
||||
|
||||
if (tickEvent.scheduled())
|
||||
deschedule(&tickEvent);
|
||||
|
||||
UNSERIALIZE_SCALAR(event_tick);
|
||||
|
||||
if (event_tick)
|
||||
schedule(&tickEvent, event_tick);
|
||||
}
|
||||
|
||||
AddrRangeList
|
||||
GpuDispatcher::getAddrRanges() const
|
||||
{
|
||||
AddrRangeList ranges;
|
||||
|
||||
DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n",
|
||||
pioAddr, pioSize);
|
||||
|
||||
ranges.push_back(RangeSize(pioAddr, pioSize));
|
||||
|
||||
return ranges;
|
||||
}
|
||||
|
||||
Tick
|
||||
GpuDispatcher::read(PacketPtr pkt)
|
||||
{
|
||||
assert(pkt->getAddr() >= pioAddr);
|
||||
assert(pkt->getAddr() < pioAddr + pioSize);
|
||||
|
||||
int offset = pkt->getAddr() - pioAddr;
|
||||
pkt->allocate();
|
||||
|
||||
DPRINTF(GPUDisp, " read register %#x size=%d\n", offset, pkt->getSize());
|
||||
|
||||
if (offset < 8) {
|
||||
assert(!offset);
|
||||
assert(pkt->getSize() == 8);
|
||||
|
||||
uint64_t retval = dispatchActive;
|
||||
pkt->set(retval);
|
||||
} else {
|
||||
offset -= 8;
|
||||
assert(offset + pkt->getSize() < sizeof(HsaQueueEntry));
|
||||
char *curTaskPtr = (char*)&curTask;
|
||||
|
||||
memcpy(pkt->getPtr<const void*>(), curTaskPtr + offset, pkt->getSize());
|
||||
}
|
||||
|
||||
pkt->makeAtomicResponse();
|
||||
|
||||
return pioDelay;
|
||||
}
|
||||
|
||||
Tick
|
||||
GpuDispatcher::write(PacketPtr pkt)
|
||||
{
|
||||
assert(pkt->getAddr() >= pioAddr);
|
||||
assert(pkt->getAddr() < pioAddr + pioSize);
|
||||
|
||||
int offset = pkt->getAddr() - pioAddr;
|
||||
|
||||
#if TRACING_ON
|
||||
uint64_t data_val = 0;
|
||||
|
||||
switch (pkt->getSize()) {
|
||||
case 1:
|
||||
data_val = pkt->get<uint8_t>();
|
||||
break;
|
||||
case 2:
|
||||
data_val = pkt->get<uint16_t>();
|
||||
break;
|
||||
case 4:
|
||||
data_val = pkt->get<uint32_t>();
|
||||
break;
|
||||
case 8:
|
||||
data_val = pkt->get<uint64_t>();
|
||||
break;
|
||||
default:
|
||||
DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize());
|
||||
}
|
||||
|
||||
DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset, data_val,
|
||||
pkt->getSize());
|
||||
#endif
|
||||
if (!offset) {
|
||||
static int nextId = 0;
|
||||
|
||||
// The depends field of the qstruct, which was previously unused, is
|
||||
// used to communicate with simulated application.
|
||||
if (curTask.depends) {
|
||||
HostState hs;
|
||||
shader->ReadMem((uint64_t)(curTask.depends), &hs,
|
||||
sizeof(HostState), 0);
|
||||
|
||||
// update event start time (in nano-seconds)
|
||||
uint64_t start = curTick() / 1000;
|
||||
|
||||
shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start),
|
||||
&start, sizeof(uint64_t), 0);
|
||||
}
|
||||
|
||||
// launch kernel
|
||||
++num_kernelLaunched;
|
||||
|
||||
NDRange *ndr = &(ndRangeMap[nextId]);
|
||||
// copy dispatch info
|
||||
ndr->q = curTask;
|
||||
|
||||
// update the numDispTask polled by the runtime
|
||||
accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1);
|
||||
|
||||
ndr->numWgTotal = 1;
|
||||
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
ndr->wgId[i] = 0;
|
||||
ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]);
|
||||
ndr->numWgTotal *= ndr->numWg[i];
|
||||
}
|
||||
|
||||
ndr->numWgCompleted = 0;
|
||||
ndr->globalWgId = 0;
|
||||
ndr->wg_disp_rem = true;
|
||||
ndr->execDone = false;
|
||||
ndr->addrToNotify = (volatile bool*)curTask.addrToNotify;
|
||||
ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft;
|
||||
ndr->dispatchId = nextId;
|
||||
ndr->curTid = pkt->req->threadId();
|
||||
DPRINTF(GPUDisp, "launching kernel %d\n",nextId);
|
||||
execIds.push(nextId);
|
||||
++nextId;
|
||||
|
||||
dispatchActive = true;
|
||||
|
||||
if (!tickEvent.scheduled()) {
|
||||
schedule(&tickEvent, curTick() + shader->ticks(1));
|
||||
}
|
||||
} else {
|
||||
// populate current task struct
|
||||
// first 64 bits are launch reg
|
||||
offset -= 8;
|
||||
assert(offset < sizeof(HsaQueueEntry));
|
||||
char *curTaskPtr = (char*)&curTask;
|
||||
memcpy(curTaskPtr + offset, pkt->getPtr<const void*>(), pkt->getSize());
|
||||
}
|
||||
|
||||
pkt->makeAtomicResponse();
|
||||
|
||||
return pioDelay;
|
||||
}
|
||||
|
||||
|
||||
BaseMasterPort&
|
||||
GpuDispatcher::getMasterPort(const std::string &if_name, PortID idx)
|
||||
{
|
||||
if (if_name == "translation_port") {
|
||||
return *tlbPort;
|
||||
}
|
||||
|
||||
return DmaDevice::getMasterPort(if_name, idx);
|
||||
}
|
||||
|
||||
void
|
||||
GpuDispatcher::exec()
|
||||
{
|
||||
int fail_count = 0;
|
||||
|
||||
// There are potentially multiple outstanding kernel launches.
|
||||
// It is possible that the workgroups in a different kernel
|
||||
// can fit on the GPU even if another kernel's workgroups cannot
|
||||
DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size());
|
||||
|
||||
while (execIds.size() > fail_count) {
|
||||
int execId = execIds.front();
|
||||
|
||||
while (ndRangeMap[execId].wg_disp_rem) {
|
||||
//update the thread context
|
||||
shader->updateThreadContext(ndRangeMap[execId].curTid);
|
||||
|
||||
// attempt to dispatch_workgroup
|
||||
if (!shader->dispatch_workgroups(&ndRangeMap[execId])) {
|
||||
// if we failed try the next kernel,
|
||||
// it may have smaller workgroups.
|
||||
// put it on the queue to retry later
|
||||
DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId);
|
||||
execIds.push(execId);
|
||||
++fail_count;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// let's try the next kernel_id
|
||||
execIds.pop();
|
||||
}
|
||||
|
||||
DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size());
|
||||
|
||||
if (doneIds.size() && cpu) {
|
||||
shader->hostWakeUp(cpu);
|
||||
}
|
||||
|
||||
while (doneIds.size()) {
|
||||
// wakeup the CPU if any Kernels completed this cycle
|
||||
DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front());
|
||||
doneIds.pop();
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
GpuDispatcher::notifyWgCompl(Wavefront *w)
|
||||
{
|
||||
int kern_id = w->kern_id;
|
||||
DPRINTF(GPUDisp, "notify WgCompl %d\n",kern_id);
|
||||
assert(ndRangeMap[kern_id].dispatchId == kern_id);
|
||||
ndRangeMap[kern_id].numWgCompleted++;
|
||||
|
||||
if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) {
|
||||
ndRangeMap[kern_id].execDone = true;
|
||||
doneIds.push(kern_id);
|
||||
|
||||
if (ndRangeMap[kern_id].addrToNotify) {
|
||||
accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify), 1,
|
||||
0);
|
||||
}
|
||||
|
||||
accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1);
|
||||
|
||||
// update event end time (in nano-seconds)
|
||||
if (ndRangeMap[kern_id].q.depends) {
|
||||
HostState *host_state = (HostState*)ndRangeMap[kern_id].q.depends;
|
||||
uint64_t event;
|
||||
shader->ReadMem((uint64_t)(&host_state->event), &event,
|
||||
sizeof(uint64_t), 0);
|
||||
|
||||
uint64_t end = curTick() / 1000;
|
||||
|
||||
shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end,
|
||||
sizeof(uint64_t), 0);
|
||||
}
|
||||
}
|
||||
|
||||
if (!tickEvent.scheduled()) {
|
||||
schedule(&tickEvent, curTick() + shader->ticks(1));
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
GpuDispatcher::scheduleDispatch()
|
||||
{
|
||||
if (!tickEvent.scheduled())
|
||||
schedule(&tickEvent, curTick() + shader->ticks(1));
|
||||
}
|
||||
|
||||
void
|
||||
GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off)
|
||||
{
|
||||
if (cpu) {
|
||||
if (off) {
|
||||
shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq,
|
||||
true);
|
||||
val += off;
|
||||
}
|
||||
|
||||
shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true);
|
||||
} else {
|
||||
panic("Cannot find host");
|
||||
}
|
||||
}
|
||||
|
||||
GpuDispatcher::TickEvent::TickEvent(GpuDispatcher *_dispatcher)
|
||||
: Event(CPU_Tick_Pri), dispatcher(_dispatcher)
|
||||
{
|
||||
}
|
||||
|
||||
void
|
||||
GpuDispatcher::TickEvent::process()
|
||||
{
|
||||
dispatcher->exec();
|
||||
}
|
||||
|
||||
const char*
|
||||
GpuDispatcher::TickEvent::description() const
|
||||
{
|
||||
return "GPU Dispatcher tick";
|
||||
}
|
||||
|
||||
// helper functions for driver to retrieve GPU attributes
|
||||
int
|
||||
GpuDispatcher::getNumCUs()
|
||||
{
|
||||
return shader->cuList.size();
|
||||
}
|
||||
|
||||
void
|
||||
GpuDispatcher::setFuncargsSize(int funcargs_size)
|
||||
{
|
||||
shader->funcargs_size = funcargs_size;
|
||||
}
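The read()/write() handlers above define a small PIO register map: the first quadword of the aperture is the launch/status register (reads return dispatchActive, a write there launches a kernel), and every byte past offset 8 shadows the HsaQueueEntry task descriptor being assembled. A hedged sketch of that address decode (the struct and function are illustrative, not part of the change):

    #include <cstdint>

    // Illustrative decode of the dispatcher's PIO aperture, mirroring
    // GpuDispatcher::read()/write(): quadword 0 = launch/status register,
    // the remainder shadows the HsaQueueEntry task descriptor.
    struct DispRegRef { bool isLaunchReg; uint64_t taskByteOffset; };

    static DispRegRef
    decodeDispatchReg(uint64_t pktAddr, uint64_t pioAddr)
    {
        uint64_t offset = pktAddr - pioAddr;
        if (offset < 8)
            return {true, 0};
        return {false, offset - 8};       // byte offset into HsaQueueEntry
    }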
163 src/gpu-compute/dispatcher.hh Normal file
@@ -0,0 +1,163 @@
/*
|
||||
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Brad Beckmann, Marc Orr
|
||||
*/
|
||||
|
||||
#ifndef __GPU_DISPATCHER_HH__
|
||||
#define __GPU_DISPATCHER_HH__
|
||||
|
||||
#include <queue>
|
||||
#include <vector>
|
||||
|
||||
#include "base/statistics.hh"
|
||||
#include "dev/dma_device.hh"
|
||||
#include "gpu-compute/compute_unit.hh"
|
||||
#include "gpu-compute/ndrange.hh"
|
||||
#include "gpu-compute/qstruct.hh"
|
||||
#include "mem/port.hh"
|
||||
#include "params/GpuDispatcher.hh"
|
||||
|
||||
class BaseCPU;
|
||||
class Shader;
|
||||
|
||||
class GpuDispatcher : public DmaDevice
|
||||
{
|
||||
public:
|
||||
typedef GpuDispatcherParams Params;
|
||||
|
||||
class TickEvent : public Event
|
||||
{
|
||||
private:
|
||||
GpuDispatcher *dispatcher;
|
||||
|
||||
public:
|
||||
TickEvent(GpuDispatcher *);
|
||||
void process();
|
||||
const char *description() const;
|
||||
};
|
||||
|
||||
MasterID masterId() { return _masterId; }
|
||||
|
||||
protected:
|
||||
MasterID _masterId;
|
||||
|
||||
// Base and length of PIO register space
|
||||
Addr pioAddr;
|
||||
Addr pioSize;
|
||||
Tick pioDelay;
|
||||
|
||||
HsaQueueEntry curTask;
|
||||
|
||||
std::unordered_map<int, NDRange> ndRangeMap;
|
||||
NDRange ndRange;
|
||||
|
||||
// list of kernel_ids to launch
|
||||
std::queue<int> execIds;
|
||||
// list of kernel_ids that have finished
|
||||
std::queue<int> doneIds;
|
||||
|
||||
uint64_t dispatchCount;
|
||||
// is there a kernel in execution?
|
||||
bool dispatchActive;
|
||||
|
||||
BaseCPU *cpu;
|
||||
Shader *shader;
|
||||
ClDriver *driver;
|
||||
TickEvent tickEvent;
|
||||
|
||||
static GpuDispatcher *instance;
|
||||
|
||||
// sycall emulation mode can have only 1 application running(?)
|
||||
// else we have to do some pid based tagging
|
||||
// unused
|
||||
typedef std::unordered_map<uint64_t, uint64_t> TranslationBuffer;
|
||||
TranslationBuffer tlb;
|
||||
|
||||
public:
|
||||
/*statistics*/
|
||||
Stats::Scalar num_kernelLaunched;
|
||||
GpuDispatcher(const Params *p);
|
||||
|
||||
~GpuDispatcher() { }
|
||||
|
||||
void exec();
|
||||
virtual void serialize(CheckpointOut &cp) const;
|
||||
virtual void unserialize(CheckpointIn &cp);
|
||||
void notifyWgCompl(Wavefront *w);
|
||||
void scheduleDispatch();
|
||||
void accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off);
|
||||
|
||||
// using singleton so that glue code can pass pointer locations
|
||||
// to the dispatcher. when there are multiple dispatchers, we can
|
||||
// call something like getInstance(index)
|
||||
static void
|
||||
setInstance(GpuDispatcher *_instance)
|
||||
{
|
||||
instance = _instance;
|
||||
}
|
||||
|
||||
static GpuDispatcher* getInstance() { return instance; }
|
||||
|
||||
class TLBPort : public MasterPort
|
||||
{
|
||||
public:
|
||||
|
||||
TLBPort(const std::string &_name, GpuDispatcher *_dispatcher)
|
||||
: MasterPort(_name, _dispatcher), dispatcher(_dispatcher) { }
|
||||
|
||||
protected:
|
||||
GpuDispatcher *dispatcher;
|
||||
|
||||
virtual bool recvTimingResp(PacketPtr pkt) { return true; }
|
||||
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
|
||||
virtual void recvFunctional(PacketPtr pkt) { }
|
||||
virtual void recvRangeChange() { }
|
||||
virtual void recvReqRetry() { }
|
||||
|
||||
};
|
||||
|
||||
TLBPort *tlbPort;
|
||||
|
||||
virtual BaseMasterPort& getMasterPort(const std::string &if_name,
|
||||
PortID idx);
|
||||
|
||||
AddrRangeList getAddrRanges() const;
|
||||
Tick read(PacketPtr pkt);
|
||||
Tick write(PacketPtr pkt);
|
||||
|
||||
// helper functions to retrieve/set GPU attributes
|
||||
int getNumCUs();
|
||||
void setFuncargsSize(int funcargs_size);
|
||||
};
|
||||
|
||||
#endif // __GPU_DISPATCHER_HH__
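The dispatcher is reached through the static setInstance()/getInstance() pair noted in the comment above, so glue code that only knows pointer locations can still find it. Below is a minimal, self-contained sketch of that set-once/get-anywhere pattern; FakeDispatcher and launchKernel() are illustrative stand-ins, not part of the gem5 API.

#include <cassert>
#include <iostream>

// Stand-in for GpuDispatcher: one instance registers itself at construction
// time, and unrelated code retrieves it later without plumbing a pointer.
class FakeDispatcher
{
  public:
    FakeDispatcher() { setInstance(this); }

    void launchKernel(int kernel_id)
    {
        std::cout << "dispatching kernel " << kernel_id << "\n";
    }

    static void setInstance(FakeDispatcher *_instance) { instance = _instance; }
    static FakeDispatcher* getInstance() { return instance; }

  private:
    static FakeDispatcher *instance;
};

FakeDispatcher *FakeDispatcher::instance = nullptr;

int main()
{
    FakeDispatcher dispatcher;
    // Glue code elsewhere only needs the class, not the object.
    assert(FakeDispatcher::getInstance());
    FakeDispatcher::getInstance()->launchKernel(0);
    return 0;
}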
203
src/gpu-compute/exec_stage.cc
Normal file
@@ -0,0 +1,203 @@
/*
|
||||
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: John Kalamatianos, Sooraj Puthoor
|
||||
*/
|
||||
|
||||
#include "gpu-compute/exec_stage.hh"
|
||||
|
||||
#include "gpu-compute/compute_unit.hh"
|
||||
#include "gpu-compute/wavefront.hh"
|
||||
|
||||
ExecStage::ExecStage(const ComputeUnitParams *p) : numSIMDs(p->num_SIMDs),
|
||||
numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes),
|
||||
vectorAluInstAvail(nullptr), glbMemInstAvail(nullptr),
|
||||
shrMemInstAvail(nullptr), lastTimeInstExecuted(false),
|
||||
thisTimeInstExecuted(false), instrExecuted (false),
|
||||
executionResourcesUsed(0)
|
||||
{
|
||||
numTransActiveIdle = 0;
|
||||
idle_dur = 0;
|
||||
}
|
||||
|
||||
void
|
||||
ExecStage::init(ComputeUnit *cu)
|
||||
{
|
||||
computeUnit = cu;
|
||||
_name = computeUnit->name() + ".ExecStage";
|
||||
dispatchList = &computeUnit->dispatchList;
|
||||
vectorAluInstAvail = &(computeUnit->vectorAluInstAvail);
|
||||
    glbMemInstAvail = &(computeUnit->glbMemInstAvail);
|
||||
    shrMemInstAvail = &(computeUnit->shrMemInstAvail);
|
||||
idle_dur = 0;
|
||||
}
|
||||
|
||||
void
|
||||
ExecStage::collectStatistics(enum STAT_STATUS stage, int unitId)
{
|
||||
if (stage == IdleExec) {
|
||||
// count cycles of no vector ALU instruction executed
|
||||
// even if one was the oldest in a WV of that vector SIMD unit
|
||||
if (computeUnit->isVecAlu(unitId) && vectorAluInstAvail->at(unitId)) {
|
||||
numCyclesWithNoInstrTypeIssued[unitId]++;
|
||||
}
|
||||
|
||||
// count cycles of no global memory (vector) instruction executed
|
||||
// even if one was the oldest in a WV of that vector SIMD unit
|
||||
if (computeUnit->isGlbMem(unitId) && *glbMemInstAvail > 0) {
|
||||
numCyclesWithNoInstrTypeIssued[unitId]++;
|
||||
(*glbMemInstAvail)--;
|
||||
}
|
||||
|
||||
// count cycles of no shared memory (vector) instruction executed
|
||||
// even if one was the oldest in a WV of that vector SIMD unit
|
||||
if (computeUnit->isShrMem(unitId) && *shrMemInstAvail > 0) {
|
||||
numCyclesWithNoInstrTypeIssued[unitId]++;
|
||||
(*shrMemInstAvail)--;
|
||||
}
|
||||
} else if (stage == BusyExec) {
|
||||
// count the number of cycles an instruction to a specific unit
|
||||
// was issued
|
||||
numCyclesWithInstrTypeIssued[unitId]++;
|
||||
thisTimeInstExecuted = true;
|
||||
instrExecuted = true;
|
||||
++executionResourcesUsed;
|
||||
} else if (stage == PostExec) {
|
||||
// count the number of transitions from active to idle
|
||||
if (lastTimeInstExecuted && !thisTimeInstExecuted) {
|
||||
++numTransActiveIdle;
|
||||
}
|
||||
|
||||
if (!lastTimeInstExecuted && thisTimeInstExecuted) {
|
||||
idleDur.sample(idle_dur);
|
||||
idle_dur = 0;
|
||||
} else if (!thisTimeInstExecuted) {
|
||||
idle_dur++;
|
||||
}
|
||||
|
||||
lastTimeInstExecuted = thisTimeInstExecuted;
|
||||
// track the number of cycles we either issued one vector instruction
|
||||
// or issued no instructions at all
|
||||
if (instrExecuted) {
|
||||
numCyclesWithInstrIssued++;
|
||||
} else {
|
||||
numCyclesWithNoIssue++;
|
||||
}
|
||||
|
||||
spc.sample(executionResourcesUsed);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ExecStage::initStatistics()
|
||||
{
|
||||
instrExecuted = false;
|
||||
executionResourcesUsed = 0;
|
||||
thisTimeInstExecuted = false;
|
||||
}
|
||||
|
||||
void
|
||||
ExecStage::exec()
|
||||
{
|
||||
initStatistics();
|
||||
|
||||
for (int unitId = 0; unitId < (numSIMDs + numMemUnits); ++unitId) {
|
||||
// if dispatch list for this execution resource is empty,
|
||||
// skip this execution resource this cycle
|
||||
if (dispatchList->at(unitId).second == EMPTY) {
|
||||
collectStatistics(IdleExec, unitId);
|
||||
continue;
|
||||
}
|
||||
|
||||
collectStatistics(BusyExec, unitId);
|
||||
// execute an instruction for the WF
|
||||
dispatchList->at(unitId).first->exec();
|
||||
// clear the dispatch list entry
|
||||
dispatchList->at(unitId).second = EMPTY;
|
||||
dispatchList->at(unitId).first = (Wavefront*)nullptr;
|
||||
}
|
||||
|
||||
collectStatistics(PostExec, 0);
|
||||
}
|
||||
|
||||
void
|
||||
ExecStage::regStats()
|
||||
{
|
||||
numTransActiveIdle
|
||||
.name(name() + ".num_transitions_active_to_idle")
|
||||
.desc("number of CU transitions from active to idle")
|
||||
;
|
||||
|
||||
numCyclesWithNoIssue
|
||||
.name(name() + ".num_cycles_with_no_issue")
|
||||
.desc("number of cycles the CU issues nothing")
|
||||
;
|
||||
|
||||
numCyclesWithInstrIssued
|
||||
.name(name() + ".num_cycles_with_instr_issued")
|
||||
.desc("number of cycles the CU issued at least one instruction")
|
||||
;
|
||||
|
||||
spc
|
||||
.init(0, numSIMDs + numMemUnits, 1)
|
||||
.name(name() + ".spc")
|
||||
.desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)")
|
||||
;
|
||||
|
||||
idleDur
|
||||
.init(0,75,5)
|
||||
.name(name() + ".idle_duration_in_cycles")
|
||||
.desc("duration of idle periods in cycles")
|
||||
;
|
||||
|
||||
numCyclesWithInstrTypeIssued
|
||||
.init(numSIMDs + numMemUnits)
|
||||
.name(name() + ".num_cycles_with_instrtype_issue")
|
||||
.desc("Number of cycles at least one instruction of specific type "
|
||||
"issued")
|
||||
;
|
||||
|
||||
numCyclesWithNoInstrTypeIssued
|
||||
.init(numSIMDs + numMemUnits)
|
||||
.name(name() + ".num_cycles_with_instr_type_no_issue")
|
||||
.desc("Number of cycles no instruction of specific type issued")
|
||||
;
|
||||
|
||||
for (int i = 0; i < numSIMDs; ++i) {
|
||||
numCyclesWithInstrTypeIssued.subname(i, csprintf("ALU%d",i));
|
||||
numCyclesWithNoInstrTypeIssued.subname(i, csprintf("ALU%d",i));
|
||||
}
|
||||
|
||||
numCyclesWithInstrTypeIssued.subname(numSIMDs, csprintf("GM"));
|
||||
numCyclesWithNoInstrTypeIssued.subname(numSIMDs, csprintf("GM"));
|
||||
numCyclesWithInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM"));
|
||||
numCyclesWithNoInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM"));
|
||||
}
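regStats() above registers the active-to-idle transition count and the idle-duration histogram that collectStatistics(PostExec, ...) updates. The sketch below reproduces just that bookkeeping with plain counters over an illustrative issue trace, rather than the gem5 Stats API, to show when a transition is counted and when an idle stretch is sampled.

#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
    // Per-cycle issue pattern for one CU: true = at least one instruction
    // issued that cycle (illustrative input, not simulator output).
    std::vector<bool> issued = {true, true, false, false, false, true, false, true};

    bool last = false;
    uint64_t transitionsActiveToIdle = 0;
    uint64_t idleDur = 0;

    for (bool thisCycle : issued) {
        if (last && !thisCycle)
            ++transitionsActiveToIdle;  // active -> idle edge

        if (!last && thisCycle) {
            std::cout << "idle stretch of " << idleDur << " cycles ended\n";
            idleDur = 0;                // sample and reset, like idleDur.sample()
        } else if (!thisCycle) {
            ++idleDur;                  // still idle, keep counting
        }
        last = thisCycle;
    }

    std::cout << "active->idle transitions: " << transitionsActiveToIdle << "\n";
    return 0;
}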
129
src/gpu-compute/exec_stage.hh
Normal file
@@ -0,0 +1,129 @@
/*
|
||||
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: John Kalamatianos, Sooraj Puthoor
|
||||
*/
|
||||
|
||||
#ifndef __EXEC_STAGE_HH__
|
||||
#define __EXEC_STAGE_HH__
|
||||
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "sim/stats.hh"
|
||||
|
||||
class ComputeUnit;
|
||||
class Wavefront;
|
||||
struct ComputeUnitParams;
|
||||
|
||||
enum STAT_STATUS
|
||||
{
|
||||
IdleExec,
|
||||
BusyExec,
|
||||
PostExec
|
||||
};
|
||||
|
||||
enum DISPATCH_STATUS
|
||||
{
|
||||
EMPTY = 0,
|
||||
FILLED
|
||||
};
|
||||
|
||||
// Execution stage.
|
||||
// Each execution resource executes the
|
||||
// wave which is in its dispatch list.
|
||||
// The schedule stage is responsible for
|
||||
// adding a wave into each execution resource's
|
||||
// dispatch list.
|
||||
|
||||
class ExecStage
|
||||
{
|
||||
public:
|
||||
ExecStage(const ComputeUnitParams* params);
|
||||
~ExecStage() { }
|
||||
void init(ComputeUnit *cu);
|
||||
void exec();
|
||||
|
||||
std::string name() { return _name; }
|
||||
void regStats();
|
||||
// number of idle cycles
|
||||
Stats::Scalar numCyclesWithNoIssue;
|
||||
// number of busy cycles
|
||||
Stats::Scalar numCyclesWithInstrIssued;
|
||||
// number of cycles (per execution unit) during which at least one
|
||||
// instruction was issued to that unit
|
||||
Stats::Vector numCyclesWithInstrTypeIssued;
|
||||
// number of idle cycles (per execution unit) during which the unit issued
|
||||
// no instruction targeting that unit, even though there is at least one
|
||||
// Wavefront with such an instruction as the oldest
|
||||
Stats::Vector numCyclesWithNoInstrTypeIssued;
|
||||
// SIMDs active per cycle
|
||||
Stats::Distribution spc;
|
||||
|
||||
private:
|
||||
void collectStatistics(enum STAT_STATUS stage, int unitId);
|
||||
void initStatistics();
|
||||
ComputeUnit *computeUnit;
|
||||
uint32_t numSIMDs;
|
||||
|
||||
// Number of memory execution resources;
|
||||
// both global and local memory execution resources in CU
|
||||
uint32_t numMemUnits;
|
||||
|
||||
// List of waves which will be dispatched to
|
||||
// each execution resource. A FILLED implies
|
||||
// dispatch list is non-empty and
|
||||
// execution unit has something to execute
|
||||
// this cycle. Currently, the dispatch list of
|
||||
// an execution resource can hold only one wave because
|
||||
// an execution resource can execute only one wave in a cycle.
|
||||
// dispatchList is used to communicate between schedule
|
||||
// and exec stage
|
||||
std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList;
|
||||
// flag per vector SIMD unit that is set when there is at least one
|
||||
// WV that has a vector ALU instruction as the oldest in its
|
||||
// Instruction Buffer
|
||||
std::vector<bool> *vectorAluInstAvail;
|
||||
int *glbMemInstAvail;
|
||||
int *shrMemInstAvail;
|
||||
bool lastTimeInstExecuted;
|
||||
bool thisTimeInstExecuted;
|
||||
bool instrExecuted;
|
||||
Stats::Scalar numTransActiveIdle;
|
||||
Stats::Distribution idleDur;
|
||||
uint32_t executionResourcesUsed;
|
||||
uint64_t idle_dur;
|
||||
std::string _name;
|
||||
};
|
||||
|
||||
#endif // __EXEC_STAGE_HH__
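The dispatch list described above is the handshake between the schedule and exec stages: a slot marked FILLED holds exactly one wave, and exec() hands the slot back EMPTY after running it. A self-contained sketch of that pattern follows; ToyWave and ToyStatus are illustrative stand-ins for Wavefront and DISPATCH_STATUS.

#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Toy stand-ins; names are illustrative only.
struct ToyWave { std::string id; void exec() { std::cout << id << " executes\n"; } };
enum ToyStatus { TOY_EMPTY = 0, TOY_FILLED };

int main()
{
    // One slot per execution resource (SIMDs + memory pipes).
    std::vector<std::pair<ToyWave*, ToyStatus>> dispatchList(4, {nullptr, TOY_EMPTY});

    // "Schedule stage": fill at most one wave per resource this cycle.
    ToyWave w0{"wf0"}, w2{"wf2"};
    dispatchList[0] = {&w0, TOY_FILLED};
    dispatchList[2] = {&w2, TOY_FILLED};

    // "Exec stage": run whatever is FILLED, then hand the slot back EMPTY.
    for (auto &slot : dispatchList) {
        if (slot.second == TOY_EMPTY)
            continue;
        slot.first->exec();
        slot = {nullptr, TOY_EMPTY};
    }
    return 0;
}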
106
src/gpu-compute/fetch_stage.cc
Normal file
@@ -0,0 +1,106 @@
/*
|
||||
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Anthony Gutierrez, Sooraj Puthoor
|
||||
*/
|
||||
|
||||
#include "gpu-compute/fetch_stage.hh"
|
||||
|
||||
#include "gpu-compute/compute_unit.hh"
|
||||
#include "gpu-compute/wavefront.hh"
|
||||
|
||||
FetchStage::FetchStage(const ComputeUnitParams* p) : numSIMDs(p->num_SIMDs),
|
||||
computeUnit(nullptr)
|
||||
{
|
||||
for (int j = 0; j < numSIMDs; ++j) {
|
||||
FetchUnit newFetchUnit(p);
|
||||
fetchUnit.push_back(newFetchUnit);
|
||||
}
|
||||
}
|
||||
|
||||
FetchStage::~FetchStage()
|
||||
{
|
||||
fetchUnit.clear();
|
||||
}
|
||||
|
||||
void
|
||||
FetchStage::init(ComputeUnit *cu)
|
||||
{
|
||||
computeUnit = cu;
|
||||
_name = computeUnit->name() + ".FetchStage";
|
||||
|
||||
for (int j = 0; j < numSIMDs; ++j) {
|
||||
fetchUnit[j].bindWaveList(&computeUnit->wfList[j]);
|
||||
fetchUnit[j].init(computeUnit);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
FetchStage::exec()
|
||||
{
|
||||
for (int j = 0; j < numSIMDs; ++j) {
|
||||
fetchUnit[j].exec();
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
FetchStage::processFetchReturn(PacketPtr pkt)
|
||||
{
|
||||
ComputeUnit::SQCPort::SenderState *sender_state =
|
||||
safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->senderState);
|
||||
|
||||
Wavefront *wavefront = sender_state->wavefront;
|
||||
|
||||
const unsigned num_instructions = pkt->req->getSize() /
|
||||
sizeof(TheGpuISA::RawMachInst);
|
||||
|
||||
instFetchInstReturned.sample(num_instructions);
|
||||
uint32_t simdId = wavefront->simdId;
|
||||
fetchUnit[simdId].processFetchReturn(pkt);
|
||||
}
|
||||
|
||||
void
|
||||
FetchStage::fetch(PacketPtr pkt, Wavefront *wavefront)
|
||||
{
|
||||
fetchUnit[wavefront->simdId].fetch(pkt, wavefront);
|
||||
}
|
||||
|
||||
void
|
||||
FetchStage::regStats()
|
||||
{
|
||||
instFetchInstReturned
|
||||
.init(1, 32, 1)
|
||||
.name(name() + ".inst_fetch_instr_returned")
|
||||
.desc("For each instruction fetch request recieved record how many "
|
||||
"instructions you got from it")
|
||||
;
|
||||
}
|
78
src/gpu-compute/fetch_stage.hh
Normal file
@@ -0,0 +1,78 @@
/*
|
||||
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Anthony Gutierrez, Sooraj Puthoor
|
||||
*/
|
||||
|
||||
#ifndef __FETCH_STAGE_HH__
|
||||
#define __FETCH_STAGE_HH__
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "gpu-compute/fetch_unit.hh"
|
||||
|
||||
// Instruction fetch stage.
|
||||
// All dispatched wavefronts for all SIMDs are analyzed for the
|
||||
// need to fetch instructions. From the fetch eligible waves,
|
||||
// one wave is selected from each SIMD and fetch is initiated
|
||||
// for the selected waves.
|
||||
|
||||
class ComputeUnit;
|
||||
class Wavefront;
|
||||
|
||||
class FetchStage
|
||||
{
|
||||
public:
|
||||
FetchStage(const ComputeUnitParams* params);
|
||||
~FetchStage();
|
||||
void init(ComputeUnit *cu);
|
||||
void exec();
|
||||
void processFetchReturn(PacketPtr pkt);
|
||||
void fetch(PacketPtr pkt, Wavefront *wave);
|
||||
|
||||
// Stats related variables and methods
|
||||
std::string name() { return _name; }
|
||||
void regStats();
|
||||
Stats::Distribution instFetchInstReturned;
|
||||
|
||||
private:
|
||||
uint32_t numSIMDs;
|
||||
ComputeUnit *computeUnit;
|
||||
|
||||
// List of fetch units. A fetch unit is
|
||||
// instantiated per SIMD
|
||||
std::vector<FetchUnit> fetchUnit;
|
||||
std::string _name;
|
||||
};
|
||||
|
||||
#endif // __FETCH_STAGE_HH__
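The eligibility test the comment above refers to lives in FetchUnit::exec(): a wave may fetch only if it is running, has at most four instructions buffered, has no branch in its instruction buffer, and has no fetch outstanding. A self-contained sketch of that filter follows; ToyWave and the example waves are illustrative.

#include <iostream>
#include <vector>

// Illustrative stand-in for the per-wave state FetchUnit::exec() inspects.
struct ToyWave
{
    bool running;
    int bufferedInsts;
    bool hasBranchBuffered;
    bool pendingFetch;
};

static bool
fetchEligible(const ToyWave &w)
{
    // Mirrors the condition in FetchUnit::exec(): active wave, at most 4
    // instructions already buffered, no buffered branch, no fetch in flight.
    return w.running && w.bufferedInsts <= 4 &&
           !w.hasBranchBuffered && !w.pendingFetch;
}

int main()
{
    std::vector<ToyWave> waves = {
        {true, 2, false, false},   // eligible
        {true, 6, false, false},   // too many buffered instructions
        {true, 1, true,  false},   // branch buffered, stop speculative fetch
        {false, 0, false, false},  // not running
    };

    for (size_t i = 0; i < waves.size(); ++i)
        std::cout << "wave " << i
                  << (fetchEligible(waves[i]) ? " eligible" : " not eligible") << "\n";
    return 0;
}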
293
src/gpu-compute/fetch_unit.cc
Normal file
@@ -0,0 +1,293 @@
/*
|
||||
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Brad Beckmann, Sooraj Puthoor
|
||||
*/
|
||||
|
||||
#include "gpu-compute/fetch_unit.hh"
|
||||
|
||||
#include "debug/GPUFetch.hh"
|
||||
#include "debug/GPUPort.hh"
|
||||
#include "debug/GPUTLB.hh"
|
||||
#include "gpu-compute/compute_unit.hh"
|
||||
#include "gpu-compute/gpu_dyn_inst.hh"
|
||||
#include "gpu-compute/gpu_static_inst.hh"
|
||||
#include "gpu-compute/shader.hh"
|
||||
#include "gpu-compute/wavefront.hh"
|
||||
#include "mem/ruby/system/RubySystem.hh"
|
||||
|
||||
uint32_t FetchUnit::globalFetchUnitID;
|
||||
|
||||
FetchUnit::FetchUnit(const ComputeUnitParams* params) :
|
||||
timingSim(true),
|
||||
computeUnit(nullptr),
|
||||
fetchScheduler(params),
|
||||
waveList(nullptr)
|
||||
{
|
||||
}
|
||||
|
||||
FetchUnit::~FetchUnit()
|
||||
{
|
||||
fetchQueue.clear();
|
||||
fetchStatusQueue.clear();
|
||||
}
|
||||
|
||||
void
|
||||
FetchUnit::init(ComputeUnit *cu)
|
||||
{
|
||||
computeUnit = cu;
|
||||
timingSim = computeUnit->shader->timingSim;
|
||||
fetchQueue.clear();
|
||||
fetchStatusQueue.resize(computeUnit->shader->n_wf);
|
||||
|
||||
for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
|
||||
fetchStatusQueue[j] = std::make_pair(waveList->at(j), false);
|
||||
}
|
||||
|
||||
fetchScheduler.bindList(&fetchQueue);
|
||||
}
|
||||
|
||||
void
|
||||
FetchUnit::exec()
|
||||
{
|
||||
// re-evaluate waves which are marked as not ready for fetch
|
||||
for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
|
||||
        // Following code assumes 64-bit operation and all insts are
|
||||
// represented by 64-bit pointers to inst objects.
|
||||
Wavefront *curWave = fetchStatusQueue[j].first;
|
||||
assert (curWave);
|
||||
|
||||
// The wavefront has to be active, the IB occupancy has to be
|
||||
// 4 or less instructions and it can not have any branches to
|
||||
// prevent speculative instruction fetches
|
||||
if (!fetchStatusQueue[j].second) {
|
||||
if (curWave->status == Wavefront::S_RUNNING &&
|
||||
curWave->instructionBuffer.size() <= 4 &&
|
||||
!curWave->instructionBufferHasBranch() &&
|
||||
!curWave->pendingFetch) {
|
||||
fetchQueue.push_back(curWave);
|
||||
fetchStatusQueue[j].second = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fetch only if there is some wave ready to be fetched
|
||||
    // An empty fetchQueue will cause the scheduler to panic
|
||||
if (fetchQueue.size()) {
|
||||
Wavefront *waveToBeFetched = fetchScheduler.chooseWave();
|
||||
waveToBeFetched->pendingFetch = true;
|
||||
fetchStatusQueue[waveToBeFetched->wfSlotId].second = false;
|
||||
initiateFetch(waveToBeFetched);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
FetchUnit::initiateFetch(Wavefront *wavefront)
|
||||
{
|
||||
// calculate the virtual address to fetch from the SQC
|
||||
Addr vaddr = wavefront->pc() + wavefront->instructionBuffer.size();
|
||||
vaddr = wavefront->base_ptr + vaddr * sizeof(GPUStaticInst*);
|
||||
|
||||
DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n",
|
||||
computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr);
|
||||
|
||||
// Since this is an instruction prefetch, if you're split then just finish
|
||||
// out the current line.
|
||||
unsigned block_size = RubySystem::getBlockSizeBytes();
|
||||
// check for split accesses
|
||||
Addr split_addr = roundDown(vaddr + block_size - 1, block_size);
|
||||
unsigned size = block_size;
|
||||
|
||||
if (split_addr > vaddr) {
|
||||
// misaligned access, just grab the rest of the line
|
||||
size = split_addr - vaddr;
|
||||
}
|
||||
|
||||
// set up virtual request
|
||||
Request *req = new Request(0, vaddr, size, Request::INST_FETCH,
|
||||
computeUnit->masterId(), 0, 0, 0);
|
||||
|
||||
PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
|
||||
// This fetchBlock is kind of faux right now - because the translations so
|
||||
// far don't actually return Data
|
||||
uint64_t fetchBlock;
|
||||
pkt->dataStatic(&fetchBlock);
|
||||
|
||||
if (timingSim) {
|
||||
// SenderState needed on Return
|
||||
pkt->senderState = new ComputeUnit::ITLBPort::SenderState(wavefront);
|
||||
|
||||
// Sender State needed by TLB hierarchy
|
||||
pkt->senderState =
|
||||
new TheISA::GpuTLB::TranslationState(BaseTLB::Execute,
|
||||
computeUnit->shader->gpuTc,
|
||||
false, pkt->senderState);
|
||||
|
||||
if (computeUnit->sqcTLBPort->isStalled()) {
|
||||
assert(computeUnit->sqcTLBPort->retries.size() > 0);
|
||||
|
||||
DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
|
||||
vaddr);
|
||||
|
||||
computeUnit->sqcTLBPort->retries.push_back(pkt);
|
||||
} else if (!computeUnit->sqcTLBPort->sendTimingReq(pkt)) {
|
||||
// Stall the data port;
|
||||
// No more packet is issued till
|
||||
// ruby indicates resources are freed by
|
||||
// a recvReqRetry() call back on this port.
|
||||
computeUnit->sqcTLBPort->stallPort();
|
||||
|
||||
DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
|
||||
vaddr);
|
||||
|
||||
computeUnit->sqcTLBPort->retries.push_back(pkt);
|
||||
} else {
|
||||
DPRINTF(GPUTLB, "sent FETCH translation request for %#x\n", vaddr);
|
||||
}
|
||||
} else {
|
||||
pkt->senderState =
|
||||
new TheISA::GpuTLB::TranslationState(BaseTLB::Execute,
|
||||
computeUnit->shader->gpuTc);
|
||||
|
||||
computeUnit->sqcTLBPort->sendFunctional(pkt);
|
||||
|
||||
TheISA::GpuTLB::TranslationState *sender_state =
|
||||
safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
|
||||
|
||||
delete sender_state->tlbEntry;
|
||||
delete sender_state;
|
||||
// fetch the instructions from the SQC when we operate in
|
||||
// functional mode only
|
||||
fetch(pkt, wavefront);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront)
|
||||
{
|
||||
assert(pkt->req->hasPaddr());
|
||||
assert(pkt->req->hasSize());
|
||||
|
||||
DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch Access: %#x\n",
|
||||
computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
|
||||
pkt->req->getPaddr());
|
||||
|
||||
// this is necessary because the GPU TLB receives packets instead of
|
||||
    // requests. when the translation is complete, all relevant fields in the
|
||||
// request will be populated, but not in the packet. here we create the
|
||||
// new packet so we can set the size, addr, and proper flags.
|
||||
PacketPtr oldPkt = pkt;
|
||||
pkt = new Packet(oldPkt->req, oldPkt->cmd);
|
||||
delete oldPkt;
|
||||
|
||||
TheGpuISA::RawMachInst *data =
|
||||
new TheGpuISA::RawMachInst[pkt->req->getSize() /
|
||||
sizeof(TheGpuISA::RawMachInst)];
|
||||
|
||||
pkt->dataDynamic<TheGpuISA::RawMachInst>(data);
|
||||
|
||||
// New SenderState for the memory access
|
||||
pkt->senderState = new ComputeUnit::SQCPort::SenderState(wavefront);
|
||||
|
||||
if (timingSim) {
|
||||
// translation is done. Send the appropriate timing memory request.
|
||||
|
||||
if (!computeUnit->sqcPort->sendTimingReq(pkt)) {
|
||||
computeUnit->sqcPort->retries.push_back(std::make_pair(pkt,
|
||||
wavefront));
|
||||
|
||||
DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x failed!\n",
|
||||
computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
|
||||
pkt->req->getPaddr());
|
||||
} else {
|
||||
DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x sent!\n",
|
||||
computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
|
||||
pkt->req->getPaddr());
|
||||
}
|
||||
} else {
|
||||
computeUnit->sqcPort->sendFunctional(pkt);
|
||||
processFetchReturn(pkt);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
FetchUnit::processFetchReturn(PacketPtr pkt)
|
||||
{
|
||||
ComputeUnit::SQCPort::SenderState *sender_state =
|
||||
safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->senderState);
|
||||
|
||||
Wavefront *wavefront = sender_state->wavefront;
|
||||
|
||||
DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned "
|
||||
"%d bytes, %d instructions!\n", computeUnit->cu_id,
|
||||
wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr(),
|
||||
pkt->req->getSize(), pkt->req->getSize() /
|
||||
sizeof(TheGpuISA::RawMachInst));
|
||||
|
||||
if (wavefront->dropFetch) {
|
||||
assert(wavefront->instructionBuffer.empty());
|
||||
wavefront->dropFetch = false;
|
||||
} else {
|
||||
TheGpuISA::RawMachInst *inst_index_ptr =
|
||||
(TheGpuISA::RawMachInst*)pkt->getPtr<uint8_t>();
|
||||
|
||||
assert(wavefront->instructionBuffer.size() <= 4);
|
||||
|
||||
for (int i = 0; i < pkt->req->getSize() /
|
||||
sizeof(TheGpuISA::RawMachInst); ++i) {
|
||||
GPUStaticInst *inst_ptr = decoder.decode(inst_index_ptr[i]);
|
||||
|
||||
assert(inst_ptr);
|
||||
DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: added %s\n",
|
||||
computeUnit->cu_id, wavefront->simdId,
|
||||
wavefront->wfSlotId, inst_ptr->disassemble());
|
||||
|
||||
GPUDynInstPtr gpuDynInst =
|
||||
std::make_shared<GPUDynInst>(computeUnit, wavefront, inst_ptr,
|
||||
computeUnit->getAndIncSeqNum());
|
||||
|
||||
wavefront->instructionBuffer.push_back(gpuDynInst);
|
||||
}
|
||||
}
|
||||
|
||||
wavefront->pendingFetch = false;
|
||||
|
||||
delete pkt->senderState;
|
||||
delete pkt->req;
|
||||
delete pkt;
|
||||
}
|
||||
|
||||
void
|
||||
FetchUnit::bindWaveList(std::vector<Wavefront*> *wave_list)
|
||||
{
|
||||
waveList = wave_list;
|
||||
}
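initiateFetch() above never lets an instruction fetch cross a cache line: the request is shrunk so it ends at the next line boundary. A small worked example of that size calculation follows, assuming a 64-byte block; the addresses are illustrative.

#include <cstdint>
#include <iostream>

// Round addr down to a multiple of align (power of two), as roundDown() does.
static uint64_t
roundDown(uint64_t addr, uint64_t align)
{
    return addr & ~(align - 1);
}

int main()
{
    const uint64_t blockSize = 64;      // assumed cache block size in bytes
    const uint64_t vaddr = 0x1000 + 40; // fetch starts 40 bytes into a line

    // Same computation as FetchUnit::initiateFetch(): find the start of the
    // next line, then shrink the request so it ends at that boundary.
    uint64_t splitAddr = roundDown(vaddr + blockSize - 1, blockSize);
    uint64_t size = blockSize;
    if (splitAddr > vaddr)
        size = splitAddr - vaddr;       // only the rest of the current line

    std::cout << "fetching " << size << " bytes from 0x"
              << std::hex << vaddr << "\n";
    // Prints: fetching 24 bytes from 0x1028
    return 0;
}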
89
src/gpu-compute/fetch_unit.hh
Normal file
@@ -0,0 +1,89 @@
/*
|
||||
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Brad Beckmann, Sooraj Puthoor
|
||||
*/
|
||||
|
||||
#ifndef __FETCH_UNIT_HH__
|
||||
#define __FETCH_UNIT_HH__
|
||||
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arch/gpu_decoder.hh"
|
||||
#include "base/statistics.hh"
|
||||
#include "config/the_gpu_isa.hh"
|
||||
#include "gpu-compute/scheduler.hh"
|
||||
#include "mem/packet.hh"
|
||||
|
||||
class ComputeUnit;
|
||||
class Wavefront;
|
||||
|
||||
class FetchUnit
|
||||
{
|
||||
public:
|
||||
FetchUnit(const ComputeUnitParams* params);
|
||||
~FetchUnit();
|
||||
void init(ComputeUnit *cu);
|
||||
void exec();
|
||||
void bindWaveList(std::vector<Wavefront*> *list);
|
||||
void initiateFetch(Wavefront *wavefront);
|
||||
void fetch(PacketPtr pkt, Wavefront *wavefront);
|
||||
void processFetchReturn(PacketPtr pkt);
|
||||
static uint32_t globalFetchUnitID;
|
||||
|
||||
private:
|
||||
bool timingSim;
|
||||
ComputeUnit *computeUnit;
|
||||
TheGpuISA::Decoder decoder;
|
||||
|
||||
// Fetch scheduler; Selects one wave from
|
||||
// the fetch queue for instruction fetching.
|
||||
// The selection is made according to
|
||||
// a scheduling policy
|
||||
Scheduler fetchScheduler;
|
||||
|
||||
// Stores the list of waves that are
|
||||
// ready to be fetched this cycle
|
||||
std::vector<Wavefront*> fetchQueue;
|
||||
|
||||
// Stores the fetch status of all waves dispatched to this SIMD.
|
||||
// TRUE implies the wave is ready to fetch and is already
|
||||
// moved to fetchQueue
|
||||
std::vector<std::pair<Wavefront*, bool>> fetchStatusQueue;
|
||||
|
||||
// Pointer to list of waves dispatched on to this SIMD unit
|
||||
std::vector<Wavefront*> *waveList;
|
||||
};
|
||||
|
||||
#endif // __FETCH_UNIT_HH__
|
242
src/gpu-compute/global_memory_pipeline.cc
Normal file
@@ -0,0 +1,242 @@
/*
|
||||
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: John Kalamatianos, Sooraj Puthoor
|
||||
*/
|
||||
|
||||
#include "gpu-compute/global_memory_pipeline.hh"
|
||||
|
||||
#include "debug/GPUMem.hh"
|
||||
#include "debug/GPUReg.hh"
|
||||
#include "gpu-compute/compute_unit.hh"
|
||||
#include "gpu-compute/gpu_dyn_inst.hh"
|
||||
#include "gpu-compute/shader.hh"
|
||||
#include "gpu-compute/vector_register_file.hh"
|
||||
#include "gpu-compute/wavefront.hh"
|
||||
|
||||
GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) :
|
||||
computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size),
|
||||
inflightStores(0), inflightLoads(0)
|
||||
{
|
||||
}
|
||||
|
||||
void
|
||||
GlobalMemPipeline::init(ComputeUnit *cu)
|
||||
{
|
||||
computeUnit = cu;
|
||||
globalMemSize = computeUnit->shader->globalMemSize;
|
||||
_name = computeUnit->name() + ".GlobalMemPipeline";
|
||||
}
|
||||
|
||||
void
|
||||
GlobalMemPipeline::exec()
|
||||
{
|
||||
// apply any returned global memory operations
|
||||
GPUDynInstPtr m = !gmReturnedLoads.empty() ? gmReturnedLoads.front() :
|
||||
!gmReturnedStores.empty() ? gmReturnedStores.front() : nullptr;
|
||||
|
||||
bool accessVrf = true;
|
||||
// check the VRF to see if the operands of a load (or load component
|
||||
// of an atomic) are accessible
|
||||
if ((m) && (m->m_op==Enums::MO_LD || MO_A(m->m_op))) {
|
||||
Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
|
||||
|
||||
accessVrf =
|
||||
w->computeUnit->vrf[m->simdId]->
|
||||
vrfOperandAccessReady(m->seqNum(), w, m,
|
||||
VrfAccessType::WRITE);
|
||||
}
|
||||
|
||||
if ((!gmReturnedStores.empty() || !gmReturnedLoads.empty()) &&
|
||||
m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
|
||||
accessVrf && m->statusBitVector == VectorMask(0) &&
|
||||
(computeUnit->shader->coissue_return ||
|
||||
computeUnit->wfWait.at(m->pipeId).rdy())) {
|
||||
|
||||
if (m->v_type == VT_32 && m->m_type == Enums::M_U8)
|
||||
doGmReturn<uint32_t, uint8_t>(m);
|
||||
else if (m->v_type == VT_32 && m->m_type == Enums::M_U16)
|
||||
doGmReturn<uint32_t, uint16_t>(m);
|
||||
else if (m->v_type == VT_32 && m->m_type == Enums::M_U32)
|
||||
doGmReturn<uint32_t, uint32_t>(m);
|
||||
else if (m->v_type == VT_32 && m->m_type == Enums::M_S8)
|
||||
doGmReturn<int32_t, int8_t>(m);
|
||||
else if (m->v_type == VT_32 && m->m_type == Enums::M_S16)
|
||||
doGmReturn<int32_t, int16_t>(m);
|
||||
else if (m->v_type == VT_32 && m->m_type == Enums::M_S32)
|
||||
doGmReturn<int32_t, int32_t>(m);
|
||||
else if (m->v_type == VT_32 && m->m_type == Enums::M_F16)
|
||||
doGmReturn<float, Float16>(m);
|
||||
else if (m->v_type == VT_32 && m->m_type == Enums::M_F32)
|
||||
doGmReturn<float, float>(m);
|
||||
else if (m->v_type == VT_64 && m->m_type == Enums::M_U8)
|
||||
doGmReturn<uint64_t, uint8_t>(m);
|
||||
else if (m->v_type == VT_64 && m->m_type == Enums::M_U16)
|
||||
doGmReturn<uint64_t, uint16_t>(m);
|
||||
else if (m->v_type == VT_64 && m->m_type == Enums::M_U32)
|
||||
doGmReturn<uint64_t, uint32_t>(m);
|
||||
else if (m->v_type == VT_64 && m->m_type == Enums::M_U64)
|
||||
doGmReturn<uint64_t, uint64_t>(m);
|
||||
else if (m->v_type == VT_64 && m->m_type == Enums::M_S8)
|
||||
doGmReturn<int64_t, int8_t>(m);
|
||||
else if (m->v_type == VT_64 && m->m_type == Enums::M_S16)
|
||||
doGmReturn<int64_t, int16_t>(m);
|
||||
else if (m->v_type == VT_64 && m->m_type == Enums::M_S32)
|
||||
doGmReturn<int64_t, int32_t>(m);
|
||||
else if (m->v_type == VT_64 && m->m_type == Enums::M_S64)
|
||||
doGmReturn<int64_t, int64_t>(m);
|
||||
else if (m->v_type == VT_64 && m->m_type == Enums::M_F16)
|
||||
doGmReturn<double, Float16>(m);
|
||||
else if (m->v_type == VT_64 && m->m_type == Enums::M_F32)
|
||||
doGmReturn<double, float>(m);
|
||||
else if (m->v_type == VT_64 && m->m_type == Enums::M_F64)
|
||||
doGmReturn<double, double>(m);
|
||||
}
|
||||
|
||||
// If pipeline has executed a global memory instruction
|
||||
// execute global memory packets and issue global
|
||||
// memory packets to DTLB
|
||||
if (!gmIssuedRequests.empty()) {
|
||||
GPUDynInstPtr mp = gmIssuedRequests.front();
|
||||
if (mp->m_op == Enums::MO_LD ||
|
||||
(mp->m_op >= Enums::MO_AAND && mp->m_op <= Enums::MO_AMIN) ||
|
||||
(mp->m_op >= Enums::MO_ANRAND && mp->m_op <= Enums::MO_ANRMIN)) {
|
||||
|
||||
if (inflightLoads >= gmQueueSize) {
|
||||
return;
|
||||
} else {
|
||||
++inflightLoads;
|
||||
}
|
||||
} else {
|
||||
if (inflightStores >= gmQueueSize) {
|
||||
return;
|
||||
} else {
|
||||
++inflightStores;
|
||||
}
|
||||
}
|
||||
|
||||
mp->initiateAcc(mp);
|
||||
gmIssuedRequests.pop();
|
||||
|
||||
DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = %s\n",
|
||||
computeUnit->cu_id, mp->simdId, mp->wfSlotId,
|
||||
Enums::MemOpTypeStrings[mp->m_op]);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename c0, typename c1>
|
||||
void
|
||||
GlobalMemPipeline::doGmReturn(GPUDynInstPtr m)
|
||||
{
|
||||
Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
|
||||
|
||||
// Return data to registers
|
||||
if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) {
|
||||
gmReturnedLoads.pop();
|
||||
assert(inflightLoads > 0);
|
||||
--inflightLoads;
|
||||
|
||||
if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) {
|
||||
std::vector<uint32_t> regVec;
|
||||
// iterate over number of destination register operands since
|
||||
// this is a load or atomic operation
|
||||
for (int k = 0; k < m->n_reg; ++k) {
|
||||
assert((sizeof(c1) * m->n_reg) <= MAX_WIDTH_FOR_MEM_INST);
|
||||
int dst = m->dst_reg + k;
|
||||
|
||||
if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
|
||||
dst = m->dst_reg_vec[k];
|
||||
// virtual->physical VGPR mapping
|
||||
int physVgpr = w->remap(dst, sizeof(c0), 1);
|
||||
// save the physical VGPR index
|
||||
regVec.push_back(physVgpr);
|
||||
c1 *p1 = &((c1*)m->d_data)[k * VSZ];
|
||||
|
||||
for (int i = 0; i < VSZ; ++i) {
|
||||
if (m->exec_mask[i]) {
|
||||
DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
|
||||
"$%s%d <- %d global ld done (src = wavefront "
|
||||
"ld inst)\n", w->computeUnit->cu_id, w->simdId,
|
||||
w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d",
|
||||
dst, *p1);
|
||||
// write the value into the physical VGPR. This is a
|
||||
// purely functional operation. No timing is modeled.
|
||||
w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
|
||||
*p1, i);
|
||||
}
|
||||
++p1;
|
||||
}
|
||||
}
|
||||
|
||||
// Schedule the write operation of the load data on the VRF.
|
||||
// This simply models the timing aspect of the VRF write operation.
|
||||
// It does not modify the physical VGPR.
|
||||
loadVrfBankConflictCycles +=
|
||||
w->computeUnit->vrf[w->simdId]->exec(m->seqNum(),
|
||||
w, regVec, sizeof(c0),
|
||||
m->time);
|
||||
}
|
||||
} else {
|
||||
gmReturnedStores.pop();
|
||||
assert(inflightStores > 0);
|
||||
--inflightStores;
|
||||
}
|
||||
|
||||
// Decrement outstanding register count
|
||||
computeUnit->shader->ScheduleAdd(&w->outstanding_reqs, m->time, -1);
|
||||
|
||||
if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op) ||
|
||||
MO_H(m->m_op)) {
|
||||
computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_wr_gm, m->time,
|
||||
-1);
|
||||
}
|
||||
|
||||
if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) {
|
||||
computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_rd_gm, m->time,
|
||||
-1);
|
||||
}
|
||||
|
||||
// Mark write bus busy for appropriate amount of time
|
||||
computeUnit->glbMemToVrfBus.set(m->time);
|
||||
if (!computeUnit->shader->coissue_return)
|
||||
w->computeUnit->wfWait.at(m->pipeId).set(m->time);
|
||||
}
|
||||
|
||||
void
|
||||
GlobalMemPipeline::regStats()
|
||||
{
|
||||
loadVrfBankConflictCycles
|
||||
.name(name() + ".load_vrf_bank_conflict_cycles")
|
||||
.desc("total number of cycles GM data are delayed before updating "
|
||||
"the VRF")
|
||||
;
|
||||
}
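exec() above applies back-pressure by comparing the in-flight load or store count against the configured queue size before issuing another request. The sketch below reproduces that check with plain counters and an illustrative queue of load ids; gmQueueSize here is an assumed value, not a gem5 default.

#include <iostream>
#include <queue>

int main()
{
    // Illustrative version of the back-pressure check in
    // GlobalMemPipeline::exec(): a request is only issued if the matching
    // in-flight counter is still below the queue size.
    const int gmQueueSize = 2;          // assumed global_mem_queue_size
    int inflightLoads = 0;

    std::queue<int> issuedRequests;     // pretend these are load requests
    for (int id = 0; id < 4; ++id)
        issuedRequests.push(id);

    while (!issuedRequests.empty()) {
        if (inflightLoads >= gmQueueSize) {
            std::cout << "stall: " << issuedRequests.size()
                      << " loads still queued\n";
            break;                      // the real pipeline just returns this cycle
        }
        ++inflightLoads;
        std::cout << "issued load " << issuedRequests.front() << "\n";
        issuedRequests.pop();
    }
    return 0;
}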
123
src/gpu-compute/global_memory_pipeline.hh
Normal file
@@ -0,0 +1,123 @@
/*
|
||||
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: John Kalamatianos, Sooraj Puthoor
|
||||
*/
|
||||
|
||||
#ifndef __GLOBAL_MEMORY_PIPELINE_HH__
|
||||
#define __GLOBAL_MEMORY_PIPELINE_HH__
|
||||
|
||||
#include <queue>
|
||||
#include <string>
|
||||
|
||||
#include "gpu-compute/misc.hh"
|
||||
#include "params/ComputeUnit.hh"
|
||||
#include "sim/stats.hh"
|
||||
|
||||
/*
|
||||
* @file global_memory_pipeline.hh
|
||||
*
|
||||
* The global memory pipeline issues newly created global memory packets
|
||||
* from the pipeline to DTLB. The exec() method of the memory packet issues
|
||||
* the packet to the DTLB if there is space available in the return fifo.
|
||||
* This stage also retires previously issued loads and stores that have
|
||||
* returned from the memory sub-system.
|
||||
*/
|
||||
|
||||
class ComputeUnit;
|
||||
|
||||
class GlobalMemPipeline
|
||||
{
|
||||
public:
|
||||
GlobalMemPipeline(const ComputeUnitParams *params);
|
||||
void init(ComputeUnit *cu);
|
||||
void exec();
|
||||
|
||||
template<typename c0, typename c1> void doGmReturn(GPUDynInstPtr m);
|
||||
|
||||
std::queue<GPUDynInstPtr> &getGMReqFIFO() { return gmIssuedRequests; }
|
||||
std::queue<GPUDynInstPtr> &getGMStRespFIFO() { return gmReturnedStores; }
|
||||
std::queue<GPUDynInstPtr> &getGMLdRespFIFO() { return gmReturnedLoads; }
|
||||
|
||||
bool
|
||||
isGMLdRespFIFOWrRdy() const
|
||||
{
|
||||
return gmReturnedLoads.size() < gmQueueSize;
|
||||
}
|
||||
|
||||
bool
|
||||
isGMStRespFIFOWrRdy() const
|
||||
{
|
||||
return gmReturnedStores.size() < gmQueueSize;
|
||||
}
|
||||
|
||||
bool
|
||||
isGMReqFIFOWrRdy(uint32_t pendReqs=0) const
|
||||
{
|
||||
return (gmIssuedRequests.size() + pendReqs) < gmQueueSize;
|
||||
}
|
||||
|
||||
const std::string &name() const { return _name; }
|
||||
void regStats();
|
||||
|
||||
private:
|
||||
ComputeUnit *computeUnit;
|
||||
std::string _name;
|
||||
int gmQueueSize;
|
||||
|
||||
// number of cycles of delaying the update of a VGPR that is the
|
||||
// target of a load instruction (or the load component of an atomic)
|
||||
// The delay is due to VRF bank conflicts
|
||||
Stats::Scalar loadVrfBankConflictCycles;
|
||||
// Counters to track the inflight loads and stores
|
||||
// so that we can provide the proper backpressure
|
||||
// on the number of inflight memory operations.
|
||||
int inflightStores;
|
||||
int inflightLoads;
|
||||
|
||||
// The size of global memory.
|
||||
int globalMemSize;
|
||||
|
||||
// Global Memory Request FIFO: all global memory requests
|
||||
// are issued to this FIFO from the memory pipelines
|
||||
std::queue<GPUDynInstPtr> gmIssuedRequests;
|
||||
|
||||
    // Global Store Response FIFO: all responses of global memory
|
||||
// stores are sent to this FIFO from TCP
|
||||
std::queue<GPUDynInstPtr> gmReturnedStores;
|
||||
|
||||
// Global Load Response FIFO: all responses of global memory
|
||||
// loads are sent to this FIFO from TCP
|
||||
std::queue<GPUDynInstPtr> gmReturnedLoads;
|
||||
};
|
||||
|
||||
#endif // __GLOBAL_MEMORY_PIPELINE_HH__
|
198
src/gpu-compute/gpu_dyn_inst.cc
Normal file
@@ -0,0 +1,198 @@
/*
|
||||
* Copyright (c) 2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Anthony Gutierrez
|
||||
*/
|
||||
|
||||
#include "gpu-compute/gpu_dyn_inst.hh"
|
||||
|
||||
#include "debug/GPUMem.hh"
|
||||
#include "gpu-compute/gpu_static_inst.hh"
|
||||
#include "gpu-compute/shader.hh"
|
||||
#include "gpu-compute/wavefront.hh"
|
||||
|
||||
GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf,
|
||||
GPUStaticInst *_staticInst, uint64_t instSeqNum)
|
||||
: GPUExecContext(_cu, _wf), m_op(Enums::MO_UNDEF),
|
||||
memoryOrder(Enums::MEMORY_ORDER_NONE), useContinuation(false),
|
||||
statusBitVector(0), staticInst(_staticInst), _seqNum(instSeqNum)
|
||||
{
|
||||
tlbHitLevel.assign(VSZ, -1);
|
||||
}
|
||||
|
||||
void
|
||||
GPUDynInst::execute()
|
||||
{
|
||||
GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(cu, wf, staticInst,
|
||||
_seqNum);
|
||||
staticInst->execute(gpuDynInst);
|
||||
}
|
||||
|
||||
int
|
||||
GPUDynInst::numSrcRegOperands()
|
||||
{
|
||||
return staticInst->numSrcRegOperands();
|
||||
}
|
||||
|
||||
int
|
||||
GPUDynInst::numDstRegOperands()
|
||||
{
|
||||
return staticInst->numDstRegOperands();
|
||||
}
|
||||
|
||||
int
|
||||
GPUDynInst::getNumOperands()
|
||||
{
|
||||
return staticInst->getNumOperands();
|
||||
}
|
||||
|
||||
bool
|
||||
GPUDynInst::isVectorRegister(int operandIdx)
|
||||
{
|
||||
return staticInst->isVectorRegister(operandIdx);
|
||||
}
|
||||
|
||||
bool
|
||||
GPUDynInst::isScalarRegister(int operandIdx)
|
||||
{
|
||||
    return staticInst->isScalarRegister(operandIdx);
|
||||
}
|
||||
|
||||
int
|
||||
GPUDynInst::getRegisterIndex(int operandIdx)
|
||||
{
|
||||
return staticInst->getRegisterIndex(operandIdx);
|
||||
}
|
||||
|
||||
int
|
||||
GPUDynInst::getOperandSize(int operandIdx)
|
||||
{
|
||||
return staticInst->getOperandSize(operandIdx);
|
||||
}
|
||||
|
||||
bool
|
||||
GPUDynInst::isDstOperand(int operandIdx)
|
||||
{
|
||||
return staticInst->isDstOperand(operandIdx);
|
||||
}
|
||||
|
||||
bool
|
||||
GPUDynInst::isSrcOperand(int operandIdx)
|
||||
{
|
||||
return staticInst->isSrcOperand(operandIdx);
|
||||
}
|
||||
|
||||
bool
|
||||
GPUDynInst::isArgLoad()
|
||||
{
|
||||
return staticInst->isArgLoad();
|
||||
}
|
||||
|
||||
const std::string&
|
||||
GPUDynInst::disassemble() const
|
||||
{
|
||||
return staticInst->disassemble();
|
||||
}
|
||||
|
||||
uint64_t
|
||||
GPUDynInst::seqNum() const
|
||||
{
|
||||
return _seqNum;
|
||||
}
|
||||
|
||||
Enums::OpType
|
||||
GPUDynInst::opType()
|
||||
{
|
||||
return staticInst->o_type;
|
||||
}
|
||||
|
||||
Enums::StorageClassType
|
||||
GPUDynInst::executedAs()
|
||||
{
|
||||
return staticInst->executed_as;
|
||||
}
|
||||
|
||||
// Process a memory instruction and (if necessary) submit timing request
|
||||
void
|
||||
GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector=%#x\n",
|
||||
cu->cu_id, simdId, wfSlotId, exec_mask);
|
||||
|
||||
staticInst->initiateAcc(gpuDynInst);
|
||||
time = 0;
|
||||
}
|
||||
|
||||
bool
|
||||
GPUDynInst::scalarOp() const
|
||||
{
|
||||
return staticInst->scalarOp();
|
||||
}
|
||||
|
||||
void
|
||||
GPUDynInst::updateStats()
|
||||
{
|
||||
if (staticInst->isLocalMem()) {
|
||||
// access to LDS (shared) memory
|
||||
cu->dynamicLMemInstrCnt++;
|
||||
} else {
|
||||
// access to global memory
|
||||
|
||||
// update PageDivergence histogram
|
||||
int number_pages_touched = cu->pagesTouched.size();
|
||||
assert(number_pages_touched);
|
||||
cu->pageDivergenceDist.sample(number_pages_touched);
|
||||
|
||||
std::pair<ComputeUnit::pageDataStruct::iterator, bool> ret;
|
||||
|
||||
for (auto it : cu->pagesTouched) {
|
||||
// see if this page has been touched before. if not, this also
|
||||
// inserts the page into the table.
|
||||
ret = cu->pageAccesses
|
||||
.insert(ComputeUnit::pageDataStruct::value_type(it.first,
|
||||
std::make_pair(1, it.second)));
|
||||
|
||||
// if yes, then update the stats
|
||||
if (!ret.second) {
|
||||
ret.first->second.first++;
|
||||
ret.first->second.second += it.second;
|
||||
}
|
||||
}
|
||||
|
||||
cu->pagesTouched.clear();
|
||||
|
||||
// total number of memory instructions (dynamic)
|
||||
// Atomics are counted as a single memory instruction.
|
||||
// this is # memory instructions per wavefront, not per workitem
|
||||
cu->dynamicGMemInstrCnt++;
|
||||
}
|
||||
}
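// A minimal sketch of the insert-or-update idiom used for cu->pageAccesses
// above (illustrative only; page_addr and bytes are hypothetical locals):
// std::map::insert() returns an <iterator, bool> pair whose bool is false
// when the key already existed, in which case the stored counters are
// bumped instead of inserting a fresh entry.
//
//     std::map<Addr, std::pair<int, int>> page_accesses;
//     auto ret = page_accesses.insert({page_addr, std::make_pair(1, bytes)});
//     if (!ret.second) {
//         ret.first->second.first++;          // touches of this page
//         ret.first->second.second += bytes;  // bytes accessed on this page
//     }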
464 src/gpu-compute/gpu_dyn_inst.hh Normal file
@@ -0,0 +1,464 @@
/*
|
||||
* Copyright (c) 2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Anthony Gutierrez
|
||||
*/
|
||||
|
||||
#ifndef __GPU_DYN_INST_HH__
|
||||
#define __GPU_DYN_INST_HH__
|
||||
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
|
||||
#include "enums/GenericMemoryOrder.hh"
|
||||
#include "enums/GenericMemoryScope.hh"
|
||||
#include "enums/MemOpType.hh"
|
||||
#include "enums/MemType.hh"
|
||||
#include "enums/OpType.hh"
|
||||
#include "enums/StorageClassType.hh"
|
||||
#include "gpu-compute/compute_unit.hh"
|
||||
#include "gpu-compute/gpu_exec_context.hh"
|
||||
|
||||
class GPUStaticInst;
|
||||
|
||||
template<typename T>
|
||||
class AtomicOpAnd : public TypedAtomicOpFunctor<T>
|
||||
{
|
||||
public:
|
||||
T a;
|
||||
|
||||
AtomicOpAnd(T _a) : a(_a) { }
|
||||
void execute(T *b) { *b &= a; }
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
class AtomicOpOr : public TypedAtomicOpFunctor<T>
|
||||
{
|
||||
public:
|
||||
T a;
|
||||
AtomicOpOr(T _a) : a(_a) { }
|
||||
void execute(T *b) { *b |= a; }
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
class AtomicOpXor : public TypedAtomicOpFunctor<T>
|
||||
{
|
||||
public:
|
||||
T a;
|
||||
AtomicOpXor(T _a) : a(_a) {}
|
||||
void execute(T *b) { *b ^= a; }
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
|
||||
{
|
||||
public:
|
||||
T c;
|
||||
T s;
|
||||
|
||||
ComputeUnit *computeUnit;
|
||||
|
||||
AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
|
||||
: c(_c), s(_s), computeUnit(compute_unit) { }
|
||||
|
||||
void
|
||||
execute(T *b)
|
||||
{
|
||||
computeUnit->numCASOps++;
|
||||
|
||||
if (*b == c) {
|
||||
*b = s;
|
||||
} else {
|
||||
computeUnit->numFailedCASOps++;
|
||||
}
|
||||
|
||||
if (computeUnit->xact_cas_mode) {
|
||||
computeUnit->xactCasLoadMap.clear();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
class AtomicOpExch : public TypedAtomicOpFunctor<T>
|
||||
{
|
||||
public:
|
||||
T a;
|
||||
AtomicOpExch(T _a) : a(_a) { }
|
||||
void execute(T *b) { *b = a; }
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
class AtomicOpAdd : public TypedAtomicOpFunctor<T>
|
||||
{
|
||||
public:
|
||||
T a;
|
||||
AtomicOpAdd(T _a) : a(_a) { }
|
||||
void execute(T *b) { *b += a; }
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
class AtomicOpSub : public TypedAtomicOpFunctor<T>
|
||||
{
|
||||
public:
|
||||
T a;
|
||||
AtomicOpSub(T _a) : a(_a) { }
|
||||
void execute(T *b) { *b -= a; }
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
class AtomicOpInc : public TypedAtomicOpFunctor<T>
|
||||
{
|
||||
public:
|
||||
AtomicOpInc() { }
|
||||
void execute(T *b) { *b += 1; }
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
class AtomicOpDec : public TypedAtomicOpFunctor<T>
|
||||
{
|
||||
public:
|
||||
AtomicOpDec() {}
|
||||
void execute(T *b) { *b -= 1; }
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
class AtomicOpMax : public TypedAtomicOpFunctor<T>
|
||||
{
|
||||
public:
|
||||
T a;
|
||||
AtomicOpMax(T _a) : a(_a) { }
|
||||
|
||||
void
|
||||
execute(T *b)
|
||||
{
|
||||
if (a > *b)
|
||||
*b = a;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
class AtomicOpMin : public TypedAtomicOpFunctor<T>
|
||||
{
|
||||
public:
|
||||
T a;
|
||||
AtomicOpMin(T _a) : a(_a) {}
|
||||
|
||||
void
|
||||
execute(T *b)
|
||||
{
|
||||
if (a < *b)
|
||||
*b = a;
|
||||
}
|
||||
};
|
||||
|
||||
#define MO_A(a) ((a)>=Enums::MO_AAND && (a)<=Enums::MO_AMIN)
|
||||
#define MO_ANR(a) ((a)>=Enums::MO_ANRAND && (a)<=Enums::MO_ANRMIN)
|
||||
#define MO_H(a) ((a)>=Enums::MO_HAND && (a)<=Enums::MO_HMIN)
|
||||
|
||||
typedef enum
|
||||
{
|
||||
VT_32,
|
||||
VT_64,
|
||||
} vgpr_type;
|
||||
|
||||
typedef enum
|
||||
{
|
||||
SEG_PRIVATE,
|
||||
SEG_SPILL,
|
||||
SEG_GLOBAL,
|
||||
SEG_SHARED,
|
||||
SEG_READONLY,
|
||||
SEG_FLAT
|
||||
} seg_type;
|
||||
|
||||
class GPUDynInst : public GPUExecContext
|
||||
{
|
||||
public:
|
||||
GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst,
|
||||
uint64_t instSeqNum);
|
||||
|
||||
void execute();
|
||||
int numSrcRegOperands();
|
||||
int numDstRegOperands();
|
||||
int getNumOperands();
|
||||
bool isVectorRegister(int operandIdx);
|
||||
bool isScalarRegister(int operandIdx);
|
||||
int getRegisterIndex(int operandIdx);
|
||||
int getOperandSize(int operandIdx);
|
||||
bool isDstOperand(int operandIdx);
|
||||
bool isSrcOperand(int operandIdx);
|
||||
bool isArgLoad();
|
||||
|
||||
const std::string &disassemble() const;
|
||||
|
||||
uint64_t seqNum() const;
|
||||
|
||||
Enums::OpType opType();
|
||||
Enums::StorageClassType executedAs();
|
||||
|
||||
// The address of the memory operation
|
||||
Addr addr[VSZ];
|
||||
Addr pAddr;
|
||||
|
||||
// The data to get written
|
||||
uint8_t d_data[VSZ * 16];
|
||||
// Additional data (for atomics)
|
||||
uint8_t a_data[VSZ * 8];
|
||||
// Additional data (for atomics)
|
||||
uint8_t x_data[VSZ * 8];
|
||||
// The execution mask
|
||||
VectorMask exec_mask;
|
||||
|
||||
// The memory type (M_U32, M_S32, ...)
|
||||
Enums::MemType m_type;
|
||||
// The memory operation (MO_LD, MO_ST, ...)
|
||||
Enums::MemOpType m_op;
|
||||
Enums::GenericMemoryOrder memoryOrder;
|
||||
|
||||
// Scope of the request
|
||||
Enums::GenericMemoryScope scope;
|
||||
// The memory segment (SEG_SHARED, SEG_GLOBAL, ...)
|
||||
seg_type s_type;
|
||||
// The equivalency class
|
||||
int equiv;
|
||||
// The return VGPR type (VT_32 or VT_64)
|
||||
vgpr_type v_type;
|
||||
// Number of VGPR's accessed (1, 2, or 4)
|
||||
int n_reg;
|
||||
// The return VGPR index
|
||||
int dst_reg;
|
||||
// There can be at most 4 dest regs
|
||||
int dst_reg_vec[4];
|
||||
// SIMD where the WF of the memory instruction has been mapped to
|
||||
int simdId;
|
||||
// unique id of the WF where the memory instruction belongs to
|
||||
int wfDynId;
|
||||
// The kernel id of the requesting wf
|
||||
int kern_id;
|
||||
// The CU id of the requesting wf
|
||||
int cu_id;
|
||||
// HW slot id where the WF is mapped to inside a SIMD unit
|
||||
int wfSlotId;
|
||||
// execution pipeline id where the memory instruction has been scheduled
|
||||
int pipeId;
|
||||
// The execution time of this operation
|
||||
Tick time;
|
||||
// The latency of this operation
|
||||
WaitClass latency;
|
||||
// A list of bank conflicts for the 4 cycles.
|
||||
uint32_t bc[4];
|
||||
|
||||
// A pointer to ROM
|
||||
uint8_t *rom;
|
||||
// The size of the READONLY segment
|
||||
int sz_rom;
|
||||
|
||||
// Initiate the specified memory operation, by creating a
|
||||
// memory request and sending it off to the memory system.
|
||||
void initiateAcc(GPUDynInstPtr gpuDynInst);
|
||||
|
||||
void updateStats();
|
||||
|
||||
GPUStaticInst* staticInstruction() { return staticInst; }
|
||||
|
||||
// Is the instruction a scalar or vector op?
|
||||
bool scalarOp() const;
|
||||
|
||||
/*
|
||||
* Loads/stores/atomics may have acquire/release semantics associated
|
||||
* with them. Some protocols want to see the acquire/release as separate
|
||||
* requests from the load/store/atomic. We implement that separation
|
||||
* using continuations (i.e., a function pointer with an object associated
|
||||
* with it). When, for example, the front-end generates a store with
|
||||
* release semantics, we will first issue a normal store and set the
|
||||
* continuation in the GPUDynInst to a function that generates a
|
||||
* release request. That continuation will be called when the normal
|
||||
* store completes (in ComputeUnit::DataPort::recvTimingResponse). The
|
||||
* continuation will be called in the context of the same GPUDynInst
|
||||
* that generated the initial store.
|
||||
*/
|
||||
std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation;
|
||||
|
||||
// when true, call execContinuation when response arrives
|
||||
bool useContinuation;
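// A minimal sketch of how the continuation hook above might be used for a
// store with release semantics (hedged; issueReleaseRequest() is a
// hypothetical helper, not part of this interface):
//
//     gpuDynInst->useContinuation = true;
//     gpuDynInst->execContinuation =
//         [](GPUStaticInst *si, GPUDynInstPtr di) {
//             issueReleaseRequest(si, di); // runs once the store completes
//         };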
|
||||
|
||||
template<typename c0> AtomicOpFunctor*
|
||||
makeAtomicOpFunctor(c0 *reg0, c0 *reg1, Enums::MemOpType op)
|
||||
{
|
||||
using namespace Enums;
|
||||
|
||||
switch(op) {
|
||||
case MO_AAND:
|
||||
case MO_ANRAND:
|
||||
return new AtomicOpAnd<c0>(*reg0);
|
||||
case MO_AOR:
|
||||
case MO_ANROR:
|
||||
return new AtomicOpOr<c0>(*reg0);
|
||||
case MO_AXOR:
|
||||
case MO_ANRXOR:
|
||||
return new AtomicOpXor<c0>(*reg0);
|
||||
case MO_ACAS:
|
||||
case MO_ANRCAS:
|
||||
return new AtomicOpCAS<c0>(*reg0, *reg1, cu);
|
||||
case MO_AEXCH:
|
||||
case MO_ANREXCH:
|
||||
return new AtomicOpExch<c0>(*reg0);
|
||||
case MO_AADD:
|
||||
case MO_ANRADD:
|
||||
return new AtomicOpAdd<c0>(*reg0);
|
||||
case MO_ASUB:
|
||||
case MO_ANRSUB:
|
||||
return new AtomicOpSub<c0>(*reg0);
|
||||
case MO_AINC:
|
||||
case MO_ANRINC:
|
||||
return new AtomicOpInc<c0>();
|
||||
case MO_ADEC:
|
||||
case MO_ANRDEC:
|
||||
return new AtomicOpDec<c0>();
|
||||
case MO_AMAX:
|
||||
case MO_ANRMAX:
|
||||
return new AtomicOpMax<c0>(*reg0);
|
||||
case MO_AMIN:
|
||||
case MO_ANRMIN:
|
||||
return new AtomicOpMin<c0>(*reg0);
|
||||
default:
|
||||
panic("Unrecognized atomic operation");
|
||||
}
|
||||
}
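// Example use of the atomic functors (a sketch; the locals are
// hypothetical): makeAtomicOpFunctor() above selects one of the AtomicOp*
// classes based on the Enums::MemOpType, and the chosen functor applies its
// operand when executed against the target location.
//
//     uint32_t delta = 4, mem = 10;
//     AtomicOpAdd<uint32_t> add(delta);
//     add.execute(&mem);   // mem is now 14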
|
||||
|
||||
void
|
||||
setRequestFlags(Request *req, bool setMemOrder=true)
|
||||
{
|
||||
// currently these are the easy scopes to deduce
|
||||
switch (s_type) {
|
||||
case SEG_PRIVATE:
|
||||
req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT);
|
||||
break;
|
||||
case SEG_SPILL:
|
||||
req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT);
|
||||
break;
|
||||
case SEG_GLOBAL:
|
||||
req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
|
||||
break;
|
||||
case SEG_READONLY:
|
||||
req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT);
|
||||
break;
|
||||
case SEG_SHARED:
|
||||
req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT);
|
||||
break;
|
||||
case SEG_FLAT:
|
||||
// TODO: translate to correct scope
|
||||
assert(false);
|
||||
default:
|
||||
panic("Bad segment type");
|
||||
break;
|
||||
}
|
||||
|
||||
switch (scope) {
|
||||
case Enums::MEMORY_SCOPE_NONE:
|
||||
case Enums::MEMORY_SCOPE_WORKITEM:
|
||||
break;
|
||||
case Enums::MEMORY_SCOPE_WAVEFRONT:
|
||||
req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
|
||||
Request::WAVEFRONT_SCOPE);
|
||||
break;
|
||||
case Enums::MEMORY_SCOPE_WORKGROUP:
|
||||
req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
|
||||
Request::WORKGROUP_SCOPE);
|
||||
break;
|
||||
case Enums::MEMORY_SCOPE_DEVICE:
|
||||
req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
|
||||
Request::DEVICE_SCOPE);
|
||||
break;
|
||||
case Enums::MEMORY_SCOPE_SYSTEM:
|
||||
req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
|
||||
Request::SYSTEM_SCOPE);
|
||||
break;
|
||||
default:
|
||||
panic("Bad scope type");
|
||||
break;
|
||||
}
|
||||
|
||||
if (setMemOrder) {
|
||||
// set acquire and release flags
|
||||
switch (memoryOrder){
|
||||
case Enums::MEMORY_ORDER_SC_ACQUIRE:
|
||||
req->setFlags(Request::ACQUIRE);
|
||||
break;
|
||||
case Enums::MEMORY_ORDER_SC_RELEASE:
|
||||
req->setFlags(Request::RELEASE);
|
||||
break;
|
||||
case Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE:
|
||||
req->setFlags(Request::ACQUIRE | Request::RELEASE);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// set atomic type
|
||||
// currently, the instruction generator only produces atomic return
|
||||
// but a magic instruction can produce atomic no return
|
||||
if (m_op == Enums::MO_AADD || m_op == Enums::MO_ASUB ||
|
||||
m_op == Enums::MO_AAND || m_op == Enums::MO_AOR ||
|
||||
m_op == Enums::MO_AXOR || m_op == Enums::MO_AMAX ||
|
||||
m_op == Enums::MO_AMIN || m_op == Enums::MO_AINC ||
|
||||
m_op == Enums::MO_ADEC || m_op == Enums::MO_AEXCH ||
|
||||
m_op == Enums::MO_ACAS) {
|
||||
req->setFlags(Request::ATOMIC_RETURN_OP);
|
||||
} else if (m_op == Enums::MO_ANRADD || m_op == Enums::MO_ANRSUB ||
|
||||
m_op == Enums::MO_ANRAND || m_op == Enums::MO_ANROR ||
|
||||
m_op == Enums::MO_ANRXOR || m_op == Enums::MO_ANRMAX ||
|
||||
m_op == Enums::MO_ANRMIN || m_op == Enums::MO_ANRINC ||
|
||||
m_op == Enums::MO_ANRDEC || m_op == Enums::MO_ANREXCH ||
|
||||
m_op == Enums::MO_ANRCAS) {
|
||||
req->setFlags(Request::ATOMIC_NO_RETURN_OP);
|
||||
}
|
||||
}
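// A rough usage sketch for setRequestFlags() (hedged; construction of 'req'
// is elided and the field values are only illustrative): the segment,
// scope, memory order, and atomic kind carried by this GPUDynInst are
// translated into Request flags before the request is sent out.
//
//     s_type      = SEG_GLOBAL;
//     scope       = Enums::MEMORY_SCOPE_WORKGROUP;
//     memoryOrder = Enums::MEMORY_ORDER_SC_RELEASE;
//     setRequestFlags(req); // GLOBAL_SEGMENT | SCOPE_VALID | WORKGROUP_SCOPE | RELEASE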
|
||||
|
||||
// Map returned packets and the addresses they satisfy with which lane they
|
||||
// were requested from
|
||||
typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
|
||||
StatusVector memStatusVector;
|
||||
|
||||
// Track the status of memory requests per lane, a bit per lane
|
||||
VectorMask statusBitVector;
|
||||
// for ld_v# or st_v#
|
||||
std::vector<int> statusVector;
|
||||
std::vector<int> tlbHitLevel;
|
||||
|
||||
private:
|
||||
GPUStaticInst *staticInst;
|
||||
uint64_t _seqNum;
|
||||
};
|
||||
|
||||
#endif // __GPU_DYN_INST_HH__
53 src/gpu-compute/gpu_exec_context.cc Normal file
@@ -0,0 +1,53 @@
/*
|
||||
* Copyright (c) 2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Anthony Gutierrez
|
||||
*/
|
||||
|
||||
#include "gpu-compute/gpu_exec_context.hh"
|
||||
|
||||
GPUExecContext::GPUExecContext(ComputeUnit *_cu, Wavefront *_wf)
|
||||
: cu(_cu), wf(_wf)
|
||||
{
|
||||
}
|
||||
|
||||
ComputeUnit*
|
||||
GPUExecContext::computeUnit()
|
||||
{
|
||||
return cu;
|
||||
}
|
||||
|
||||
Wavefront*
|
||||
GPUExecContext::wavefront()
|
||||
{
|
||||
return wf;
|
||||
}
54 src/gpu-compute/gpu_exec_context.hh Normal file
@@ -0,0 +1,54 @@
/*
|
||||
* Copyright (c) 2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Anthony Gutierrez
|
||||
*/
|
||||
|
||||
#ifndef __GPU_EXEC_CONTEXT_HH__
|
||||
#define __GPU_EXEC_CONTEXT_HH__
|
||||
|
||||
class ComputeUnit;
|
||||
class Wavefront;
|
||||
|
||||
class GPUExecContext
|
||||
{
|
||||
public:
|
||||
GPUExecContext(ComputeUnit *_cu, Wavefront *_wf);
|
||||
Wavefront* wavefront();
|
||||
ComputeUnit* computeUnit();
|
||||
|
||||
protected:
|
||||
ComputeUnit *cu;
|
||||
Wavefront *wf;
|
||||
};
|
||||
|
||||
#endif // __GPU_EXEC_CONTEXT_HH__
42 src/gpu-compute/gpu_static_inst.cc Normal file
@@ -0,0 +1,42 @@
/*
|
||||
* Copyright (c) 2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Anthony Gutierrez
|
||||
*/
|
||||
|
||||
#include "gpu-compute/gpu_static_inst.hh"
|
||||
|
||||
GPUStaticInst::GPUStaticInst(const std::string &opcode)
|
||||
: o_type(Enums::OT_ALU), executed_as(Enums::SC_NONE), opcode(opcode),
|
||||
_instNum(0), _scalarOp(false)
|
||||
{
|
||||
}
166 src/gpu-compute/gpu_static_inst.hh Normal file
@@ -0,0 +1,166 @@
/*
|
||||
* Copyright (c) 2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Anthony Gutierrez
|
||||
*/
|
||||
|
||||
#ifndef __GPU_STATIC_INST_HH__
|
||||
#define __GPU_STATIC_INST_HH__
|
||||
|
||||
/*
|
||||
* @file gpu_static_inst.hh
|
||||
*
|
||||
* Defines the base class representing static instructions for the GPU. The
|
||||
* instructions are "static" because they contain no dynamic instruction
|
||||
* information. GPUStaticInst corresponds to the StaticInst class for the CPU
|
||||
* models.
|
||||
*/
|
||||
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
|
||||
#include "enums/OpType.hh"
|
||||
#include "enums/StorageClassType.hh"
|
||||
#include "gpu-compute/gpu_dyn_inst.hh"
|
||||
#include "gpu-compute/misc.hh"
|
||||
|
||||
class BaseOperand;
|
||||
class BaseRegOperand;
|
||||
class Wavefront;
|
||||
|
||||
class GPUStaticInst
|
||||
{
|
||||
public:
|
||||
GPUStaticInst(const std::string &opcode);
|
||||
|
||||
void instNum(int num) { _instNum = num; }
|
||||
|
||||
int instNum() { return _instNum; }
|
||||
|
||||
void ipdInstNum(int num) { _ipdInstNum = num; }
|
||||
|
||||
int ipdInstNum() const { return _ipdInstNum; }
|
||||
|
||||
virtual void execute(GPUDynInstPtr gpuDynInst) = 0;
|
||||
virtual void generateDisassembly() = 0;
|
||||
virtual const std::string &disassemble() = 0;
|
||||
virtual int getNumOperands() = 0;
|
||||
virtual bool isCondRegister(int operandIndex) = 0;
|
||||
virtual bool isScalarRegister(int operandIndex) = 0;
|
||||
virtual bool isVectorRegister(int operandIndex) = 0;
|
||||
virtual bool isSrcOperand(int operandIndex) = 0;
|
||||
virtual bool isDstOperand(int operandIndex) = 0;
|
||||
virtual int getOperandSize(int operandIndex) = 0;
|
||||
virtual int getRegisterIndex(int operandIndex) = 0;
|
||||
virtual int numDstRegOperands() = 0;
|
||||
virtual int numSrcRegOperands() = 0;
|
||||
|
||||
/*
|
||||
* Most instructions (including all HSAIL instructions)
|
||||
* are vector ops, so _scalarOp will be false by default.
|
||||
* Derived instruction objects that are scalar ops must
|
||||
* set _scalarOp to true in their constructors.
|
||||
*/
|
||||
bool scalarOp() const { return _scalarOp; }
|
||||
|
||||
virtual bool isLocalMem() const
|
||||
{
|
||||
fatal("calling isLocalMem() on non-memory instruction.\n");
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
bool isArgLoad() { return false; }
|
||||
virtual uint32_t instSize() = 0;
|
||||
|
||||
// only used for memory instructions
|
||||
virtual void
|
||||
initiateAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
fatal("calling initiateAcc() on a non-memory instruction.\n");
|
||||
}
|
||||
|
||||
virtual uint32_t getTargetPc() { return 0; }
|
||||
|
||||
/**
|
||||
* Query whether the instruction is an unconditional jump i.e., the jump
|
||||
* is always executed because there is no condition to be evaluated.
|
||||
*
|
||||
* If the instruction is not of branch type, the result is always false.
|
||||
*
|
||||
* @return True if the instruction is an unconditional jump.
|
||||
*/
|
||||
virtual bool unconditionalJumpInstruction() { return false; }
|
||||
|
||||
static uint64_t dynamic_id_count;
|
||||
|
||||
Enums::OpType o_type;
|
||||
// For flat memory accesses
|
||||
Enums::StorageClassType executed_as;
|
||||
|
||||
protected:
|
||||
virtual void
|
||||
execLdAcq(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
fatal("calling execLdAcq() on a non-load instruction.\n");
|
||||
}
|
||||
|
||||
virtual void
|
||||
execSt(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
fatal("calling execLdAcq() on a non-load instruction.\n");
|
||||
}
|
||||
|
||||
virtual void
|
||||
execAtomic(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
fatal("calling execAtomic() on a non-atomic instruction.\n");
|
||||
}
|
||||
|
||||
virtual void
|
||||
execAtomicAcq(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
fatal("calling execAtomicAcq() on a non-atomic instruction.\n");
|
||||
}
|
||||
|
||||
const std::string opcode;
|
||||
std::string disassembly;
|
||||
int _instNum;
|
||||
/**
|
||||
* Identifier of the immediate post-dominator instruction.
|
||||
*/
|
||||
int _ipdInstNum;
|
||||
|
||||
bool _scalarOp;
|
||||
};
|
||||
|
||||
#endif // __GPU_STATIC_INST_HH__
1801 src/gpu-compute/gpu_tlb.cc Normal file (diff suppressed because it is too large)
465 src/gpu-compute/gpu_tlb.hh Normal file
@@ -0,0 +1,465 @@
/*
|
||||
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Lisa Hsu
|
||||
*/
|
||||
|
||||
#ifndef __GPU_TLB_HH__
|
||||
#define __GPU_TLB_HH__
|
||||
|
||||
#include <fstream>
|
||||
#include <list>
|
||||
#include <queue>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arch/generic/tlb.hh"
|
||||
#include "arch/x86/pagetable.hh"
|
||||
#include "arch/x86/pagetable_walker.hh"
|
||||
#include "arch/x86/regs/segment.hh"
|
||||
#include "base/callback.hh"
|
||||
#include "base/misc.hh"
|
||||
#include "base/statistics.hh"
|
||||
#include "gpu-compute/compute_unit.hh"
|
||||
#include "mem/mem_object.hh"
|
||||
#include "mem/port.hh"
|
||||
#include "mem/request.hh"
|
||||
#include "params/X86GPUTLB.hh"
|
||||
#include "sim/sim_object.hh"
|
||||
|
||||
class BaseTLB;
|
||||
class Packet;
|
||||
class ThreadContext;
|
||||
|
||||
namespace X86ISA
|
||||
{
|
||||
class GpuTlbEntry : public TlbEntry
|
||||
{
|
||||
public:
|
||||
GpuTlbEntry(Addr asn, Addr _vaddr, Addr _paddr, bool _valid)
|
||||
: TlbEntry(asn, _vaddr, _paddr, false, false), valid(_valid) { }
|
||||
|
||||
GpuTlbEntry() : TlbEntry() { }
|
||||
|
||||
bool valid;
|
||||
};
|
||||
|
||||
class GpuTLB : public MemObject
|
||||
{
|
||||
protected:
|
||||
friend class Walker;
|
||||
|
||||
typedef std::list<GpuTlbEntry*> EntryList;
|
||||
|
||||
uint32_t configAddress;
|
||||
|
||||
// TLB clock: will inherit clock from shader's clock period in terms
|
||||
// of number of ticks of curTime (aka global simulation clock)
|
||||
// The assignment of TLB clock from shader clock is done in the python
|
||||
// config files.
|
||||
int clock;
|
||||
|
||||
public:
|
||||
// clock-related functions; they map to and from simulation ticks and
|
||||
// object clocks.
|
||||
Tick frequency() const { return SimClock::Frequency / clock; }
|
||||
|
||||
Tick
|
||||
ticks(int numCycles) const
|
||||
{
|
||||
return (Tick)clock * numCycles;
|
||||
}
|
||||
|
||||
Tick curCycle() const { return curTick() / clock; }
|
||||
Tick tickToCycles(Tick val) const { return val / clock;}
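// A small worked example of the conversions above (values are illustrative):
// with clock = 500 ticks per TLB cycle,
//     ticks(4)           = 500 * 4    = 2000 ticks
//     tickToCycles(2000) = 2000 / 500 = 4 cycles
//     frequency()        = SimClock::Frequency / 500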
|
||||
|
||||
typedef X86GPUTLBParams Params;
|
||||
GpuTLB(const Params *p);
|
||||
~GpuTLB();
|
||||
|
||||
typedef enum BaseTLB::Mode Mode;
|
||||
|
||||
class Translation
|
||||
{
|
||||
public:
|
||||
virtual ~Translation() { }
|
||||
|
||||
/**
|
||||
* Signal that the translation has been delayed due to a hw page
|
||||
* table walk.
|
||||
*/
|
||||
virtual void markDelayed() = 0;
|
||||
|
||||
/**
|
||||
* The memory for this object may be dynamically allocated, and it
|
||||
* may be responsible for cleaning itself up, which will happen in
|
||||
* this function. Once it's called the object is no longer valid.
|
||||
*/
|
||||
virtual void finish(Fault fault, RequestPtr req, ThreadContext *tc,
|
||||
Mode mode) = 0;
|
||||
};
|
||||
|
||||
void dumpAll();
|
||||
GpuTlbEntry *lookup(Addr va, bool update_lru=true);
|
||||
void setConfigAddress(uint32_t addr);
|
||||
|
||||
protected:
|
||||
EntryList::iterator lookupIt(Addr va, bool update_lru=true);
|
||||
Walker *walker;
|
||||
|
||||
public:
|
||||
Walker *getWalker();
|
||||
void invalidateAll();
|
||||
void invalidateNonGlobal();
|
||||
void demapPage(Addr va, uint64_t asn);
|
||||
|
||||
protected:
|
||||
int size;
|
||||
int assoc;
|
||||
int numSets;
|
||||
|
||||
/**
|
||||
* true if this is a fully-associative TLB
|
||||
*/
|
||||
bool FA;
|
||||
Addr setMask;
|
||||
|
||||
/**
|
||||
* Allocation Policy: true if we always allocate on a hit, false
|
||||
* otherwise. Default is true.
|
||||
*/
|
||||
bool allocationPolicy;
|
||||
|
||||
/**
|
||||
* if true, then this is not the last level TLB
|
||||
*/
|
||||
bool hasMemSidePort;
|
||||
|
||||
/**
|
||||
* Print out accessDistance stats. One stat file
|
||||
* per TLB.
|
||||
*/
|
||||
bool accessDistance;
|
||||
|
||||
GpuTlbEntry *tlb;
|
||||
|
||||
/*
|
||||
* It's a per-set list. As long as we have not reached
|
||||
* the full capacity of the given set, grab an entry from
|
||||
* the freeList.
|
||||
*/
|
||||
std::vector<EntryList> freeList;
|
||||
|
||||
/**
|
||||
* An entryList per set is the equivalent of an LRU stack;
|
||||
* it's used to guide replacement decisions. The head of the list
|
||||
* contains the MRU TLB entry of the given set. If the freeList
|
||||
* for this set is empty, the last element of the list
|
||||
* is evicted (i.e., dropped on the floor).
|
||||
*/
|
||||
std::vector<EntryList> entryList;
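// A minimal sketch of the LRU discipline described above (hedged; 'set' and
// 'hit_it' are hypothetical locals): on a hit the entry is spliced to the
// head of its set's entryList, so the tail is always the eviction victim.
//
//     EntryList &lru = entryList[set];
//     lru.splice(lru.begin(), lru, hit_it); // promote the hit entry to MRU
//     GpuTlbEntry *victim = lru.back();     // candidate to evict on a miss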
|
||||
|
||||
Fault translateInt(RequestPtr req, ThreadContext *tc);
|
||||
|
||||
Fault translate(RequestPtr req, ThreadContext *tc,
|
||||
Translation *translation, Mode mode, bool &delayedResponse,
|
||||
bool timing, int &latency);
|
||||
|
||||
public:
|
||||
// latencies for a TLB hit, miss and page fault
|
||||
int hitLatency;
|
||||
int missLatency1;
|
||||
int missLatency2;
|
||||
|
||||
// local_stats are as seen from the TLB
|
||||
// without taking into account coalescing
|
||||
Stats::Scalar localNumTLBAccesses;
|
||||
Stats::Scalar localNumTLBHits;
|
||||
Stats::Scalar localNumTLBMisses;
|
||||
Stats::Formula localTLBMissRate;
|
||||
|
||||
// global_stats are as seen from the
|
||||
// CU's perspective taking into account
|
||||
// all coalesced requests.
|
||||
Stats::Scalar globalNumTLBAccesses;
|
||||
Stats::Scalar globalNumTLBHits;
|
||||
Stats::Scalar globalNumTLBMisses;
|
||||
Stats::Formula globalTLBMissRate;
|
||||
|
||||
// from the CU perspective (global)
|
||||
Stats::Scalar accessCycles;
|
||||
// from the CU perspective (global)
|
||||
Stats::Scalar pageTableCycles;
|
||||
Stats::Scalar numUniquePages;
|
||||
// from the perspective of this TLB
|
||||
Stats::Scalar localCycles;
|
||||
// from the perspective of this TLB
|
||||
Stats::Formula localLatency;
|
||||
// I take the avg. per page and then
|
||||
// the avg. over all pages.
|
||||
Stats::Scalar avgReuseDistance;
|
||||
|
||||
void regStats();
|
||||
void updatePageFootprint(Addr virt_page_addr);
|
||||
void printAccessPattern();
|
||||
|
||||
|
||||
Fault translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode,
|
||||
int &latency);
|
||||
|
||||
void translateTiming(RequestPtr req, ThreadContext *tc,
|
||||
Translation *translation, Mode mode,
|
||||
int &latency);
|
||||
|
||||
Tick doMmuRegRead(ThreadContext *tc, Packet *pkt);
|
||||
Tick doMmuRegWrite(ThreadContext *tc, Packet *pkt);
|
||||
|
||||
GpuTlbEntry *insert(Addr vpn, GpuTlbEntry &entry);
|
||||
|
||||
// Checkpointing
|
||||
virtual void serialize(CheckpointOut& cp) const;
|
||||
virtual void unserialize(CheckpointIn& cp);
|
||||
void issueTranslation();
|
||||
enum tlbOutcome {TLB_HIT, TLB_MISS, PAGE_WALK, MISS_RETURN};
|
||||
bool tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats);
|
||||
|
||||
void handleTranslationReturn(Addr addr, tlbOutcome outcome,
|
||||
PacketPtr pkt);
|
||||
|
||||
void handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome outcome);
|
||||
|
||||
void pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
|
||||
GpuTlbEntry *tlb_entry, Mode mode);
|
||||
|
||||
void updatePhysAddresses(Addr virt_page_addr, GpuTlbEntry *tlb_entry,
|
||||
Addr phys_page_addr);
|
||||
|
||||
void issueTLBLookup(PacketPtr pkt);
|
||||
|
||||
// CpuSidePort is the TLB Port closer to the CPU/CU side
|
||||
class CpuSidePort : public SlavePort
|
||||
{
|
||||
public:
|
||||
CpuSidePort(const std::string &_name, GpuTLB * gpu_TLB,
|
||||
PortID _index)
|
||||
: SlavePort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }
|
||||
|
||||
protected:
|
||||
GpuTLB *tlb;
|
||||
int index;
|
||||
|
||||
virtual bool recvTimingReq(PacketPtr pkt);
|
||||
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
|
||||
virtual void recvFunctional(PacketPtr pkt);
|
||||
virtual void recvRangeChange() { }
|
||||
virtual void recvReqRetry();
|
||||
virtual void recvRespRetry() { assert(false); }
|
||||
virtual AddrRangeList getAddrRanges() const;
|
||||
};
|
||||
|
||||
/**
|
||||
* MemSidePort is the TLB Port closer to the memory side
|
||||
* If this is a last level TLB then this port will not be connected.
|
||||
*
|
||||
* Future action item: if we ever do real page walks, then this port
|
||||
* should be connected to a RubyPort.
|
||||
*/
|
||||
class MemSidePort : public MasterPort
|
||||
{
|
||||
public:
|
||||
MemSidePort(const std::string &_name, GpuTLB * gpu_TLB,
|
||||
PortID _index)
|
||||
: MasterPort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }
|
||||
|
||||
std::deque<PacketPtr> retries;
|
||||
|
||||
protected:
|
||||
GpuTLB *tlb;
|
||||
int index;
|
||||
|
||||
virtual bool recvTimingResp(PacketPtr pkt);
|
||||
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
|
||||
virtual void recvFunctional(PacketPtr pkt) { }
|
||||
virtual void recvRangeChange() { }
|
||||
virtual void recvReqRetry();
|
||||
};
|
||||
|
||||
// TLB ports on the cpu Side
|
||||
std::vector<CpuSidePort*> cpuSidePort;
|
||||
// TLB ports on the memory side
|
||||
std::vector<MemSidePort*> memSidePort;
|
||||
|
||||
BaseMasterPort &getMasterPort(const std::string &if_name,
|
||||
PortID idx=InvalidPortID);
|
||||
|
||||
BaseSlavePort &getSlavePort(const std::string &if_name,
|
||||
PortID idx=InvalidPortID);
|
||||
|
||||
/**
|
||||
* TLB TranslationState: this is currently somewhat of a bastardization of
|
||||
* the usage of SenderState, whereby the receiver of a packet is not
|
||||
* usually supposed to need to look at the contents of the senderState,
|
||||
* you're really only supposed to look at what you pushed on, pop it
|
||||
* off, and send it back.
|
||||
*
|
||||
* However, since there is state that we want to pass to the TLBs using
|
||||
* the send/recv Timing/Functional/etc. APIs, which don't allow for new
|
||||
* arguments, we need a common TLB senderState to pass between TLBs,
|
||||
* both "forwards" and "backwards."
|
||||
*
|
||||
* So, basically, the rule is that any packet received by a TLB port
|
||||
* (cpuside OR memside) must be safely castable to a TranslationState.
|
||||
*/
|
||||
|
||||
struct TranslationState : public Packet::SenderState
|
||||
{
|
||||
// TLB mode, read or write
|
||||
Mode tlbMode;
|
||||
// Thread context associated with this req
|
||||
ThreadContext *tc;
|
||||
|
||||
/*
|
||||
* TLB entry to be populated and passed back and filled in
|
||||
* previous TLBs. Equivalent to the data cache concept of
|
||||
* "data return."
|
||||
*/
|
||||
GpuTlbEntry *tlbEntry;
|
||||
// Is this a TLB prefetch request?
|
||||
bool prefetch;
|
||||
// When was the req for this translation issued
|
||||
uint64_t issueTime;
|
||||
// Remember where this came from
|
||||
std::vector<SlavePort*>ports;
|
||||
|
||||
// keep track of #uncoalesced reqs per packet per TLB level;
|
||||
// reqCnt per level >= reqCnt higher level
|
||||
std::vector<int> reqCnt;
|
||||
// TLB level this packet hit in; 0 if it hit in the page table
|
||||
int hitLevel;
|
||||
Packet::SenderState *saved;
|
||||
|
||||
TranslationState(Mode tlb_mode, ThreadContext *_tc,
|
||||
bool _prefetch=false,
|
||||
Packet::SenderState *_saved=nullptr)
|
||||
: tlbMode(tlb_mode), tc(_tc), tlbEntry(nullptr),
|
||||
prefetch(_prefetch), issueTime(0),
|
||||
hitLevel(0),saved(_saved) { }
|
||||
};
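// A minimal sketch of the convention described above (hedged; 'pkt', 'tc',
// and the mode value are hypothetical): the sender attaches a
// TranslationState before issuing the packet, and any TLB port receiving
// the packet casts senderState back to TranslationState to inspect or
// update the translation in flight.
//
//     pkt->senderState = new TranslationState(BaseTLB::Read, tc);
//     ...
//     auto *state = static_cast<TranslationState*>(pkt->senderState);
//     state->hitLevel = 1;   // record which TLB level serviced the request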
|
||||
|
||||
// maximum number of permitted coalesced requests per cycle
|
||||
int maxCoalescedReqs;
|
||||
|
||||
// Current number of outstanding coalesced requests.
|
||||
// Should be <= maxCoalescedReqs
|
||||
int outstandingReqs;
|
||||
|
||||
/**
|
||||
* A TLBEvent is scheduled after the TLB lookup and helps us take the
|
||||
* appropriate actions:
|
||||
* (e.g., update TLB on a hit,
|
||||
* send request to lower level TLB on a miss,
|
||||
* or start a page walk if this was the last-level TLB).
|
||||
*/
|
||||
void translationReturn(Addr virtPageAddr, tlbOutcome outcome,
|
||||
PacketPtr pkt);
|
||||
|
||||
class TLBEvent : public Event
|
||||
{
|
||||
private:
|
||||
GpuTLB *tlb;
|
||||
Addr virtPageAddr;
|
||||
/**
|
||||
* outcome can be TLB_HIT, TLB_MISS, or PAGE_WALK
|
||||
*/
|
||||
tlbOutcome outcome;
|
||||
PacketPtr pkt;
|
||||
|
||||
public:
|
||||
TLBEvent(GpuTLB *_tlb, Addr _addr, tlbOutcome outcome,
|
||||
PacketPtr _pkt);
|
||||
|
||||
void process();
|
||||
const char *description() const;
|
||||
|
||||
// updateOutcome updates the tlbOutcome of a TLBEvent
|
||||
void updateOutcome(tlbOutcome _outcome);
|
||||
Addr getTLBEventVaddr();
|
||||
};
|
||||
|
||||
std::unordered_map<Addr, TLBEvent*> translationReturnEvent;
|
||||
|
||||
// this FIFO queue keeps track of the virt. page addresses
|
||||
// that are pending cleanup
|
||||
std::queue<Addr> cleanupQueue;
|
||||
|
||||
// the cleanupEvent is scheduled after a TLBEvent triggers in order to
|
||||
// free memory and do the required clean-up
|
||||
void cleanup();
|
||||
|
||||
EventWrapper<GpuTLB, &GpuTLB::cleanup> cleanupEvent;
|
||||
|
||||
/**
|
||||
* This hash map will use the virtual page address as a key
|
||||
* and will keep track of total number of accesses per page
|
||||
*/
|
||||
|
||||
struct AccessInfo
|
||||
{
|
||||
unsigned int lastTimeAccessed; // last access to this page
|
||||
unsigned int accessesPerPage;
|
||||
// need to divide it by accessesPerPage at the end
|
||||
unsigned int totalReuseDistance;
|
||||
|
||||
/**
|
||||
* The field below will help us compute the access distance,
|
||||
* that is the number of (coalesced) TLB accesses that
|
||||
* happened in between each access to this page
|
||||
*
|
||||
* localTLBAccesses[x] is the value of localTLBNumAccesses
|
||||
* when the page <Addr> was accessed for the <x>th time
|
||||
*/
|
||||
std::vector<unsigned int> localTLBAccesses;
|
||||
unsigned int sumDistance;
|
||||
unsigned int meanDistance;
|
||||
};
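// A small sketch of how the mean access distance could be derived from
// localTLBAccesses (hedged; not necessarily how the stats code computes it):
// the distance between two consecutive accesses to a page is the number of
// coalesced TLB accesses that happened in between.
//
//     unsigned sum = 0;
//     for (size_t i = 1; i < info.localTLBAccesses.size(); ++i)
//         sum += info.localTLBAccesses[i] - info.localTLBAccesses[i - 1];
//     unsigned mean = info.accessesPerPage > 1
//                   ? sum / (info.accessesPerPage - 1) : 0;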
|
||||
|
||||
typedef std::unordered_map<Addr, AccessInfo> AccessPatternTable;
|
||||
AccessPatternTable TLBFootprint;
|
||||
|
||||
// Called at the end of simulation to dump page access stats.
|
||||
void exitCallback();
|
||||
|
||||
EventWrapper<GpuTLB, &GpuTLB::exitCallback> exitEvent;
|
||||
};
|
||||
}
|
||||
|
||||
#endif // __GPU_TLB_HH__
101 src/gpu-compute/hsa_code.hh Normal file
@@ -0,0 +1,101 @@
/*
|
||||
* Copyright (c) 2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Anthony Gutierrez
|
||||
*/
|
||||
|
||||
#ifndef __HSA_CODE_HH__
|
||||
#define __HSA_CODE_HH__
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arch/gpu_types.hh"
|
||||
#include "config/the_gpu_isa.hh"
|
||||
|
||||
class HsaKernelInfo;
|
||||
|
||||
/* @class HsaCode
|
||||
* base code object for the set of HSA kernels associated
|
||||
* with a single application. this class provides the common
|
||||
* methods for creating, accessing, and storing information
|
||||
* about kernel and variable symbols, symbol name, memory
|
||||
* segment sizes, and instruction count, etc.
|
||||
*/
|
||||
|
||||
class HsaCode
|
||||
{
|
||||
public:
|
||||
HsaCode(const std::string &name) : readonly_data(nullptr), funcarg_size(0),
|
||||
_name(name)
|
||||
{
|
||||
}
|
||||
|
||||
enum class MemorySegment {
|
||||
NONE,
|
||||
FLAT,
|
||||
GLOBAL,
|
||||
READONLY,
|
||||
KERNARG,
|
||||
GROUP,
|
||||
PRIVATE,
|
||||
SPILL,
|
||||
ARG,
|
||||
EXTSPACE0
|
||||
};
|
||||
|
||||
const std::string& name() const { return _name; }
|
||||
int numInsts() const { return _insts.size(); }
|
||||
std::vector<TheGpuISA::RawMachInst>* insts() { return &_insts; }
|
||||
|
||||
void
|
||||
setReadonlyData(uint8_t *_readonly_data)
|
||||
{
|
||||
readonly_data = _readonly_data;
|
||||
}
|
||||
|
||||
virtual int getSize(MemorySegment segment) const = 0;
|
||||
virtual void generateHsaKernelInfo(HsaKernelInfo *hsaKernelInfo) const = 0;
|
||||
|
||||
uint8_t *readonly_data;
|
||||
int funcarg_size;
|
||||
|
||||
protected:
|
||||
// An array that stores instruction indices (0 through kernel size)
|
||||
// for a kernel passed to code object constructor as an argument.
|
||||
std::vector<TheGpuISA::RawMachInst> _insts;
|
||||
|
||||
private:
|
||||
const std::string _name;
|
||||
};
|
||||
|
||||
#endif // __HSA_CODE_HH__
79 src/gpu-compute/hsa_kernel_info.hh Normal file
@@ -0,0 +1,79 @@
/*
|
||||
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Steve Reinhardt
|
||||
*/
|
||||
|
||||
#ifndef __HSA_KERNEL_INFO_HH__
|
||||
#define __HSA_KERNEL_INFO_HH__
|
||||
|
||||
// This file defines the public interface between the HSA emulated
|
||||
// driver and application programs.
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
static const int HSA_GET_SIZES = 0x4801;
|
||||
static const int HSA_GET_KINFO = 0x4802;
|
||||
static const int HSA_GET_STRINGS = 0x4803;
|
||||
static const int HSA_GET_CODE = 0x4804;
|
||||
static const int HSA_GET_READONLY_DATA = 0x4805;
|
||||
static const int HSA_GET_CU_CNT = 0x4806;
|
||||
static const int HSA_GET_VSZ = 0x4807;
|
||||
|
||||
// Return value (via buffer ptr) for HSA_GET_SIZES
|
||||
struct HsaDriverSizes
|
||||
{
|
||||
uint32_t num_kernels;
|
||||
uint32_t string_table_size;
|
||||
uint32_t code_size;
|
||||
uint32_t readonly_size;
|
||||
};
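// A rough sketch of how an application might query the emulated driver with
// these codes (hedged: the call mechanism is not defined in this header, and
// 'hsa_fd' is a hypothetical file descriptor for the emulated device):
//
//     HsaDriverSizes sizes;
//     ioctl(hsa_fd, HSA_GET_SIZES, &sizes);
//     std::vector<HsaKernelInfo> kinfo(sizes.num_kernels);
//     ioctl(hsa_fd, HSA_GET_KINFO, kinfo.data());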
|
||||
|
||||
// HSA_GET_KINFO returns an array of num_kernels of these structs
|
||||
struct HsaKernelInfo
|
||||
{
|
||||
// byte offset into string table
|
||||
uint32_t name_offs;
|
||||
// byte offset into code array
|
||||
uint32_t code_offs;
|
||||
uint32_t static_lds_size;
|
||||
uint32_t private_mem_size;
|
||||
uint32_t spill_mem_size;
|
||||
// Number of s registers
|
||||
uint32_t sRegCount;
|
||||
// Number of d registers
|
||||
uint32_t dRegCount;
|
||||
// Number of c registers
|
||||
uint32_t cRegCount;
|
||||
};
|
||||
|
||||
#endif // __HSA_KERNEL_INFO_HH__
76 src/gpu-compute/hsa_object.cc Normal file
@@ -0,0 +1,76 @@
/*
|
||||
* Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Anthony Gutierrez
|
||||
*/
|
||||
|
||||
#include "gpu-compute/hsa_object.hh"
|
||||
|
||||
#include <fstream>
|
||||
|
||||
#include "gpu-compute/brig_object.hh"
|
||||
|
||||
HsaObject::HsaObject(const std::string &fname)
|
||||
: readonlyData(nullptr), filename(fname)
|
||||
{
|
||||
}
|
||||
|
||||
HsaObject*
|
||||
HsaObject::createHsaObject(const std::string &fname)
|
||||
{
|
||||
HsaObject *hsaObj = nullptr;
|
||||
uint8_t *file_data = nullptr;
|
||||
int file_length = 0;
|
||||
|
||||
std::ifstream code_file(fname, std::ifstream::ate | std::ifstream::in |
|
||||
std::ifstream::binary);
|
||||
|
||||
assert(code_file.is_open());
|
||||
assert(code_file.good());
|
||||
|
||||
file_length = code_file.tellg();
|
||||
code_file.seekg(0, code_file.beg);
|
||||
file_data = new uint8_t[file_length];
|
||||
code_file.read((char*)file_data, file_length);
|
||||
code_file.close();
|
||||
|
||||
for (const auto &tryFile : tryFileFuncs) {
|
||||
if ((hsaObj = tryFile(fname, file_length, file_data))) {
|
||||
return hsaObj;
|
||||
}
|
||||
}
|
||||
|
||||
delete[] file_data;
|
||||
fatal("Unknown HSA object type for file: %s.\n", fname);
|
||||
|
||||
return nullptr;
|
||||
}
74 src/gpu-compute/hsa_object.hh Normal file
@@ -0,0 +1,74 @@
/*
|
||||
* Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Anthony Gutierrez
|
||||
*/
|
||||
|
||||
#ifndef __HSA_OBJECT_HH__
|
||||
#define __HSA_OBJECT_HH__
|
||||
|
||||
#include <functional>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
class HsaCode;
|
||||
|
||||
/**
* @class HsaObject
* Base loader object for HSA kernels. This class provides the
* base method definitions for loading, storing, and accessing
* HSA kernel objects in the simulator.
*/
|
||||
|
||||
class HsaObject
|
||||
{
|
||||
public:
|
||||
HsaObject(const std::string &fileName);
|
||||
|
||||
static HsaObject* createHsaObject(const std::string &fname);
|
||||
static std::vector<std::function<HsaObject*(const std::string&, int,
|
||||
uint8_t*)>> tryFileFuncs;
|
||||
|
||||
virtual HsaCode* getKernel(const std::string &name) const = 0;
|
||||
virtual HsaCode* getKernel(int i) const = 0;
|
||||
virtual HsaCode* getFunction(const std::string &name) const = 0;
|
||||
virtual int numKernels() const = 0;
|
||||
|
||||
const std::string& name() const { return filename; }
|
||||
|
||||
uint8_t *readonlyData;
|
||||
|
||||
|
||||
protected:
|
||||
const std::string filename;
|
||||
};
|
||||
|
||||
#endif // __HSA_OBJECT_HH__
|
453
src/gpu-compute/hsail_code.cc
Normal file
@@ -0,0 +1,453 @@
/*
|
||||
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Steve Reinhardt
|
||||
*/
|
||||
|
||||
#include "gpu-compute/hsail_code.hh"
|
||||
|
||||
#include "arch/gpu_types.hh"
|
||||
#include "arch/hsail/Brig.h"
|
||||
#include "arch/hsail/operand.hh"
|
||||
#include "config/the_gpu_isa.hh"
|
||||
#include "debug/BRIG.hh"
|
||||
#include "debug/HSAILObject.hh"
|
||||
#include "gpu-compute/brig_object.hh"
|
||||
#include "gpu-compute/gpu_static_inst.hh"
|
||||
#include "gpu-compute/kernel_cfg.hh"
|
||||
|
||||
using namespace Brig;
|
||||
|
||||
int getBrigDataTypeBytes(BrigType16_t t);
|
||||
|
||||
HsailCode::HsailCode(const std::string &name_str)
|
||||
: HsaCode(name_str), private_size(-1), readonly_size(-1)
|
||||
{
|
||||
}
|
||||
|
||||
void
|
||||
HsailCode::init(const BrigDirectiveExecutable *code_dir, const BrigObject *obj,
|
||||
StorageMap *objStorageMap)
|
||||
{
|
||||
storageMap = objStorageMap;
|
||||
|
||||
// set pointer so that decoding process can find this kernel context when
|
||||
// needed
|
||||
obj->currentCode = this;
|
||||
|
||||
if (code_dir->base.kind != BRIG_KIND_DIRECTIVE_FUNCTION &&
|
||||
code_dir->base.kind != BRIG_KIND_DIRECTIVE_KERNEL) {
|
||||
fatal("unexpected directive kind %d inside kernel/function init\n",
|
||||
code_dir->base.kind);
|
||||
}
|
||||
|
||||
DPRINTF(HSAILObject, "Initializing code, first code block entry is: %d\n",
|
||||
code_dir->firstCodeBlockEntry);
|
||||
|
||||
// clear these static vars so we can properly track the max index
|
||||
// for this kernel
|
||||
SRegOperand::maxRegIdx = 0;
|
||||
DRegOperand::maxRegIdx = 0;
|
||||
CRegOperand::maxRegIdx = 0;
|
||||
setPrivateSize(0);
|
||||
|
||||
const BrigBase *entryPtr = brigNext((BrigBase*)code_dir);
|
||||
const BrigBase *endPtr =
|
||||
obj->getCodeSectionEntry(code_dir->nextModuleEntry);
|
||||
|
||||
int inst_idx = 0;
|
||||
std::vector<GPUStaticInst*> instructions;
|
||||
int funcarg_size_scope = 0;
|
||||
|
||||
// walk through instructions in code section and directives in
|
||||
// directive section in parallel, processing directives that apply
|
||||
// when we reach the relevant code point.
|
||||
while (entryPtr < endPtr) {
|
||||
switch (entryPtr->kind) {
|
||||
case BRIG_KIND_DIRECTIVE_VARIABLE:
|
||||
{
|
||||
const BrigDirectiveVariable *sym =
|
||||
(const BrigDirectiveVariable*)entryPtr;
|
||||
|
||||
DPRINTF(HSAILObject,"Initializing code, directive is "
|
||||
"kind_variable, symbol is: %s\n",
|
||||
obj->getString(sym->name));
|
||||
|
||||
StorageElement *se = storageMap->addSymbol(sym, obj);
|
||||
|
||||
if (sym->segment == BRIG_SEGMENT_PRIVATE) {
|
||||
setPrivateSize(se->size);
|
||||
} else { // spill
|
||||
funcarg_size_scope += se->size;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case BRIG_KIND_DIRECTIVE_LABEL:
|
||||
{
|
||||
const BrigDirectiveLabel *lbl =
|
||||
(const BrigDirectiveLabel*)entryPtr;
|
||||
|
||||
DPRINTF(HSAILObject,"Initializing code, directive is "
|
||||
"kind_label, label is: %s \n",
|
||||
obj->getString(lbl->name));
|
||||
|
||||
labelMap.addLabel(lbl, inst_idx, obj);
|
||||
}
|
||||
break;
|
||||
|
||||
case BRIG_KIND_DIRECTIVE_PRAGMA:
|
||||
{
|
||||
DPRINTF(HSAILObject, "Initializing code, directive "
|
||||
"is kind_pragma\n");
|
||||
}
|
||||
break;
|
||||
|
||||
case BRIG_KIND_DIRECTIVE_COMMENT:
|
||||
{
|
||||
DPRINTF(HSAILObject, "Initializing code, directive is "
|
||||
"kind_comment\n");
|
||||
}
|
||||
break;
|
||||
|
||||
case BRIG_KIND_DIRECTIVE_ARG_BLOCK_START:
|
||||
{
|
||||
DPRINTF(HSAILObject, "Initializing code, directive is "
|
||||
"kind_arg_block_start\n");
|
||||
|
||||
storageMap->resetOffset(BRIG_SEGMENT_ARG);
|
||||
funcarg_size_scope = 0;
|
||||
}
|
||||
break;
|
||||
|
||||
case BRIG_KIND_DIRECTIVE_ARG_BLOCK_END:
|
||||
{
|
||||
DPRINTF(HSAILObject, "Initializing code, directive is "
|
||||
"kind_arg_block_end\n");
|
||||
|
||||
funcarg_size = funcarg_size < funcarg_size_scope ?
|
||||
funcarg_size_scope : funcarg_size;
|
||||
}
|
||||
break;
|
||||
|
||||
case BRIG_KIND_DIRECTIVE_END:
|
||||
DPRINTF(HSAILObject, "Initializing code, directive is "
|
||||
"kind_end\n");
|
||||
|
||||
break;
|
||||
|
||||
default:
|
||||
if (entryPtr->kind >= BRIG_KIND_INST_BEGIN &&
|
||||
entryPtr->kind <= BRIG_KIND_INST_END) {
|
||||
|
||||
BrigInstBase *instPtr = (BrigInstBase*)entryPtr;
|
||||
TheGpuISA::MachInst machInst = { instPtr, obj };
|
||||
GPUStaticInst *iptr = decoder.decode(machInst);
|
||||
|
||||
if (iptr) {
|
||||
DPRINTF(HSAILObject, "Initializing code, processing inst "
|
||||
"#%d idx %d: OPCODE=%d\n",
|
||||
inst_idx, _insts.size(), instPtr->opcode);
|
||||
|
||||
TheGpuISA::RawMachInst inst_num = decoder.saveInst(iptr);
|
||||
iptr->instNum(inst_idx);
|
||||
_insts.push_back(inst_num);
|
||||
instructions.push_back(iptr);
|
||||
}
|
||||
++inst_idx;
|
||||
} else if (entryPtr->kind >= BRIG_KIND_OPERAND_BEGIN &&
|
||||
entryPtr->kind < BRIG_KIND_OPERAND_END) {
|
||||
warn("unexpected operand entry in code segment\n");
|
||||
} else {
|
||||
// there are surely some more cases we will need to handle,
|
||||
// but we'll deal with them as we find them.
|
||||
fatal("unexpected directive kind %d inside kernel scope\n",
|
||||
entryPtr->kind);
|
||||
}
|
||||
}
|
||||
|
||||
entryPtr = brigNext(entryPtr);
|
||||
}
|
||||
|
||||
// compute Control Flow Graph for current kernel
|
||||
ControlFlowInfo::assignImmediatePostDominators(instructions);
|
||||
|
||||
max_sreg = SRegOperand::maxRegIdx;
|
||||
max_dreg = DRegOperand::maxRegIdx;
|
||||
max_creg = CRegOperand::maxRegIdx;
|
||||
|
||||
obj->currentCode = nullptr;
|
||||
}
|
||||
|
||||
HsailCode::HsailCode(const std::string &name_str,
|
||||
const BrigDirectiveExecutable *code_dir,
|
||||
const BrigObject *obj, StorageMap *objStorageMap)
|
||||
: HsaCode(name_str), private_size(-1), readonly_size(-1)
|
||||
{
|
||||
init(code_dir, obj, objStorageMap);
|
||||
}
|
||||
|
||||
void
|
||||
LabelMap::addLabel(const Brig::BrigDirectiveLabel *lblDir, int inst_index,
|
||||
const BrigObject *obj)
|
||||
{
|
||||
std::string lbl_name = obj->getString(lblDir->name);
|
||||
Label &lbl = map[lbl_name];
|
||||
|
||||
if (lbl.defined()) {
|
||||
fatal("Attempt to redefine existing label %s\n", lbl_name);
|
||||
}
|
||||
|
||||
lbl.define(lbl_name, inst_index);
|
||||
DPRINTF(HSAILObject, "label %s = %d\n", lbl_name, inst_index);
|
||||
}
|
||||
|
||||
Label*
|
||||
LabelMap::refLabel(const Brig::BrigDirectiveLabel *lblDir,
|
||||
const BrigObject *obj)
|
||||
{
|
||||
std::string name = obj->getString(lblDir->name);
|
||||
Label &lbl = map[name];
|
||||
lbl.checkName(name);
|
||||
|
||||
return &lbl;
|
||||
}
|
||||
|
||||
int
|
||||
getBrigDataTypeBytes(BrigType16_t t)
|
||||
{
|
||||
switch (t) {
|
||||
case BRIG_TYPE_S8:
|
||||
case BRIG_TYPE_U8:
|
||||
case BRIG_TYPE_B8:
|
||||
return 1;
|
||||
|
||||
case BRIG_TYPE_S16:
|
||||
case BRIG_TYPE_U16:
|
||||
case BRIG_TYPE_B16:
|
||||
case BRIG_TYPE_F16:
|
||||
return 2;
|
||||
|
||||
case BRIG_TYPE_S32:
|
||||
case BRIG_TYPE_U32:
|
||||
case BRIG_TYPE_B32:
|
||||
case BRIG_TYPE_F32:
|
||||
return 4;
|
||||
|
||||
case BRIG_TYPE_S64:
|
||||
case BRIG_TYPE_U64:
|
||||
case BRIG_TYPE_B64:
|
||||
case BRIG_TYPE_F64:
|
||||
return 8;
|
||||
|
||||
case BRIG_TYPE_B1:
|
||||
|
||||
default:
|
||||
fatal("unhandled symbol data type %d", t);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
StorageElement*
|
||||
StorageSpace::addSymbol(const BrigDirectiveVariable *sym,
|
||||
const BrigObject *obj)
|
||||
{
|
||||
const char *sym_name = obj->getString(sym->name);
|
||||
uint64_t size = 0;
|
||||
uint64_t offset = 0;
|
||||
|
||||
if (sym->type & BRIG_TYPE_ARRAY) {
|
||||
size = getBrigDataTypeBytes(sym->type & ~BRIG_TYPE_ARRAY);
|
||||
size *= (((uint64_t)sym->dim.hi) << 32 | (uint64_t)sym->dim.lo);
|
||||
|
||||
offset = roundUp(nextOffset, getBrigDataTypeBytes(sym->type &
|
||||
~BRIG_TYPE_ARRAY));
|
||||
} else {
|
||||
size = getBrigDataTypeBytes(sym->type);
|
||||
offset = roundUp(nextOffset, getBrigDataTypeBytes(sym->type));
|
||||
}
|
||||
|
||||
nextOffset = offset + size;
|
||||
|
||||
DPRINTF(HSAILObject, "Adding %s SYMBOL %s size %d offset 0x%x, init: %d\n",
|
||||
segmentNames[segment], sym_name, size, offset, sym->init);
|
||||
|
||||
StorageElement* se = new StorageElement(sym_name, offset, size, sym);
|
||||
elements.push_back(se);
|
||||
elements_by_addr.insert(AddrRange(offset, offset + size - 1), se);
|
||||
elements_by_brigptr[sym] = se;
|
||||
|
||||
return se;
|
||||
}
|
||||
|
||||
StorageElement*
|
||||
StorageSpace::findSymbol(std::string name)
|
||||
{
|
||||
for (auto it : elements) {
|
||||
if (it->name == name) {
|
||||
return it;
|
||||
}
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
StorageElement*
|
||||
StorageSpace::findSymbol(uint64_t addr)
|
||||
{
|
||||
assert(elements_by_addr.size() > 0);
|
||||
|
||||
auto se = elements_by_addr.find(addr);
|
||||
|
||||
if (se == elements_by_addr.end()) {
|
||||
return nullptr;
|
||||
} else {
|
||||
return se->second;
|
||||
}
|
||||
}
|
||||
|
||||
StorageElement*
|
||||
StorageSpace::findSymbol(const BrigDirectiveVariable *brigptr)
|
||||
{
|
||||
assert(elements_by_brigptr.size() > 0);
|
||||
|
||||
auto se = elements_by_brigptr.find(brigptr);
|
||||
|
||||
if (se == elements_by_brigptr.end()) {
|
||||
return nullptr;
|
||||
} else {
|
||||
return se->second;
|
||||
}
|
||||
}
|
||||
|
||||
StorageMap::StorageMap(StorageMap *outerScope)
|
||||
: outerScopeMap(outerScope)
|
||||
{
|
||||
for (int i = 0; i < NumSegments; ++i)
|
||||
space[i] = new StorageSpace((BrigSegment)i);
|
||||
}
|
||||
|
||||
StorageElement*
|
||||
StorageMap::addSymbol(const BrigDirectiveVariable *sym, const BrigObject *obj)
|
||||
{
|
||||
BrigSegment8_t segment = sym->segment;
|
||||
|
||||
assert(segment >= Brig::BRIG_SEGMENT_FLAT);
|
||||
assert(segment < NumSegments);
|
||||
|
||||
return space[segment]->addSymbol(sym, obj);
|
||||
}
|
||||
|
||||
int
|
||||
StorageMap::getSize(Brig::BrigSegment segment)
|
||||
{
|
||||
assert(segment > Brig::BRIG_SEGMENT_GLOBAL);
|
||||
assert(segment < NumSegments);
|
||||
|
||||
if (segment != Brig::BRIG_SEGMENT_GROUP &&
|
||||
segment != Brig::BRIG_SEGMENT_READONLY) {
|
||||
return space[segment]->getSize();
|
||||
} else {
|
||||
int ret = space[segment]->getSize();
|
||||
|
||||
if (outerScopeMap) {
|
||||
ret += outerScopeMap->getSize(segment);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
StorageMap::resetOffset(Brig::BrigSegment segment)
|
||||
{
|
||||
space[segment]->resetOffset();
|
||||
}
|
||||
|
||||
StorageElement*
|
||||
StorageMap::findSymbol(BrigSegment segment, std::string name)
|
||||
{
|
||||
StorageElement *se = space[segment]->findSymbol(name);
|
||||
|
||||
if (se)
|
||||
return se;
|
||||
|
||||
if (outerScopeMap)
|
||||
return outerScopeMap->findSymbol(segment, name);
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
StorageElement*
|
||||
StorageMap::findSymbol(Brig::BrigSegment segment, uint64_t addr)
|
||||
{
|
||||
StorageSpace *sp = space[segment];
|
||||
|
||||
if (!sp) {
|
||||
// there is no memory in segment?
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
StorageElement *se = sp->findSymbol(addr);
|
||||
|
||||
if (se)
|
||||
return se;
|
||||
|
||||
if (outerScopeMap)
|
||||
return outerScopeMap->findSymbol(segment, addr);
|
||||
|
||||
return nullptr;
|
||||
|
||||
}
|
||||
|
||||
StorageElement*
|
||||
StorageMap::findSymbol(Brig::BrigSegment segment,
|
||||
const BrigDirectiveVariable *brigptr)
|
||||
{
|
||||
StorageSpace *sp = space[segment];
|
||||
|
||||
if (!sp) {
|
||||
// there is no memory in segment?
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
StorageElement *se = sp->findSymbol(brigptr);
|
||||
|
||||
if (se)
|
||||
return se;
|
||||
|
||||
if (outerScopeMap)
|
||||
return outerScopeMap->findSymbol(segment, brigptr);
|
||||
|
||||
return nullptr;
|
||||
|
||||
}
|
447
src/gpu-compute/hsail_code.hh
Normal file
@@ -0,0 +1,447 @@
/*
|
||||
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Steve Reinhardt
|
||||
*/
|
||||
|
||||
#ifndef __HSAIL_CODE_HH__
|
||||
#define __HSAIL_CODE_HH__
|
||||
|
||||
#include <cassert>
|
||||
#include <list>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arch/gpu_decoder.hh"
|
||||
#include "arch/hsail/Brig.h"
|
||||
#include "base/addr_range_map.hh"
|
||||
#include "base/intmath.hh"
|
||||
#include "config/the_gpu_isa.hh"
|
||||
#include "gpu-compute/hsa_code.hh"
|
||||
#include "gpu-compute/hsa_kernel_info.hh"
|
||||
#include "gpu-compute/misc.hh"
|
||||
|
||||
class BrigObject;
|
||||
class GPUStaticInst;
|
||||
|
||||
inline int
|
||||
popcount(uint64_t src, int sz)
|
||||
{
|
||||
int cnt = 0;
|
||||
|
||||
for (int i = 0; i < sz; ++i) {
|
||||
if (src & 1)
|
||||
++cnt;
|
||||
src >>= 1;
|
||||
}
|
||||
|
||||
return cnt;
|
||||
}
|
||||
|
||||
inline int
|
||||
firstbit(uint64_t src, int sz)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < sz; ++i) {
|
||||
if (src & 1)
|
||||
break;
|
||||
src >>= 1;
|
||||
}
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
inline int
|
||||
lastbit(uint64_t src, int sz)
|
||||
{
|
||||
int i0 = -1;
|
||||
|
||||
for (int i = 0; i < sz; ++i) {
|
||||
if (src & 1)
|
||||
i0 = i;
|
||||
src >>= 1;
|
||||
}
|
||||
|
||||
return i0;
|
||||
}
|
||||
|
||||
inline int
|
||||
signbit(uint64_t src, int sz)
|
||||
{
|
||||
int i0 = -1;
|
||||
|
||||
if (src & (1ULL << (sz - 1))) {
|
||||
for (int i = 0; i < sz - 1; ++i) {
|
||||
if (!(src & 1))
|
||||
i0 = i;
|
||||
src >>= 1;
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < sz - 1; ++i) {
|
||||
if (src & 1)
|
||||
i0 = i;
|
||||
src >>= 1;
|
||||
}
|
||||
}
|
||||
|
||||
return i0;
|
||||
}
|
||||
|
||||
inline uint64_t
|
||||
bitrev(uint64_t src, int sz)
|
||||
{
|
||||
uint64_t r = 0;
|
||||
|
||||
for (int i = 0; i < sz; ++i) {
|
||||
r <<= 1;
|
||||
if (src & 1)
|
||||
r |= 1;
|
||||
src >>= 1;
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline uint64_t
|
||||
mul_hi(uint32_t a, uint32_t b)
|
||||
{
|
||||
return ((uint64_t)a * (uint64_t)b) >> 32;
|
||||
}
|
||||
|
||||
inline uint64_t
|
||||
mul_hi(int32_t a, int32_t b)
|
||||
{
|
||||
return ((int64_t)a * (int64_t)b) >> 32;
|
||||
}
|
||||
|
||||
inline uint64_t
|
||||
mul_hi(uint64_t a, uint64_t b)
|
||||
{
|
||||
return ((uint64_t)a * (uint64_t)b) >> 32;
|
||||
}
|
||||
|
||||
inline uint64_t
|
||||
mul_hi(int64_t a, int64_t b)
|
||||
{
|
||||
return ((int64_t)a * (int64_t)b) >> 32;
|
||||
}
|
||||
|
||||
inline uint64_t
|
||||
mul_hi(double a, double b)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
class Label
|
||||
{
|
||||
public:
|
||||
std::string name;
|
||||
int value;
|
||||
|
||||
Label() : value(-1)
|
||||
{
|
||||
}
|
||||
|
||||
bool defined() { return value != -1; }
|
||||
|
||||
void
|
||||
checkName(std::string &_name)
|
||||
{
|
||||
if (name.empty()) {
|
||||
name = _name;
|
||||
} else {
|
||||
assert(name == _name);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
define(std::string &_name, int _value)
|
||||
{
|
||||
assert(!defined());
|
||||
assert(_value != -1);
|
||||
value = _value;
|
||||
checkName(_name);
|
||||
}
|
||||
|
||||
int
|
||||
get()
|
||||
{
|
||||
assert(defined());
|
||||
return value;
|
||||
}
|
||||
};
|
||||
|
||||
class LabelMap
|
||||
{
|
||||
std::map<std::string, Label> map;
|
||||
|
||||
public:
|
||||
LabelMap() { }
|
||||
|
||||
void addLabel(const Brig::BrigDirectiveLabel *lbl, int inst_index,
|
||||
const BrigObject *obj);
|
||||
|
||||
Label *refLabel(const Brig::BrigDirectiveLabel *lbl,
|
||||
const BrigObject *obj);
|
||||
};
|
||||
|
||||
const int NumSegments = Brig::BRIG_SEGMENT_AMD_GCN;
|
||||
|
||||
extern const char *segmentNames[];
|
||||
|
||||
class StorageElement
|
||||
{
|
||||
public:
|
||||
std::string name;
|
||||
uint64_t offset;
|
||||
|
||||
uint64_t size;
|
||||
const Brig::BrigDirectiveVariable *brigSymbol;
|
||||
StorageElement(const char *_name, uint64_t _offset, int _size,
|
||||
const Brig::BrigDirectiveVariable *sym)
|
||||
: name(_name), offset(_offset), size(_size), brigSymbol(sym)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
class StorageSpace
|
||||
{
|
||||
typedef std::map<const Brig::BrigDirectiveVariable*, StorageElement*>
|
||||
DirVarToSE_map;
|
||||
|
||||
std::list<StorageElement*> elements;
|
||||
AddrRangeMap<StorageElement*> elements_by_addr;
|
||||
DirVarToSE_map elements_by_brigptr;
|
||||
|
||||
uint64_t nextOffset;
|
||||
Brig::BrigSegment segment;
|
||||
|
||||
public:
|
||||
StorageSpace(Brig::BrigSegment _class)
|
||||
: nextOffset(0), segment(_class)
|
||||
{
|
||||
}
|
||||
|
||||
StorageElement *addSymbol(const Brig::BrigDirectiveVariable *sym,
|
||||
const BrigObject *obj);
|
||||
|
||||
StorageElement* findSymbol(std::string name);
|
||||
StorageElement* findSymbol(uint64_t addr);
|
||||
StorageElement* findSymbol(const Brig::BrigDirectiveVariable *brigptr);
|
||||
|
||||
int getSize() { return nextOffset; }
|
||||
void resetOffset() { nextOffset = 0; }
|
||||
};
|
||||
|
||||
class StorageMap
|
||||
{
|
||||
StorageMap *outerScopeMap;
|
||||
StorageSpace *space[NumSegments];
|
||||
|
||||
public:
|
||||
StorageMap(StorageMap *outerScope = nullptr);
|
||||
|
||||
StorageElement *addSymbol(const Brig::BrigDirectiveVariable *sym,
|
||||
const BrigObject *obj);
|
||||
|
||||
StorageElement* findSymbol(Brig::BrigSegment segment, std::string name);
|
||||
StorageElement* findSymbol(Brig::BrigSegment segment, uint64_t addr);
|
||||
|
||||
StorageElement* findSymbol(Brig::BrigSegment segment,
|
||||
const Brig::BrigDirectiveVariable *brigptr);
|
||||
|
||||
// overloaded version to avoid casting
|
||||
StorageElement*
|
||||
findSymbol(Brig::BrigSegment8_t segment, std::string name)
|
||||
{
|
||||
return findSymbol((Brig::BrigSegment)segment, name);
|
||||
}
|
||||
|
||||
int getSize(Brig::BrigSegment segment);
|
||||
void resetOffset(Brig::BrigSegment segment);
|
||||
};
|
||||
|
||||
typedef enum
|
||||
{
|
||||
BT_DEFAULT,
|
||||
BT_B8,
|
||||
BT_U8,
|
||||
BT_U16,
|
||||
BT_U32,
|
||||
BT_U64,
|
||||
BT_S8,
|
||||
BT_S16,
|
||||
BT_S32,
|
||||
BT_S64,
|
||||
BT_F16,
|
||||
BT_F32,
|
||||
BT_F64,
|
||||
BT_NULL
|
||||
} base_type_e;
|
||||
|
||||
/**
* @class HsailCode
* The HsailCode class stores information about HSA kernels held in
* the BRIG format. It holds all information about a kernel, function,
* or variable symbol and provides methods for accessing that
* information.
*/
|
||||
|
||||
class HsailCode final : public HsaCode
|
||||
{
|
||||
public:
|
||||
TheGpuISA::Decoder decoder;
|
||||
|
||||
StorageMap *storageMap;
|
||||
LabelMap labelMap;
|
||||
uint32_t kernarg_start;
|
||||
uint32_t kernarg_end;
|
||||
int32_t private_size;
|
||||
|
||||
int32_t readonly_size;
|
||||
|
||||
// We track the maximum register index used for each register
|
||||
// class when we load the code so we can size the register files
|
||||
// appropriately (i.e., one more than the max index).
|
||||
uint32_t max_creg; // maximum c-register index
|
||||
uint32_t max_sreg; // maximum s-register index
|
||||
uint32_t max_dreg; // maximum d-register index
|
||||
|
||||
HsailCode(const std::string &name_str,
|
||||
const Brig::BrigDirectiveExecutable *code_dir,
|
||||
const BrigObject *obj,
|
||||
StorageMap *objStorageMap);
|
||||
|
||||
// this version is used to create a placeholder when
|
||||
// we encounter a kernel-related directive before the
|
||||
// kernel itself
|
||||
HsailCode(const std::string &name_str);
|
||||
|
||||
void init(const Brig::BrigDirectiveExecutable *code_dir,
|
||||
const BrigObject *obj, StorageMap *objStorageMap);
|
||||
|
||||
void
|
||||
generateHsaKernelInfo(HsaKernelInfo *hsaKernelInfo) const
|
||||
{
|
||||
hsaKernelInfo->sRegCount = max_sreg + 1;
|
||||
hsaKernelInfo->dRegCount = max_dreg + 1;
|
||||
hsaKernelInfo->cRegCount = max_creg + 1;
|
||||
|
||||
hsaKernelInfo->static_lds_size = getSize(Brig::BRIG_SEGMENT_GROUP);
|
||||
|
||||
hsaKernelInfo->private_mem_size =
|
||||
roundUp(getSize(Brig::BRIG_SEGMENT_PRIVATE), 8);
|
||||
|
||||
hsaKernelInfo->spill_mem_size =
|
||||
roundUp(getSize(Brig::BRIG_SEGMENT_SPILL), 8);
|
||||
}
|
||||
|
||||
int
|
||||
getSize(MemorySegment segment) const
|
||||
{
|
||||
Brig::BrigSegment brigSeg;
|
||||
|
||||
switch (segment) {
|
||||
case MemorySegment::NONE:
|
||||
brigSeg = Brig::BRIG_SEGMENT_NONE;
|
||||
break;
|
||||
case MemorySegment::FLAT:
|
||||
brigSeg = Brig::BRIG_SEGMENT_FLAT;
|
||||
break;
|
||||
case MemorySegment::GLOBAL:
|
||||
brigSeg = Brig::BRIG_SEGMENT_GLOBAL;
|
||||
break;
|
||||
case MemorySegment::READONLY:
|
||||
brigSeg = Brig::BRIG_SEGMENT_READONLY;
|
||||
break;
|
||||
case MemorySegment::KERNARG:
|
||||
brigSeg = Brig::BRIG_SEGMENT_KERNARG;
|
||||
break;
|
||||
case MemorySegment::GROUP:
|
||||
brigSeg = Brig::BRIG_SEGMENT_GROUP;
|
||||
break;
|
||||
case MemorySegment::PRIVATE:
|
||||
brigSeg = Brig::BRIG_SEGMENT_PRIVATE;
|
||||
break;
|
||||
case MemorySegment::SPILL:
|
||||
brigSeg = Brig::BRIG_SEGMENT_SPILL;
|
||||
break;
|
||||
case MemorySegment::ARG:
|
||||
brigSeg = Brig::BRIG_SEGMENT_ARG;
|
||||
break;
|
||||
case MemorySegment::EXTSPACE0:
|
||||
brigSeg = Brig::BRIG_SEGMENT_AMD_GCN;
|
||||
break;
|
||||
default:
|
||||
fatal("Unknown BrigSegment type.\n");
|
||||
}
|
||||
|
||||
return getSize(brigSeg);
|
||||
}
|
||||
|
||||
private:
|
||||
int
|
||||
getSize(Brig::BrigSegment segment) const
|
||||
{
|
||||
if (segment == Brig::BRIG_SEGMENT_PRIVATE) {
|
||||
// with the code generated by the new HSA compiler this
// assertion no longer holds:
|
||||
//assert(private_size != -1);
|
||||
return private_size;
|
||||
} else {
|
||||
return storageMap->getSize(segment);
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
StorageElement*
|
||||
findSymbol(Brig::BrigSegment segment, uint64_t addr)
|
||||
{
|
||||
return storageMap->findSymbol(segment, addr);
|
||||
}
|
||||
|
||||
void
|
||||
setPrivateSize(int32_t _private_size)
|
||||
{
|
||||
private_size = _private_size;
|
||||
}
|
||||
|
||||
Label*
|
||||
refLabel(const Brig::BrigDirectiveLabel *lbl, const BrigObject *obj)
|
||||
{
|
||||
return labelMap.refLabel(lbl, obj);
|
||||
}
|
||||
};
|
||||
|
||||
#endif // __HSAIL_CODE_HH__
|
296
src/gpu-compute/kernel_cfg.cc
Normal file
@@ -0,0 +1,296 @@
/*
|
||||
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Steve Reinhardt
|
||||
*/
|
||||
|
||||
#include "gpu-compute/kernel_cfg.hh"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <map>
|
||||
#include <string>
|
||||
|
||||
#include "gpu-compute/gpu_static_inst.hh"
|
||||
|
||||
void
|
||||
ControlFlowInfo::assignImmediatePostDominators(
|
||||
const std::vector<GPUStaticInst*>& instructions)
|
||||
{
|
||||
ControlFlowInfo cfg(instructions);
|
||||
cfg.findImmediatePostDominators();
|
||||
}
|
||||
|
||||
|
||||
ControlFlowInfo::ControlFlowInfo(const std::vector<GPUStaticInst*>& insts) :
|
||||
instructions(insts)
|
||||
{
|
||||
createBasicBlocks();
|
||||
connectBasicBlocks();
|
||||
}
|
||||
|
||||
BasicBlock*
|
||||
ControlFlowInfo::basicBlock(int inst_num) const {
|
||||
for (auto& block: basicBlocks) {
|
||||
int first_block_id = block->firstInstruction->instNum();
|
||||
if (inst_num >= first_block_id &&
|
||||
inst_num < first_block_id + block->size) {
|
||||
return block.get();
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
|
||||
GPUStaticInst*
|
||||
ControlFlowInfo::lastInstruction(const BasicBlock* block) const
|
||||
{
|
||||
if (block->isExit()) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return instructions.at(block->firstInstruction->instNum() +
|
||||
block->size - 1);
|
||||
}
|
||||
|
||||
BasicBlock*
|
||||
ControlFlowInfo::postDominator(const BasicBlock* block) const
|
||||
{
|
||||
if (block->isExit()) {
|
||||
return nullptr;
|
||||
}
|
||||
return basicBlock(lastInstruction(block)->ipdInstNum());
|
||||
}
|
||||
|
||||
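// Note added for clarity (not part of the original commit): basic blocks are
// formed with the textbook "leader" scheme; instruction 0, every branch
// target, and every instruction following a branch start a new block, and a
// trailing empty block is appended afterwards to serve as the unique exit
// node.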
void
|
||||
ControlFlowInfo::createBasicBlocks()
|
||||
{
|
||||
assert(!instructions.empty());
|
||||
std::set<int> leaders;
|
||||
// first instruction is a leader
|
||||
leaders.insert(0);
|
||||
for (int i = 1; i < instructions.size(); i++) {
|
||||
GPUStaticInst* instruction = instructions[i];
|
||||
if (instruction->o_type == Enums::OT_BRANCH) {
|
||||
const int target_pc = instruction->getTargetPc();
|
||||
leaders.insert(target_pc);
|
||||
leaders.insert(i + 1);
|
||||
}
|
||||
}
|
||||
|
||||
size_t block_size = 0;
|
||||
for (int i = 0; i < instructions.size(); i++) {
|
||||
if (leaders.find(i) != leaders.end()) {
|
||||
uint32_t id = basicBlocks.size();
|
||||
if (id > 0) {
|
||||
basicBlocks.back()->size = block_size;
|
||||
}
|
||||
block_size = 0;
|
||||
basicBlocks.emplace_back(new BasicBlock(id, instructions[i]));
|
||||
}
|
||||
block_size++;
|
||||
}
|
||||
basicBlocks.back()->size = block_size;
|
||||
// exit basic block
|
||||
basicBlocks.emplace_back(new BasicBlock(basicBlocks.size(), nullptr));
|
||||
}
|
||||
|
||||
void
|
||||
ControlFlowInfo::connectBasicBlocks()
|
||||
{
|
||||
BasicBlock* exit_bb = basicBlocks.back().get();
|
||||
for (auto& bb : basicBlocks) {
|
||||
if (bb->isExit()) {
|
||||
break;
|
||||
}
|
||||
GPUStaticInst* last = lastInstruction(bb.get());
|
||||
if (last->o_type == Enums::OT_RET) {
|
||||
bb->successorIds.insert(exit_bb->id);
|
||||
break;
|
||||
}
|
||||
if (last->o_type == Enums::OT_BRANCH) {
|
||||
const uint32_t target_pc = last->getTargetPc();
|
||||
BasicBlock* target_bb = basicBlock(target_pc);
|
||||
bb->successorIds.insert(target_bb->id);
|
||||
}
|
||||
|
||||
// Unconditional jump instructions have a unique successor
|
||||
if (!last->unconditionalJumpInstruction()) {
|
||||
BasicBlock* next_bb = basicBlock(last->instNum() + 1);
|
||||
bb->successorIds.insert(next_bb->id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// In-place set intersection
|
||||
static void
|
||||
intersect(std::set<uint32_t>& a, const std::set<uint32_t>& b)
|
||||
{
|
||||
std::set<uint32_t>::iterator it = a.begin();
|
||||
while (it != a.end()) {
|
||||
it = b.find(*it) != b.end() ? ++it : a.erase(it);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
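// Note added for clarity (not part of the original commit):
// findPostDominators() solves the standard backward dataflow equations
//
//     PDOM(exit) = { exit }
//     PDOM(b)    = { b } union ( intersection over successors s of PDOM(s) )
//
// by initializing every non-exit block to the full set of block ids and
// iterating to a fixed point; intersect() above implements the meet.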
void
|
||||
ControlFlowInfo::findPostDominators()
|
||||
{
|
||||
// the only postdominator of the exit block is itself
|
||||
basicBlocks.back()->postDominatorIds.insert(basicBlocks.back()->id);
|
||||
//copy all basic blocks to all postdominator lists except for exit block
|
||||
for (auto& block : basicBlocks) {
|
||||
if (!block->isExit()) {
|
||||
for (uint32_t i = 0; i < basicBlocks.size(); i++) {
|
||||
block->postDominatorIds.insert(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool change = true;
|
||||
while (change) {
|
||||
change = false;
|
||||
for (int h = basicBlocks.size() - 2; h >= 0; --h) {
|
||||
size_t num_postdominators =
|
||||
basicBlocks[h]->postDominatorIds.size();
|
||||
for (int s : basicBlocks[h]->successorIds) {
|
||||
intersect(basicBlocks[h]->postDominatorIds,
|
||||
basicBlocks[s]->postDominatorIds);
|
||||
}
|
||||
basicBlocks[h]->postDominatorIds.insert(h);
|
||||
change |= (num_postdominators
|
||||
!= basicBlocks[h]->postDominatorIds.size());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// In-place set difference
|
||||
static void
|
||||
setDifference(std::set<uint32_t>&a,
|
||||
const std::set<uint32_t>& b, uint32_t exception)
|
||||
{
|
||||
for (uint32_t b_elem : b) {
|
||||
if (b_elem != exception) {
|
||||
a.erase(b_elem);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ControlFlowInfo::findImmediatePostDominators()
|
||||
{
|
||||
assert(basicBlocks.size() > 1); // Entry and exit blocks must be present
|
||||
|
||||
findPostDominators();
|
||||
|
||||
for (auto& basicBlock : basicBlocks) {
|
||||
if (basicBlock->isExit()) {
|
||||
continue;
|
||||
}
|
||||
std::set<uint32_t> candidates = basicBlock->postDominatorIds;
|
||||
candidates.erase(basicBlock->id);
|
||||
for (uint32_t postDominatorId : basicBlock->postDominatorIds) {
|
||||
if (postDominatorId != basicBlock->id) {
|
||||
setDifference(candidates,
|
||||
basicBlocks[postDominatorId]->postDominatorIds,
|
||||
postDominatorId);
|
||||
}
|
||||
}
|
||||
assert(candidates.size() == 1);
|
||||
GPUStaticInst* last_instruction = lastInstruction(basicBlock.get());
|
||||
BasicBlock* ipd_block = basicBlocks[*(candidates.begin())].get();
|
||||
if (!ipd_block->isExit()) {
|
||||
GPUStaticInst* ipd_first_inst = ipd_block->firstInstruction;
|
||||
last_instruction->ipdInstNum(ipd_first_inst->instNum());
|
||||
} else {
|
||||
last_instruction->ipdInstNum(last_instruction->instNum() + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ControlFlowInfo::printPostDominators() const
|
||||
{
|
||||
for (auto& block : basicBlocks) {
|
||||
std::cout << "PD(" << block->id << ") = {";
|
||||
std::copy(block->postDominatorIds.begin(),
|
||||
block->postDominatorIds.end(),
|
||||
std::ostream_iterator<uint32_t>(std::cout, ", "));
|
||||
std::cout << "}" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ControlFlowInfo::printImmediatePostDominators() const
|
||||
{
|
||||
for (const auto& block : basicBlocks) {
|
||||
if (block->isExit()) {
|
||||
continue;
|
||||
}
|
||||
std::cout << "IPD(" << block->id << ") = ";
|
||||
std::cout << postDominator(block.get())->id << ", ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
void
|
||||
ControlFlowInfo::printBasicBlocks() const
|
||||
{
|
||||
for (GPUStaticInst* inst : instructions) {
|
||||
int inst_num = inst->instNum();
|
||||
std::cout << inst_num << " [" << basicBlock(inst_num)->id
|
||||
<< "]: " << inst->disassemble();
|
||||
if (inst->o_type == Enums::OT_BRANCH) {
|
||||
std::cout << ", PC = " << inst->getTargetPc();
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ControlFlowInfo::printBasicBlockDot() const
|
||||
{
|
||||
printf("digraph {\n");
|
||||
for (const auto& basic_block : basicBlocks) {
|
||||
printf("\t");
|
||||
for (uint32_t successorId : basic_block->successorIds) {
|
||||
printf("%d -> %d; ", basic_block->id, successorId);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
printf("}\n");
|
||||
}
|
133
src/gpu-compute/kernel_cfg.hh
Normal file
@@ -0,0 +1,133 @@
/*
|
||||
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Steve Reinhardt
|
||||
*/
|
||||
|
||||
#ifndef __KERNEL_CFG_HH__
|
||||
#define __KERNEL_CFG_HH__
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
|
||||
|
||||
class GPUStaticInst;
|
||||
class HsailCode;
|
||||
|
||||
struct BasicBlock
|
||||
{
|
||||
BasicBlock(uint32_t num, GPUStaticInst* begin) :
|
||||
id(num), size(0), firstInstruction(begin)
|
||||
{
|
||||
}
|
||||
|
||||
bool
|
||||
isEntry() const
|
||||
{
|
||||
return !id;
|
||||
}
|
||||
|
||||
bool
|
||||
isExit() const
|
||||
{
|
||||
return !size;
|
||||
}
|
||||
|
||||
/**
|
||||
* Unique identifier for the block within a given kernel.
|
||||
*/
|
||||
const uint32_t id;
|
||||
|
||||
/**
|
||||
* Number of instructions contained in the block
|
||||
*/
|
||||
size_t size;
|
||||
|
||||
/**
|
||||
* Pointer to first instruction of the block.
|
||||
*/
|
||||
GPUStaticInst* firstInstruction;
|
||||
|
||||
/**
* Identifiers of the blocks that immediately follow (are direct
* successors of) this block.
*/
|
||||
std::set<uint32_t> successorIds;
|
||||
|
||||
/**
* Identifiers of this block's post-dominators, i.e. the blocks that
* appear on every path from this block to the exit block.
*/
|
||||
std::set<uint32_t> postDominatorIds;
|
||||
};
|
||||
|
||||
class ControlFlowInfo
|
||||
{
|
||||
public:
|
||||
|
||||
/**
|
||||
* Compute immediate post-dominator instruction for kernel instructions.
|
||||
*/
|
||||
static void assignImmediatePostDominators(
|
||||
const std::vector<GPUStaticInst*>& instructions);
|
||||
|
||||
private:
|
||||
ControlFlowInfo(const std::vector<GPUStaticInst*>& instructions);
|
||||
|
||||
GPUStaticInst* lastInstruction(const BasicBlock* block) const;
|
||||
|
||||
BasicBlock* basicBlock(int inst_num) const;
|
||||
|
||||
BasicBlock* postDominator(const BasicBlock* block) const;
|
||||
|
||||
void createBasicBlocks();
|
||||
|
||||
void connectBasicBlocks();
|
||||
|
||||
void findPostDominators();
|
||||
|
||||
void findImmediatePostDominators();
|
||||
|
||||
void printBasicBlocks() const;
|
||||
|
||||
void printBasicBlockDot() const;
|
||||
|
||||
void printPostDominators() const;
|
||||
|
||||
void printImmediatePostDominators() const;
|
||||
|
||||
std::vector<std::unique_ptr<BasicBlock>> basicBlocks;
|
||||
std::vector<GPUStaticInst*> instructions;
|
||||
};
|
||||
|
||||
#endif // __KERNEL_CFG_HH__
|
341
src/gpu-compute/lds_state.cc
Normal file
@@ -0,0 +1,341 @@
/*
|
||||
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: John Kalamatianos, Joe Gross
|
||||
*/
|
||||
|
||||
#include "gpu-compute/lds_state.hh"
|
||||
|
||||
#include <array>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "gpu-compute/compute_unit.hh"
|
||||
#include "gpu-compute/gpu_dyn_inst.hh"
|
||||
#include "gpu-compute/shader.hh"
|
||||
|
||||
/**
|
||||
* the default constructor that works with SWIG
|
||||
*/
|
||||
LdsState::LdsState(const Params *params) :
|
||||
MemObject(params),
|
||||
tickEvent(this),
|
||||
cuPort(name() + ".port", this),
|
||||
maximumSize(params->size),
|
||||
range(params->range),
|
||||
bankConflictPenalty(params->bankConflictPenalty),
|
||||
banks(params->banks)
|
||||
{
|
||||
fatal_if(params->banks <= 0,
"Number of LDS banks should be a positive number");
|
||||
fatal_if((params->banks & (params->banks - 1)) != 0,
|
||||
"Number of LDS banks should be a power of 2");
|
||||
fatal_if(params->size <= 0,
|
||||
"cannot allocate an LDS with a size less than 1");
|
||||
fatal_if(params->size % 2,
"the LDS size should be an even number");
|
||||
}
|
||||
|
||||
/**
|
||||
* Needed by the SWIG compiler
|
||||
*/
|
||||
LdsState *
|
||||
LdsStateParams::create()
|
||||
{
|
||||
return new LdsState(this);
|
||||
}
|
||||
|
||||
/**
|
||||
* set the parent and name based on the parent
|
||||
*/
|
||||
void
|
||||
LdsState::setParent(ComputeUnit *x_parent)
|
||||
{
|
||||
// check that this gets assigned to the same thing each time
|
||||
fatal_if(!x_parent, "x_parent should not be nullptr");
|
||||
fatal_if(x_parent == parent,
|
||||
"should not be setting the parent twice");
|
||||
|
||||
parent = x_parent;
|
||||
_name = x_parent->name() + ".LdsState";
|
||||
}
|
||||
|
||||
/**
* recover the GPU dynamic instruction from the packet's sender state
* and then count the bank conflicts it causes
*/
|
||||
unsigned
|
||||
LdsState::countBankConflicts(PacketPtr packet, unsigned *bankAccesses)
|
||||
{
|
||||
Packet::SenderState *baseSenderState = packet->senderState;
|
||||
while (baseSenderState->predecessor) {
|
||||
baseSenderState = baseSenderState->predecessor;
|
||||
}
|
||||
const ComputeUnit::LDSPort::SenderState *senderState =
|
||||
dynamic_cast<ComputeUnit::LDSPort::SenderState *>(baseSenderState);
|
||||
|
||||
fatal_if(!senderState,
|
||||
"did not get the right sort of sender state");
|
||||
|
||||
GPUDynInstPtr gpuDynInst = senderState->getMemInst();
|
||||
|
||||
return countBankConflicts(gpuDynInst, bankAccesses);
|
||||
}
|
||||
|
||||
// Count the total number of bank conflicts for the local memory packet
|
||||
unsigned
|
||||
LdsState::countBankConflicts(GPUDynInstPtr gpuDynInst,
|
||||
unsigned *numBankAccesses)
|
||||
{
|
||||
int bank_conflicts = 0;
|
||||
std::vector<int> bank;
|
||||
// the number of LDS banks being touched by the memory instruction
|
||||
int numBanks = std::min(parent->wfSize(), banks);
|
||||
// if the wavefront size is larger than the number of LDS banks, we
|
||||
// need to iterate over all work items to calculate the total
|
||||
// number of bank conflicts
|
||||
int groups = (parent->wfSize() > numBanks) ?
|
||||
(parent->wfSize() / numBanks) : 1;
|
||||
for (int i = 0; i < groups; i++) {
|
||||
// Address Array holding all the work item addresses of an instruction
|
||||
std::vector<Addr> addr_array;
|
||||
addr_array.resize(numBanks, 0);
|
||||
bank.clear();
|
||||
bank.resize(banks, 0);
|
||||
int max_bank = 0;
|
||||
|
||||
// populate the address array for all active work items
|
||||
for (int j = 0; j < numBanks; j++) {
|
||||
if (gpuDynInst->exec_mask[(i*numBanks)+j]) {
|
||||
addr_array[j] = gpuDynInst->addr[(i*numBanks)+j];
|
||||
} else {
|
||||
addr_array[j] = std::numeric_limits<Addr>::max();
|
||||
}
|
||||
}
|
||||
|
||||
if (gpuDynInst->m_op == Enums::MO_LD ||
|
||||
gpuDynInst->m_op == Enums::MO_ST) {
|
||||
// mask identical addresses
|
||||
for (int j = 0; j < numBanks; ++j) {
|
||||
for (int j0 = 0; j0 < j; j0++) {
|
||||
if (addr_array[j] != std::numeric_limits<Addr>::max()
|
||||
&& addr_array[j] == addr_array[j0]) {
|
||||
addr_array[j] = std::numeric_limits<Addr>::max();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// calculate bank conflicts
|
||||
for (int j = 0; j < numBanks; ++j) {
|
||||
if (addr_array[j] != std::numeric_limits<Addr>::max()) {
|
||||
int bankId = addr_array[j] % banks;
|
||||
bank[bankId]++;
|
||||
max_bank = std::max(max_bank, bank[bankId]);
|
||||
// Count the number of LDS banks accessed.
|
||||
// Since we have masked identical addresses all remaining
|
||||
// accesses will need to be serialized if they access
|
||||
// the same bank (bank conflict).
|
||||
(*numBankAccesses)++;
|
||||
}
|
||||
}
|
||||
bank_conflicts += max_bank;
|
||||
}
|
||||
panic_if(bank_conflicts > parent->wfSize(),
|
||||
"Max bank conflicts should match num of work items per instr");
|
||||
return bank_conflicts;
|
||||
}
|
||||
|
||||
/**
|
||||
* receive the packet from the CU
|
||||
*/
|
||||
bool
|
||||
LdsState::CuSidePort::recvTimingReq(PacketPtr packet)
|
||||
{
|
||||
return ownerLds->processPacket(packet);
|
||||
}
|
||||
|
||||
GPUDynInstPtr
|
||||
LdsState::getDynInstr(PacketPtr packet)
|
||||
{
|
||||
ComputeUnit::LDSPort::SenderState *ss =
|
||||
dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
|
||||
packet->senderState);
|
||||
return ss->getMemInst();
|
||||
}
|
||||
|
||||
/**
|
||||
* process an incoming packet, add it to the return queue
|
||||
*/
|
||||
bool
|
||||
LdsState::processPacket(PacketPtr packet)
|
||||
{
|
||||
unsigned bankAccesses = 0;
|
||||
// the number of conflicts this packet will have when accessing the LDS
|
||||
unsigned bankConflicts = countBankConflicts(packet, &bankAccesses);
|
||||
// count the total number of physical LDS bank accessed
|
||||
parent->ldsBankAccesses += bankAccesses;
|
||||
// record the LDS bank conflicts; a value of 1 means at most one
// access per bank, i.e. no bank conflicts occurred
|
||||
parent->ldsBankConflictDist.sample(bankConflicts-1);
|
||||
|
||||
GPUDynInstPtr dynInst = getDynInstr(packet);
|
||||
// account for the LDS bank conflict overhead
|
||||
int busLength = (dynInst->m_op == Enums::MO_LD) ? parent->loadBusLength() :
|
||||
(dynInst->m_op == Enums::MO_ST) ? parent->storeBusLength() :
|
||||
parent->loadBusLength();
|
||||
// delay for accessing the LDS
|
||||
Tick processingTime =
|
||||
parent->shader->ticks(bankConflicts * bankConflictPenalty) +
|
||||
parent->shader->ticks(busLength);
|
||||
// choose (delay + last packet in queue) or (now + delay) as the time to
|
||||
// return this
|
||||
Tick doneAt = earliestReturnTime() + processingTime;
|
||||
// then store it for processing
|
||||
return returnQueuePush(std::make_pair(doneAt, packet));
|
||||
}
|
||||
|
||||
/**
|
||||
* add this to the queue of packets to be returned
|
||||
*/
|
||||
bool
|
||||
LdsState::returnQueuePush(std::pair<Tick, PacketPtr> thePair)
|
||||
{
|
||||
// TODO add time limits (e.g. one packet per cycle) and queue size limits
|
||||
// and implement flow control
|
||||
returnQueue.push(thePair);
|
||||
|
||||
// if there is no set wakeup time, look through the queue
|
||||
if (!tickEvent.scheduled()) {
|
||||
process();
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* receive a packet in functional mode
|
||||
*/
|
||||
void
|
||||
LdsState::CuSidePort::recvFunctional(PacketPtr pkt)
|
||||
{
|
||||
fatal("not implemented");
|
||||
}
|
||||
|
||||
/**
|
||||
* receive a retry for a response
|
||||
*/
|
||||
void
|
||||
LdsState::CuSidePort::recvRespRetry()
|
||||
{
|
||||
// TODO verify that this is the right way to do this
|
||||
assert(ownerLds->isRetryResp());
|
||||
ownerLds->setRetryResp(false);
|
||||
ownerLds->process();
|
||||
}
|
||||
|
||||
/**
|
||||
* receive a retry
|
||||
*/
|
||||
void
|
||||
LdsState::CuSidePort::recvRetry()
|
||||
{
|
||||
fatal("not implemented");
|
||||
}
|
||||
|
||||
/**
|
||||
* look for packets to return at this time
|
||||
*/
|
||||
bool
|
||||
LdsState::process()
|
||||
{
|
||||
Tick now = clockEdge();
|
||||
|
||||
// send back completed packets
|
||||
while (!returnQueue.empty() && returnQueue.front().first <= now) {
|
||||
PacketPtr packet = returnQueue.front().second;
|
||||
|
||||
ComputeUnit::LDSPort::SenderState *ss =
|
||||
dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
|
||||
packet->senderState);
|
||||
|
||||
GPUDynInstPtr gpuDynInst = ss->getMemInst();
|
||||
|
||||
gpuDynInst->initiateAcc(gpuDynInst);
|
||||
|
||||
packet->makeTimingResponse();
|
||||
|
||||
returnQueue.pop();
|
||||
|
||||
bool success = cuPort.sendTimingResp(packet);
|
||||
|
||||
if (!success) {
|
||||
retryResp = true;
|
||||
panic("have not handled timing responses being NACK'd when sent "
"back");
|
||||
}
|
||||
}
|
||||
|
||||
// determine the next wakeup time
|
||||
if (!returnQueue.empty()) {
|
||||
|
||||
Tick next = returnQueue.front().first;
|
||||
|
||||
if (tickEvent.scheduled()) {
|
||||
|
||||
if (next < tickEvent.when()) {
|
||||
|
||||
tickEvent.deschedule();
|
||||
tickEvent.schedule(next);
|
||||
}
|
||||
} else {
|
||||
tickEvent.schedule(next);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* wake up at this time and perform specified actions
|
||||
*/
|
||||
void
|
||||
LdsState::TickEvent::process()
|
||||
{
|
||||
ldsState->process();
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
void
|
||||
LdsState::regStats()
|
||||
{
|
||||
}
|
512
src/gpu-compute/lds_state.hh
Normal file
@@ -0,0 +1,512 @@
/*
|
||||
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: John Kalamatianos, Joe Gross
|
||||
*/
|
||||
|
||||
#ifndef __LDS_STATE_HH__
|
||||
#define __LDS_STATE_HH__
|
||||
|
||||
#include <array>
|
||||
#include <queue>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "enums/MemOpType.hh"
|
||||
#include "enums/MemType.hh"
|
||||
#include "gpu-compute/misc.hh"
|
||||
#include "mem/mem_object.hh"
|
||||
#include "mem/port.hh"
|
||||
#include "params/LdsState.hh"
|
||||
|
||||
class ComputeUnit;
|
||||
|
||||
/**
|
||||
* this represents a slice of the overall LDS, intended to be associated with an
|
||||
* individual workgroup
|
||||
*/
|
||||
class LdsChunk
|
||||
{
|
||||
public:
|
||||
LdsChunk(const uint32_t x_size):
|
||||
chunk(x_size)
|
||||
{
|
||||
}
|
||||
|
||||
LdsChunk() {}
|
||||
|
||||
/**
|
||||
* a read operation
|
||||
*/
|
||||
template<class T>
|
||||
T
|
||||
read(const uint32_t index)
|
||||
{
|
||||
fatal_if(!chunk.size(), "cannot read from an LDS chunk of size 0");
|
||||
fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk");
|
||||
T *p0 = (T *) (&(chunk.at(index)));
|
||||
return *p0;
|
||||
}
|
||||
|
||||
/**
|
||||
* a write operation
|
||||
*/
|
||||
template<class T>
|
||||
void
|
||||
write(const uint32_t index, const T value)
|
||||
{
|
||||
fatal_if(!chunk.size(), "cannot write to an LDS chunk of size 0");
|
||||
fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk");
|
||||
T *p0 = (T *) (&(chunk.at(index)));
|
||||
*p0 = value;
|
||||
}
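// Usage sketch (illustrative, 'ldsChunk' is a hypothetical local): indices
// are byte offsets into the chunk, so a 32-bit word at offset 16 of a
// work-group's allocation would be accessed as
//
//     uint32_t v = ldsChunk.read<uint32_t>(16);
//     ldsChunk.write<uint32_t>(16, v + 1);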
|
||||
|
||||
/**
|
||||
* get the size of this chunk
|
||||
*/
|
||||
std::vector<uint8_t>::size_type
|
||||
size() const
|
||||
{
|
||||
return chunk.size();
|
||||
}
|
||||
|
||||
protected:
|
||||
// the actual data store for this slice of the LDS
|
||||
std::vector<uint8_t> chunk;
|
||||
};
|
||||
|
||||
// Local Data Share (LDS) State per Wavefront (contents of the LDS region
|
||||
// allocated to the WorkGroup of this Wavefront)
|
||||
class LdsState: public MemObject
|
||||
{
|
||||
protected:
|
||||
|
||||
/**
|
||||
* an event to allow event-driven execution
|
||||
*/
|
||||
class TickEvent: public Event
|
||||
{
|
||||
protected:
|
||||
|
||||
LdsState *ldsState = nullptr;
|
||||
|
||||
Tick nextTick = 0;
|
||||
|
||||
public:
|
||||
|
||||
TickEvent(LdsState *_ldsState) :
|
||||
ldsState(_ldsState)
|
||||
{
|
||||
}
|
||||
|
||||
virtual void
|
||||
process();
|
||||
|
||||
void
|
||||
schedule(Tick when)
|
||||
{
|
||||
mainEventQueue[0]->schedule(this, when);
|
||||
}
|
||||
|
||||
void
|
||||
deschedule()
|
||||
{
|
||||
mainEventQueue[0]->deschedule(this);
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* CuSidePort is the LDS Port closer to the CU side
|
||||
*/
|
||||
class CuSidePort: public SlavePort
|
||||
{
|
||||
public:
|
||||
CuSidePort(const std::string &_name, LdsState *_ownerLds) :
|
||||
SlavePort(_name, _ownerLds), ownerLds(_ownerLds)
|
||||
{
|
||||
}
|
||||
|
||||
protected:
|
||||
LdsState *ownerLds;
|
||||
|
||||
virtual bool
|
||||
recvTimingReq(PacketPtr pkt);
|
||||
|
||||
virtual Tick
|
||||
recvAtomic(PacketPtr pkt)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
virtual void
|
||||
recvFunctional(PacketPtr pkt);
|
||||
|
||||
virtual void
|
||||
recvRangeChange()
|
||||
{
|
||||
}
|
||||
|
||||
virtual void
|
||||
recvRetry();
|
||||
|
||||
virtual void
|
||||
recvRespRetry();
|
||||
|
||||
virtual AddrRangeList
|
||||
getAddrRanges() const
|
||||
{
|
||||
AddrRangeList ranges;
|
||||
ranges.push_back(ownerLds->getAddrRange());
|
||||
return ranges;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void
|
||||
loadData(PacketPtr packet);
|
||||
|
||||
template<typename T>
|
||||
void
|
||||
storeData(PacketPtr packet);
|
||||
|
||||
template<typename T>
|
||||
void
|
||||
atomicOperation(PacketPtr packet);
|
||||
};
|
||||
|
||||
protected:
|
||||
|
||||
// the lds reference counter
|
||||
// The key is the workgroup ID and dispatch ID
|
||||
// The value is the number of wavefronts that reference this LDS. As
// wavefronts are launched the counter goes up for that workgroup, and when
// they return it decreases. Once it reaches 0, this chunk of the LDS is
// returned to the available pool. However, it is deallocated on the 1->0
// transition, not whenever the counter is 0, since it always starts at 0
// when the workgroup first asks for space
|
||||
std::unordered_map<uint32_t,
|
||||
std::unordered_map<uint32_t, int32_t>> refCounter;
|
||||
|
||||
// the map that allows workgroups to access their own chunk of the LDS
|
||||
std::unordered_map<uint32_t,
|
||||
std::unordered_map<uint32_t, LdsChunk>> chunkMap;
|
||||
|
||||
// an event to allow the LDS to wake up at a specified time
|
||||
TickEvent tickEvent;
|
||||
|
||||
// the queue of packets that are going back to the CU after a
|
||||
// read/write/atomic op
|
||||
// TODO need to make this have a maximum size to create flow control
|
||||
std::queue<std::pair<Tick, PacketPtr>> returnQueue;
|
||||
|
||||
// whether or not there are pending responses
|
||||
bool retryResp = false;
|
||||
|
||||
bool
|
||||
process();
|
||||
|
||||
GPUDynInstPtr
|
||||
getDynInstr(PacketPtr packet);
|
||||
|
||||
bool
|
||||
processPacket(PacketPtr packet);
|
||||
|
||||
unsigned
|
||||
countBankConflicts(PacketPtr packet, unsigned *bankAccesses);
|
||||
|
||||
unsigned
|
||||
countBankConflicts(GPUDynInstPtr gpuDynInst,
|
||||
unsigned *numBankAccesses);
|
||||
|
||||
public:
|
||||
typedef LdsStateParams Params;
|
||||
|
||||
LdsState(const Params *params);
|
||||
|
||||
// prevent copy construction
|
||||
LdsState(const LdsState&) = delete;
|
||||
|
||||
~LdsState()
|
||||
{
|
||||
parent = nullptr;
|
||||
}
|
||||
|
||||
const Params *
|
||||
params() const
|
||||
{
|
||||
return dynamic_cast<const Params *>(_params);
|
||||
}
|
||||
|
||||
bool
|
||||
isRetryResp() const
|
||||
{
|
||||
return retryResp;
|
||||
}
|
||||
|
||||
void
|
||||
setRetryResp(const bool value)
|
||||
{
|
||||
retryResp = value;
|
||||
}
|
||||
|
||||
// prevent assignment
|
||||
LdsState &
|
||||
operator=(const LdsState &) = delete;
|
||||
|
||||
/**
|
||||
* use the dynamic wave id to create or just increase the reference count
|
||||
*/
|
||||
int
|
||||
increaseRefCounter(const uint32_t dispatchId, const uint32_t wgId)
|
||||
{
|
||||
int refCount = getRefCounter(dispatchId, wgId);
|
||||
fatal_if(refCount < 0,
|
||||
"reference count should not be below zero");
|
||||
return ++refCounter[dispatchId][wgId];
|
||||
}
|
||||
|
||||
/**
|
||||
* decrease the reference count after making sure it is in the list
|
||||
* give back this chunk if the ref counter has reached 0
|
||||
*/
|
||||
int
|
||||
decreaseRefCounter(const uint32_t dispatchId, const uint32_t wgId)
|
||||
{
|
||||
int refCount = getRefCounter(dispatchId, wgId);
|
||||
|
||||
fatal_if(refCount <= 0,
    "reference count must be greater than zero to decrement");
|
||||
|
||||
refCounter[dispatchId][wgId]--;
|
||||
|
||||
if (refCounter[dispatchId][wgId] == 0) {
|
||||
releaseSpace(dispatchId, wgId);
|
||||
return 0;
|
||||
} else {
|
||||
return refCounter[dispatchId][wgId];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* return the current reference count for this workgroup id
|
||||
*/
|
||||
int
|
||||
getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
|
||||
{
|
||||
auto dispatchIter = chunkMap.find(dispatchId);
|
||||
fatal_if(dispatchIter == chunkMap.end(),
|
||||
"could not locate this dispatch id [%d]", dispatchId);
|
||||
|
||||
auto workgroup = dispatchIter->second.find(wgId);
|
||||
fatal_if(workgroup == dispatchIter->second.end(),
|
||||
"could not find this workgroup id within this dispatch id"
|
||||
" did[%d] wgid[%d]", dispatchId, wgId);
|
||||
|
||||
auto refCountIter = refCounter.find(dispatchId);
|
||||
if (refCountIter == refCounter.end()) {
|
||||
fatal("could not locate this dispatch id [%d]", dispatchId);
|
||||
} else {
|
||||
auto workgroup = refCountIter->second.find(wgId);
|
||||
if (workgroup == refCountIter->second.end()) {
|
||||
fatal("could not find this workgroup id within this dispatch id"
|
||||
" did[%d] wgid[%d]", dispatchId, wgId);
|
||||
} else {
|
||||
return refCounter.at(dispatchId).at(wgId);
|
||||
}
|
||||
}
|
||||
|
||||
fatal("should not reach this point");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* assign a parent and request this amount of space be set aside
|
||||
* for this wgid
|
||||
*/
|
||||
LdsChunk *
|
||||
reserveSpace(const uint32_t dispatchId, const uint32_t wgId,
|
||||
const uint32_t size)
|
||||
{
|
||||
if (chunkMap.find(dispatchId) != chunkMap.end()) {
|
||||
fatal_if(
|
||||
chunkMap[dispatchId].find(wgId) != chunkMap[dispatchId].end(),
|
||||
"duplicate workgroup ID asking for space in the LDS "
|
||||
"did[%d] wgid[%d]", dispatchId, wgId);
|
||||
}
|
||||
|
||||
fatal_if(bytesAllocated + size > maximumSize,
|
||||
"request would ask for more space than is available");
|
||||
|
||||
bytesAllocated += size;
|
||||
|
||||
chunkMap[dispatchId].emplace(wgId, LdsChunk(size));
|
||||
// make an entry for this workgroup
|
||||
refCounter[dispatchId][wgId] = 0;
|
||||
|
||||
return &chunkMap[dispatchId][wgId];
|
||||
}
|
||||
|
||||
bool
|
||||
returnQueuePush(std::pair<Tick, PacketPtr> thePair);
|
||||
|
||||
Tick
|
||||
earliestReturnTime() const
|
||||
{
|
||||
// TODO set to max(lastCommand+1, curTick())
|
||||
return returnQueue.empty() ? curTick() : returnQueue.back().first;
|
||||
}
|
||||
|
||||
void
|
||||
setParent(ComputeUnit *x_parent);
|
||||
|
||||
void
|
||||
regStats();
|
||||
|
||||
// accessors
|
||||
ComputeUnit *
|
||||
getParent() const
|
||||
{
|
||||
return parent;
|
||||
}
|
||||
|
||||
std::string
|
||||
getName()
|
||||
{
|
||||
return _name;
|
||||
}
|
||||
|
||||
int
|
||||
getBanks() const
|
||||
{
|
||||
return banks;
|
||||
}
|
||||
|
||||
ComputeUnit *
|
||||
getComputeUnit() const
|
||||
{
|
||||
return parent;
|
||||
}
|
||||
|
||||
int
|
||||
getBankConflictPenalty() const
|
||||
{
|
||||
return bankConflictPenalty;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the allocated size for this workgroup
|
||||
*/
|
||||
std::size_t
|
||||
ldsSize(const uint32_t x_wgId)
|
||||
{
|
||||
return chunkMap[x_wgId].size();
|
||||
}
|
||||
|
||||
AddrRange
|
||||
getAddrRange() const
|
||||
{
|
||||
return range;
|
||||
}
|
||||
|
||||
virtual BaseSlavePort &
|
||||
getSlavePort(const std::string& if_name, PortID idx)
|
||||
{
|
||||
if (if_name == "cuPort") {
|
||||
// TODO need to set name dynamically at this point?
|
||||
return cuPort;
|
||||
} else {
|
||||
fatal("cannot resolve the port name " + if_name);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* can this much space be reserved for a workgroup?
|
||||
*/
|
||||
bool
|
||||
canReserve(uint32_t x_size) const
|
||||
{
|
||||
return bytesAllocated + x_size <= maximumSize;
|
||||
}
|
||||
|
||||
private:
|
||||
/**
|
||||
* give back the space
|
||||
*/
|
||||
bool
|
||||
releaseSpace(const uint32_t x_dispatchId, const uint32_t x_wgId)
|
||||
{
|
||||
auto dispatchIter = chunkMap.find(x_dispatchId);
|
||||
|
||||
if (dispatchIter == chunkMap.end()) {
|
||||
fatal("dispatch id not found [%d]", x_dispatchId);
|
||||
} else {
|
||||
auto workgroupIter = dispatchIter->second.find(x_wgId);
|
||||
if (workgroupIter == dispatchIter->second.end()) {
|
||||
fatal("workgroup id [%d] not found in dispatch id [%d]",
|
||||
x_wgId, x_dispatchId);
|
||||
}
|
||||
}
|
||||
|
||||
fatal_if(bytesAllocated < chunkMap[x_dispatchId][x_wgId].size(),
|
||||
"releasing more space than was allocated");
|
||||
|
||||
bytesAllocated -= chunkMap[x_dispatchId][x_wgId].size();
|
||||
chunkMap[x_dispatchId].erase(chunkMap[x_dispatchId].find(x_wgId));
|
||||
return true;
|
||||
}
|
||||
|
||||
// the port that connects this LDS to its owner CU
|
||||
CuSidePort cuPort;
|
||||
|
||||
ComputeUnit* parent = nullptr;
|
||||
|
||||
std::string _name;
|
||||
|
||||
// the number of bytes currently reserved by all workgroups
|
||||
int bytesAllocated = 0;
|
||||
|
||||
// the size of the LDS, the most bytes available
|
||||
int maximumSize;
|
||||
|
||||
// Address range of this memory
|
||||
AddrRange range;
|
||||
|
||||
// the penalty, in cycles, for each LDS bank conflict
|
||||
int bankConflictPenalty = 0;
|
||||
|
||||
// the number of banks in the LDS underlying data store
|
||||
int banks = 0;
|
||||
};
|
||||
|
||||
#endif // __LDS_STATE_HH__
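For readers skimming the interface above, here is a minimal usage sketch of the reservation and reference-counting flow. It is illustrative only: the LdsState instance, the dispatch/workgroup IDs, and the sizes are hypothetical, and only accessors declared in this header are used.

// Hypothetical sketch (not part of the patch): one LdsChunk per
// (dispatchId, wgId); the chunk is released on the 1->0 refcount transition.
void exampleLdsLifecycle(LdsState &lds)
{
    const uint32_t dispatchId = 0;     // assumed IDs, for illustration only
    const uint32_t wgId = 0;
    const uint32_t bytesNeeded = 256;

    if (!lds.canReserve(bytesNeeded))
        return;                        // not enough LDS left for this workgroup

    LdsChunk *chunk = lds.reserveSpace(dispatchId, wgId, bytesNeeded);

    // each wavefront of the workgroup bumps the reference count at launch
    lds.increaseRefCounter(dispatchId, wgId);

    chunk->write<uint32_t>(0, 42);               // store into the chunk
    uint32_t val = chunk->read<uint32_t>(0);     // and read it back
    (void)val;

    // when the wavefront retires the count drops; the chunk is handed back
    // to the pool automatically on the 1->0 transition
    lds.decreaseRefCounter(dispatchId, wgId);
}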
|
200
src/gpu-compute/local_memory_pipeline.cc
Normal file
|
@ -0,0 +1,200 @@
|
|||
/*
|
||||
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Sooraj Puthoor
|
||||
*/
|
||||
|
||||
#include "gpu-compute/local_memory_pipeline.hh"
|
||||
|
||||
#include "debug/GPUPort.hh"
|
||||
#include "gpu-compute/compute_unit.hh"
|
||||
#include "gpu-compute/gpu_dyn_inst.hh"
|
||||
#include "gpu-compute/shader.hh"
|
||||
#include "gpu-compute/vector_register_file.hh"
|
||||
#include "gpu-compute/wavefront.hh"
|
||||
|
||||
LocalMemPipeline::LocalMemPipeline(const ComputeUnitParams* p) :
|
||||
computeUnit(nullptr), lmQueueSize(p->local_mem_queue_size)
|
||||
{
|
||||
}
|
||||
|
||||
void
|
||||
LocalMemPipeline::init(ComputeUnit *cu)
|
||||
{
|
||||
computeUnit = cu;
|
||||
_name = computeUnit->name() + ".LocalMemPipeline";
|
||||
}
|
||||
|
||||
void
|
||||
LocalMemPipeline::exec()
|
||||
{
|
||||
// apply any returned shared (LDS) memory operations
|
||||
GPUDynInstPtr m = !lmReturnedRequests.empty() ?
|
||||
lmReturnedRequests.front() : nullptr;
|
||||
|
||||
bool accessVrf = true;
|
||||
if ((m) && (m->m_op==Enums::MO_LD || MO_A(m->m_op))) {
|
||||
Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
|
||||
|
||||
accessVrf =
|
||||
w->computeUnit->vrf[m->simdId]->
|
||||
vrfOperandAccessReady(m->seqNum(), w, m,
|
||||
VrfAccessType::WRITE);
|
||||
}
|
||||
|
||||
if (!lmReturnedRequests.empty() && m->latency.rdy() && accessVrf &&
|
||||
computeUnit->locMemToVrfBus.rdy() && (computeUnit->shader->coissue_return
|
||||
|| computeUnit->wfWait.at(m->pipeId).rdy())) {
|
||||
if (m->v_type == VT_32 && m->m_type == Enums::M_U8)
|
||||
doSmReturn<uint32_t, uint8_t>(m);
|
||||
else if (m->v_type == VT_32 && m->m_type == Enums::M_U16)
|
||||
doSmReturn<uint32_t, uint16_t>(m);
|
||||
else if (m->v_type == VT_32 && m->m_type == Enums::M_U32)
|
||||
doSmReturn<uint32_t, uint32_t>(m);
|
||||
else if (m->v_type == VT_32 && m->m_type == Enums::M_S8)
|
||||
doSmReturn<int32_t, int8_t>(m);
|
||||
else if (m->v_type == VT_32 && m->m_type == Enums::M_S16)
|
||||
doSmReturn<int32_t, int16_t>(m);
|
||||
else if (m->v_type == VT_32 && m->m_type == Enums::M_S32)
|
||||
doSmReturn<int32_t, int32_t>(m);
|
||||
else if (m->v_type == VT_32 && m->m_type == Enums::M_F16)
|
||||
doSmReturn<float, Float16>(m);
|
||||
else if (m->v_type == VT_32 && m->m_type == Enums::M_F32)
|
||||
doSmReturn<float, float>(m);
|
||||
else if (m->v_type == VT_64 && m->m_type == Enums::M_U8)
|
||||
doSmReturn<uint64_t, uint8_t>(m);
|
||||
else if (m->v_type == VT_64 && m->m_type == Enums::M_U16)
|
||||
doSmReturn<uint64_t, uint16_t>(m);
|
||||
else if (m->v_type == VT_64 && m->m_type == Enums::M_U32)
|
||||
doSmReturn<uint64_t, uint32_t>(m);
|
||||
else if (m->v_type == VT_64 && m->m_type == Enums::M_U64)
|
||||
doSmReturn<uint64_t, uint64_t>(m);
|
||||
else if (m->v_type == VT_64 && m->m_type == Enums::M_S8)
|
||||
doSmReturn<int64_t, int8_t>(m);
|
||||
else if (m->v_type == VT_64 && m->m_type == Enums::M_S16)
|
||||
doSmReturn<int64_t, int16_t>(m);
|
||||
else if (m->v_type == VT_64 && m->m_type == Enums::M_S32)
|
||||
doSmReturn<int64_t, int32_t>(m);
|
||||
else if (m->v_type == VT_64 && m->m_type == Enums::M_S64)
|
||||
doSmReturn<int64_t, int64_t>(m);
|
||||
else if (m->v_type == VT_64 && m->m_type == Enums::M_F16)
|
||||
doSmReturn<double, Float16>(m);
|
||||
else if (m->v_type == VT_64 && m->m_type == Enums::M_F32)
|
||||
doSmReturn<double, float>(m);
|
||||
else if (m->v_type == VT_64 && m->m_type == Enums::M_F64)
|
||||
doSmReturn<double, double>(m);
|
||||
}
|
||||
|
||||
// If the pipeline has an issued local memory instruction pending,
// execute the local memory packet and send the request
// to the LDS
|
||||
if (!lmIssuedRequests.empty() && lmReturnedRequests.size() < lmQueueSize) {
|
||||
|
||||
GPUDynInstPtr m = lmIssuedRequests.front();
|
||||
|
||||
bool returnVal = computeUnit->sendToLds(m);
|
||||
if (!returnVal) {
|
||||
DPRINTF(GPUPort, "packet was nack'd and put in retry queue");
|
||||
}
|
||||
lmIssuedRequests.pop();
|
||||
}
|
||||
}
|
||||
|
||||
template<typename c0, typename c1>
|
||||
void
|
||||
LocalMemPipeline::doSmReturn(GPUDynInstPtr m)
|
||||
{
|
||||
lmReturnedRequests.pop();
|
||||
Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
|
||||
|
||||
// Return data to registers
|
||||
if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) {
|
||||
std::vector<uint32_t> regVec;
|
||||
for (int k = 0; k < m->n_reg; ++k) {
|
||||
int dst = m->dst_reg+k;
|
||||
|
||||
if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
|
||||
dst = m->dst_reg_vec[k];
|
||||
// virtual->physical VGPR mapping
|
||||
int physVgpr = w->remap(dst,sizeof(c0),1);
|
||||
// save the physical VGPR index
|
||||
regVec.push_back(physVgpr);
|
||||
c1 *p1 = &((c1*)m->d_data)[k * VSZ];
|
||||
|
||||
for (int i = 0; i < VSZ; ++i) {
|
||||
if (m->exec_mask[i]) {
|
||||
// write the value into the physical VGPR. This is a purely
|
||||
// functional operation. No timing is modeled.
|
||||
w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
|
||||
*p1, i);
|
||||
}
|
||||
++p1;
|
||||
}
|
||||
}
|
||||
|
||||
// Schedule the write operation of the load data on the VRF. This simply
|
||||
// models the timing aspect of the VRF write operation. It does not
|
||||
// modify the physical VGPR.
|
||||
loadVrfBankConflictCycles +=
|
||||
w->computeUnit->vrf[w->simdId]->exec(m->seqNum(), w,
|
||||
regVec, sizeof(c0), m->time);
|
||||
}
|
||||
|
||||
// Decrement outstanding request count
|
||||
computeUnit->shader->ScheduleAdd(&w->outstanding_reqs, m->time, -1);
|
||||
|
||||
if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op)
|
||||
|| MO_H(m->m_op)) {
|
||||
computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_wr_lm,
|
||||
m->time, -1);
|
||||
}
|
||||
|
||||
if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) {
|
||||
computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_rd_lm,
|
||||
m->time, -1);
|
||||
}
|
||||
|
||||
// Mark write bus busy for appropriate amount of time
|
||||
computeUnit->locMemToVrfBus.set(m->time);
|
||||
if (computeUnit->shader->coissue_return == 0)
|
||||
w->computeUnit->wfWait.at(m->pipeId).set(m->time);
|
||||
}
|
||||
|
||||
void
|
||||
LocalMemPipeline::regStats()
|
||||
{
|
||||
loadVrfBankConflictCycles
|
||||
.name(name() + ".load_vrf_bank_conflict_cycles")
|
||||
.desc("total number of cycles LDS data are delayed before updating "
|
||||
"the VRF")
|
||||
;
|
||||
}
|
98
src/gpu-compute/local_memory_pipeline.hh
Normal file
|
@ -0,0 +1,98 @@
|
|||
/*
|
||||
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Sooraj Puthoor
|
||||
*/
|
||||
|
||||
#ifndef __LOCAL_MEMORY_PIPELINE_HH__
|
||||
#define __LOCAL_MEMORY_PIPELINE_HH__
|
||||
|
||||
#include <queue>
|
||||
#include <string>
|
||||
|
||||
#include "gpu-compute/misc.hh"
|
||||
#include "params/ComputeUnit.hh"
|
||||
#include "sim/stats.hh"
|
||||
|
||||
/*
|
||||
* @file local_memory_pipeline.hh
|
||||
*
|
||||
* The local memory pipeline issues newly created local memory packets
|
||||
 * from the pipeline to the LDS. This stage also retires previously issued
|
||||
* loads and stores that have returned from the LDS.
|
||||
*/
|
||||
|
||||
class ComputeUnit;
|
||||
class Wavefront;
|
||||
|
||||
class LocalMemPipeline
|
||||
{
|
||||
public:
|
||||
LocalMemPipeline(const ComputeUnitParams *params);
|
||||
void init(ComputeUnit *cu);
|
||||
void exec();
|
||||
|
||||
template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr m);
|
||||
|
||||
std::queue<GPUDynInstPtr> &getLMReqFIFO() { return lmIssuedRequests; }
|
||||
std::queue<GPUDynInstPtr> &getLMRespFIFO() { return lmReturnedRequests; }
|
||||
|
||||
bool
|
||||
isLMRespFIFOWrRdy() const
|
||||
{
|
||||
return lmReturnedRequests.size() < lmQueueSize;
|
||||
}
|
||||
|
||||
bool
|
||||
isLMReqFIFOWrRdy(uint32_t pendReqs=0) const
|
||||
{
|
||||
return (lmIssuedRequests.size() + pendReqs) < lmQueueSize;
|
||||
}
|
||||
|
||||
const std::string& name() const { return _name; }
|
||||
void regStats();
|
||||
|
||||
private:
|
||||
ComputeUnit *computeUnit;
|
||||
std::string _name;
|
||||
int lmQueueSize;
|
||||
Stats::Scalar loadVrfBankConflictCycles;
|
||||
// Local Memory Request Fifo: all shared memory requests
|
||||
// are issued to this FIFO from the memory pipelines
|
||||
std::queue<GPUDynInstPtr> lmIssuedRequests;
|
||||
|
||||
// Local Memory Response Fifo: all responses of shared memory
|
||||
// requests are sent to this FIFO from LDS
|
||||
std::queue<GPUDynInstPtr> lmReturnedRequests;
|
||||
};
|
||||
|
||||
#endif // __LOCAL_MEMORY_PIPELINE_HH__
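A brief sketch of how a producer is expected to hand work to this stage, using only the accessors declared above; the wiring shown (who owns the pipeline object and when exec() is ticked) is assumed for illustration and in practice belongs to the ComputeUnit.

// Hypothetical producer-side use of the issue FIFO declared above.
void issueToLds(LocalMemPipeline &lmPipe, GPUDynInstPtr gpuDynInst)
{
    // back-pressure: only enqueue while the issue FIFO has room
    if (lmPipe.isLMReqFIFOWrRdy())
        lmPipe.getLMReqFIFO().push(gpuDynInst);

    // ticking the stage issues queued requests to the LDS and retires any
    // responses that have come back
    lmPipe.exec();
}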
|
162
src/gpu-compute/misc.hh
Normal file
|
@ -0,0 +1,162 @@
|
|||
/*
|
||||
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Steve Reinhardt
|
||||
*/
|
||||
|
||||
#ifndef __MISC_HH__
|
||||
#define __MISC_HH__
|
||||
|
||||
#include <bitset>
|
||||
#include <memory>
|
||||
|
||||
#include "base/misc.hh"
|
||||
|
||||
class GPUDynInst;
|
||||
|
||||
// wavefront size of the machine
|
||||
static const int VSZ = 64;
|
||||
|
||||
/*
|
||||
This check is necessary because std::bitset only provides conversion to
unsigned long or unsigned long long via to_ulong() or to_ullong(). There are
a few places in the code where to_ullong() is used, and if VSZ is larger
than the host can support, bitset will throw a runtime exception.

We should remove all uses of to_ulong() and to_ullong() so that VSZ can be
greater than 64 bits; until that is done this assert is required.
|
||||
*/
|
||||
static_assert(VSZ <= sizeof(unsigned long long) * 8,
|
||||
"VSZ is larger than the host can support");
|
||||
|
||||
typedef std::bitset<VSZ> VectorMask;
|
||||
typedef std::shared_ptr<GPUDynInst> GPUDynInstPtr;
|
||||
|
||||
class WaitClass
|
||||
{
|
||||
public:
|
||||
WaitClass() : nxtAvail(0), lookAheadAvail(0), tcnt(0) { }
|
||||
void init(uint64_t *_tcnt, uint32_t _numStages=0)
|
||||
{
|
||||
tcnt = _tcnt;
|
||||
numStages = _numStages;
|
||||
}
|
||||
|
||||
void set(uint32_t i)
|
||||
{
|
||||
fatal_if(nxtAvail > *tcnt,
|
||||
"Can't allocate resource because it is busy!!!");
|
||||
nxtAvail = *tcnt + i;
|
||||
}
|
||||
void preset(uint32_t delay)
|
||||
{
|
||||
lookAheadAvail = std::max(lookAheadAvail, delay + (*tcnt) - numStages);
|
||||
}
|
||||
bool rdy() const { return *tcnt >= nxtAvail; }
|
||||
bool prerdy() const { return *tcnt >= lookAheadAvail; }
|
||||
|
||||
private:
|
||||
// timestamp indicating when resource will be available
|
||||
uint64_t nxtAvail;
|
||||
// timestamp indicating when resource will be available including
|
||||
// pending uses of the resource (when there is a cycle gap between
|
||||
// rdy() and set())
|
||||
uint64_t lookAheadAvail;
|
||||
// current timestamp
|
||||
uint64_t *tcnt;
|
||||
// number of stages between checking if a resource is ready and
|
||||
// setting the resource's utilization
|
||||
uint32_t numStages;
|
||||
};
|
||||
|
||||
class Float16
|
||||
{
|
||||
public:
|
||||
uint16_t val;
|
||||
|
||||
Float16() { val = 0; }
|
||||
|
||||
Float16(const Float16 &x) : val(x.val) { }
|
||||
|
||||
Float16(float x)
|
||||
{
|
||||
uint32_t ai = *(uint32_t *)&x;
|
||||
|
||||
uint32_t s = (ai >> 31) & 0x1;
|
||||
uint32_t exp = (ai >> 23) & 0xff;
|
||||
uint32_t mant = (ai >> 0) & 0x7fffff;
|
||||
|
||||
if (exp == 0 || exp <= 0x70) {
|
||||
exp = 0;
|
||||
mant = 0;
|
||||
} else if (exp == 0xff) {
|
||||
exp = 0x1f;
|
||||
} else if (exp >= 0x8f) {
|
||||
exp = 0x1f;
|
||||
mant = 0;
|
||||
} else {
|
||||
exp = exp - 0x7f + 0x0f;
|
||||
}
|
||||
|
||||
mant = mant >> 13;
|
||||
|
||||
val = 0;
|
||||
val |= (s << 15);
|
||||
val |= (exp << 10);
|
||||
val |= (mant << 0);
|
||||
}
|
||||
|
||||
operator float() const
|
||||
{
|
||||
uint32_t s = (val >> 15) & 0x1;
|
||||
uint32_t exp = (val >> 10) & 0x1f;
|
||||
uint32_t mant = (val >> 0) & 0x3ff;
|
||||
|
||||
if (!exp) {
|
||||
exp = 0;
|
||||
mant = 0;
|
||||
} else if (exp == 0x1f) {
|
||||
exp = 0xff;
|
||||
} else {
|
||||
exp = exp - 0x0f + 0x7f;
|
||||
}
|
||||
|
||||
uint32_t val1 = 0;
|
||||
val1 |= (s << 31);
|
||||
val1 |= (exp << 23);
|
||||
val1 |= (mant << 13);
|
||||
|
||||
return *(float*)&val1;
|
||||
}
|
||||
};
|
||||
|
||||
#endif // __MISC_HH__
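To make the Float16 conversion above concrete, a small round-trip example follows; the input value is arbitrary and the numeric comment simply restates what the truncating conversion does.

// Hypothetical round trip through the Float16 helper defined above. The
// converter truncates the low 13 mantissa bits and flushes inputs whose
// biased exponent is <= 0x70 (roughly |x| < 2^-14) to zero.
float float16RoundTripExample()
{
    Float16 h(3.14159f);    // float -> 16-bit storage format
    float back = h;         // implicit conversion back to float
    return back;            // ~3.140625; the low mantissa bits were dropped
}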
|
70
src/gpu-compute/ndrange.hh
Normal file
|
@ -0,0 +1,70 @@
|
|||
/*
|
||||
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Steve Reinhardt
|
||||
*/
|
||||
|
||||
#ifndef __NDRANGE_HH__
|
||||
#define __NDRANGE_HH__
|
||||
|
||||
#include "base/types.hh"
|
||||
#include "gpu-compute/qstruct.hh"
|
||||
|
||||
struct NDRange
|
||||
{
|
||||
// copy of the queue entry provided at dispatch
|
||||
HsaQueueEntry q;
|
||||
|
||||
// The current workgroup id (3 dimensions)
|
||||
int wgId[3];
|
||||
// The number of workgroups in each dimension
|
||||
int numWg[3];
|
||||
// The total number of workgroups
|
||||
int numWgTotal;
|
||||
|
||||
// The number of completed work groups
|
||||
int numWgCompleted;
|
||||
// The global workgroup ID
|
||||
uint32_t globalWgId;
|
||||
|
||||
// flag indicating whether all work groups have been launched
|
||||
bool wg_disp_rem;
|
||||
// kernel complete
|
||||
bool execDone;
|
||||
bool userDoorBellSet;
|
||||
volatile bool *addrToNotify;
|
||||
volatile uint32_t *numDispLeft;
|
||||
int dispatchId;
|
||||
int curTid; // Current thread id
|
||||
};
|
||||
|
||||
#endif // __NDRANGE_HH__
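The header leaves the derivation of numWg and numWgTotal to the dispatcher; the sketch below shows the presumed relationship between the grid size, the workgroup size, and these counters. The ceil-division is an assumption made for illustration, not code from this patch.

// Hypothetical helper showing how the workgroup counts in NDRange would
// presumably be derived from the HsaQueueEntry grid/workgroup sizes.
void fillWorkgroupCounts(NDRange &ndr)
{
    ndr.numWgTotal = 1;
    for (int dim = 0; dim < 3; ++dim) {
        // round up so a partially filled workgroup is still launched
        ndr.numWg[dim] = (ndr.q.gdSize[dim] + ndr.q.wgSize[dim] - 1) /
                         ndr.q.wgSize[dim];
        ndr.numWgTotal *= ndr.numWg[dim];
    }
    ndr.numWgCompleted = 0;
    ndr.wg_disp_rem = true;    // work groups remain to be dispatched
}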
|
76
src/gpu-compute/of_scheduling_policy.cc
Normal file
|
@ -0,0 +1,76 @@
|
|||
/*
|
||||
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Sooraj Puthoor
|
||||
*/
|
||||
|
||||
#include "gpu-compute/of_scheduling_policy.hh"
|
||||
|
||||
#include "gpu-compute/wavefront.hh"
|
||||
|
||||
Wavefront*
|
||||
OFSchedulingPolicy::chooseWave()
|
||||
{
|
||||
// Set when the policy chooses a wave to schedule
|
||||
bool waveChosen = false;
|
||||
Wavefront *selectedWave = nullptr;
|
||||
int selectedWaveID = -1;
|
||||
uint32_t selectedPosition = 0;
|
||||
|
||||
for (int position = 0; position < scheduleList->size(); ++position) {
|
||||
Wavefront *curWave = scheduleList->at(position);
|
||||
uint32_t curWaveID = curWave->wfDynId;
|
||||
|
||||
// Choose the wave with the lowest wave ID
|
||||
if (selectedWaveID == -1 || curWaveID < selectedWaveID) {
|
||||
waveChosen = true;
|
||||
selectedWaveID = curWaveID;
|
||||
selectedWave = curWave;
|
||||
selectedPosition = position;
|
||||
}
|
||||
}
|
||||
|
||||
// Check to make sure the ready list has at least one schedulable wave
|
||||
if (waveChosen) {
|
||||
scheduleList->erase(scheduleList->begin() + selectedPosition);
|
||||
} else {
|
||||
panic("Empty ready list");
|
||||
}
|
||||
|
||||
return selectedWave;
|
||||
}
|
||||
|
||||
void
|
||||
OFSchedulingPolicy::bindList(std::vector<Wavefront*> *list)
|
||||
{
|
||||
scheduleList = list;
|
||||
}
|
61
src/gpu-compute/of_scheduling_policy.hh
Normal file
|
@ -0,0 +1,61 @@
|
|||
/*
|
||||
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Sooraj Puthoor
|
||||
*/
|
||||
|
||||
#ifndef __OF_SCHEDULING_POLICY_HH__
|
||||
#define __OF_SCHEDULING_POLICY_HH__
|
||||
|
||||
#include <cstddef>
|
||||
#include <vector>
|
||||
|
||||
#include "base/misc.hh"
|
||||
|
||||
class Wavefront;
|
||||
|
||||
// Oldest First where age is marked by the wave id
|
||||
class OFSchedulingPolicy
|
||||
{
|
||||
public:
|
||||
OFSchedulingPolicy() : scheduleList(nullptr) { }
|
||||
|
||||
Wavefront* chooseWave();
|
||||
void bindList(std::vector<Wavefront*> *list);
|
||||
|
||||
private:
|
||||
// List of waves which are participating in scheduling.
|
||||
// This scheduler selects the oldest wave from this list
|
||||
std::vector<Wavefront*> *scheduleList;
|
||||
};
|
||||
|
||||
#endif // __OF_SCHEDULING_POLICY_HH__
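Both scheduling policies in this commit follow the same two-call pattern: bind a ready list once, then ask for a wave. The sketch below uses the oldest-first policy declared above; the ready list and the guard are illustrative (the compute unit owns the real per-resource lists).

// Hypothetical use of the oldest-first policy.
Wavefront* pickOldestReady(std::vector<Wavefront*> &readyList)
{
    OFSchedulingPolicy policy;
    policy.bindList(&readyList);    // point the policy at a ready list

    // chooseWave() panics on an empty list, so guard the call
    if (readyList.empty())
        return nullptr;

    // returns the wave with the smallest wfDynId and removes it from the list
    return policy.chooseWave();
}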
|
42
src/gpu-compute/pool_manager.cc
Normal file
|
@ -0,0 +1,42 @@
|
|||
/*
|
||||
* Copyright (c) 2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: John Kalamatianos
|
||||
*/
|
||||
|
||||
#include "gpu-compute/pool_manager.hh"
|
||||
|
||||
PoolManager::PoolManager(uint32_t minAlloc, uint32_t poolSize)
|
||||
: _minAllocation(minAlloc), _poolSize(poolSize)
|
||||
{
|
||||
assert(poolSize > 0);
|
||||
}
|
66
src/gpu-compute/pool_manager.hh
Normal file
|
@ -0,0 +1,66 @@
|
|||
/*
|
||||
* Copyright (c) 2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: John Kalamatianos
|
||||
*/
|
||||
|
||||
#ifndef __POOL_MANAGER_HH__
|
||||
#define __POOL_MANAGER_HH__
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
|
||||
// Pool Manager Logic
|
||||
class PoolManager
|
||||
{
|
||||
public:
|
||||
PoolManager(uint32_t minAlloc, uint32_t poolSize);
|
||||
uint32_t minAllocation() { return _minAllocation; }
|
||||
virtual std::string printRegion() = 0;
|
||||
virtual uint32_t regionSize(std::pair<uint32_t,uint32_t> ®ion) = 0;
|
||||
virtual bool canAllocate(uint32_t numRegions, uint32_t size) = 0;
|
||||
|
||||
virtual uint32_t allocateRegion(const uint32_t size,
|
||||
uint32_t *reserved) = 0;
|
||||
|
||||
virtual void freeRegion(uint32_t firstIdx, uint32_t lastIdx) = 0;
|
||||
uint32_t poolSize() { return _poolSize; }
|
||||
|
||||
private:
|
||||
// minimum size that can be reserved per allocation
|
||||
uint32_t _minAllocation;
|
||||
// pool size in number of elements
|
||||
uint32_t _poolSize;
|
||||
};
|
||||
|
||||
#endif // __POOL_MANAGER_HH__
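PoolManager is a pure interface; the concrete managers live in separate files of this commit. Purely to illustrate the contract, here is a minimal, hypothetical implementation (a bump allocator that never frees); the names and the exact semantics of the reserved out-parameter are assumptions, not taken from the patch.

// Hypothetical minimal PoolManager implementation, for illustration only.
class BumpPoolManager : public PoolManager
{
  public:
    BumpPoolManager(uint32_t minAlloc, uint32_t poolSize)
        : PoolManager(minAlloc, poolSize), nextIdx(0) { }

    std::string printRegion() override
    { return "bump pool, next free index " + std::to_string(nextIdx); }

    uint32_t regionSize(std::pair<uint32_t,uint32_t> &region) override
    { return region.second - region.first + 1; }

    bool canAllocate(uint32_t numRegions, uint32_t size) override
    { return nextIdx + numRegions * size <= poolSize(); }

    uint32_t allocateRegion(const uint32_t size, uint32_t *reserved) override
    {
        uint32_t start = nextIdx;
        nextIdx += size;
        if (reserved)
            *reserved = size;    // report how much was actually set aside
        return start;            // first index of the new region
    }

    void freeRegion(uint32_t firstIdx, uint32_t lastIdx) override { }

  private:
    uint32_t nextIdx;            // next unallocated pool element
};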
|
201
src/gpu-compute/qstruct.hh
Normal file
|
@ -0,0 +1,201 @@
|
|||
/*
|
||||
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Brad Beckmann, Marc Orr
|
||||
*/
|
||||
|
||||
#ifndef __Q_STRUCT_HH__
|
||||
#define __Q_STRUCT_HH__
|
||||
|
||||
#include <bitset>
|
||||
#include <cstdint>
|
||||
|
||||
// Maximum number of arguments
|
||||
static const int KER_NUM_ARGS = 32;
|
||||
// Kernel argument buffer size
|
||||
static const int KER_ARGS_LENGTH = 512;
|
||||
|
||||
class LdsChunk;
|
||||
struct NDRange;
|
||||
|
||||
// Be very careful of alignment in this structure. The structure
|
||||
// must compile to the same layout in both 32-bit and 64-bit mode.
|
||||
struct HsaQueueEntry
|
||||
{
|
||||
// Base pointer for array of instruction pointers
|
||||
uint64_t code_ptr;
|
||||
// Grid Size (3 dimensions)
|
||||
uint32_t gdSize[3];
|
||||
// Workgroup Size (3 dimensions)
|
||||
uint32_t wgSize[3];
|
||||
uint16_t sRegCount;
|
||||
uint16_t dRegCount;
|
||||
uint16_t cRegCount;
|
||||
uint64_t privMemStart;
|
||||
uint32_t privMemPerItem;
|
||||
uint32_t privMemTotal;
|
||||
uint64_t spillMemStart;
|
||||
uint32_t spillMemPerItem;
|
||||
uint32_t spillMemTotal;
|
||||
uint64_t roMemStart;
|
||||
uint32_t roMemTotal;
|
||||
// Size (in bytes) of LDS
|
||||
uint32_t ldsSize;
|
||||
// Virtual Memory Id (unused right now)
|
||||
uint32_t vmId;
|
||||
|
||||
// Pointer to dependency chain (unused now)
|
||||
uint64_t depends;
|
||||
|
||||
// pointer to bool
|
||||
uint64_t addrToNotify;
|
||||
// pointer to uint32_t
|
||||
uint64_t numDispLeft;
|
||||
|
||||
// variables to pass arguments when running in standalone mode,
|
||||
// will be removed when run.py and sh.cpp have been updated to
|
||||
// use args and offset arrays
|
||||
uint64_t arg1;
|
||||
uint64_t arg2;
|
||||
uint64_t arg3;
|
||||
uint64_t arg4;
|
||||
|
||||
// variables to pass arguments when running in cpu+gpu mode
|
||||
uint8_t args[KER_ARGS_LENGTH];
|
||||
uint16_t offsets[KER_NUM_ARGS];
|
||||
uint16_t num_args;
|
||||
};
|
||||
|
||||
// State used to start (or restart) a WF
|
||||
struct WFContext
|
||||
{
|
||||
// 32 bit values
|
||||
// barrier state
|
||||
int bar_cnt[VSZ];
|
||||
|
||||
// id (which WF in the WG)
|
||||
int cnt;
|
||||
|
||||
// more barrier state
|
||||
int max_bar_cnt;
|
||||
int old_barrier_cnt;
|
||||
int barrier_cnt;
|
||||
|
||||
// More Program Counter Stuff
|
||||
uint32_t pc;
|
||||
|
||||
// Program counter of the immediate post-dominator instruction
|
||||
uint32_t rpc;
|
||||
|
||||
// WG wide state (I don't see how to avoid redundancy here)
|
||||
int cu_id;
|
||||
uint32_t wg_id;
|
||||
uint32_t barrier_id;
|
||||
|
||||
// 64 bit values (these values depend on the wavefront size)
|
||||
// masks
|
||||
uint64_t init_mask;
|
||||
uint64_t exec_mask;
|
||||
|
||||
// private memory;
|
||||
Addr privBase;
|
||||
Addr spillBase;
|
||||
|
||||
LdsChunk *ldsChunk;
|
||||
|
||||
/*
|
||||
* Kernel wide state
|
||||
* This is a hack. This state should be moved through simulated memory
|
||||
* during a yield. Though not much is being used here, so it's probably
* not a big deal.
|
||||
*
|
||||
* Just to add to this comment... The ndr is derived from simulated
|
||||
* memory when the cl-runtime allocates an HsaQueueEntry and populates it
|
||||
* for a kernel launch. So in theory the runtime should be able to keep
|
||||
* that state around. Then a WF can reference it upon restart to derive
|
||||
* kernel wide state. The runtime can deallocate the state when the
|
||||
* kernel completes.
|
||||
*/
|
||||
NDRange *ndr;
|
||||
};
|
||||
|
||||
// State that needs to be passed between the simulation and simulated app, a
|
||||
// pointer to this struct can be passed through the depends field in the
|
||||
// HsaQueueEntry struct
|
||||
struct HostState
|
||||
{
|
||||
// cl_event* has original HsaQueueEntry for init
|
||||
uint64_t event;
|
||||
};
|
||||
|
||||
// Total number of HSA queues
|
||||
static const int HSAQ_NQUEUES = 8;
|
||||
|
||||
// These values will eventually live in memory mapped registers
|
||||
// and be settable by the kernel mode driver.
|
||||
|
||||
// Number of entries in each HSA queue
|
||||
static const int HSAQ_SIZE = 64;
|
||||
// Address of first HSA queue index
|
||||
static const int HSAQ_INDX_BASE = 0x10000ll;
|
||||
// Address of first HSA queue
|
||||
static const int HSAQ_BASE = 0x11000ll;
|
||||
// Suggested start of HSA code
|
||||
static const int HSA_CODE_BASE = 0x18000ll;
|
||||
|
||||
// These are shortcuts for deriving the address of a specific
|
||||
// HSA queue or queue index
|
||||
#define HSAQ(n) (HSAQ_BASE + HSAQ_SIZE * sizeof(struct fsaQueue) * n)
|
||||
#define HSAQE(n,i) (HSAQ_BASE + (HSAQ_SIZE * n + i) * sizeof(struct fsaQueue))
|
||||
#define HSAQ_RI(n) (HSAQ_INDX_BASE + sizeof(int) * (n * 3 + 0))
|
||||
#define HSAQ_WI(n) (HSAQ_INDX_BASE + sizeof(int) * (n * 3 + 1))
|
||||
#define HSAQ_CI(n) (HSAQ_INDX_BASE + sizeof(int) * (n * 3 + 2))
|
||||
|
||||
/*
|
||||
* Example code for writing to a queue
|
||||
*
|
||||
* void
|
||||
* ToQueue(int n,struct fsaQueue *val)
|
||||
* {
|
||||
* int wi = *(int*)HSAQ_WI(n);
|
||||
* int ri = *(int*)HSAQ_RI(n);
|
||||
* int ci = *(int*)HSAQ_CI(n);
|
||||
*
|
||||
* if (ci - ri < HSAQ_SIZE) {
|
||||
* (*(int*)HSAQ_CI(n))++;
|
||||
* *(HsaQueueEntry*)(HSAQE(n, (wi % HSAQ_SIZE))) = *val;
|
||||
* (*(int*)HSAQ_WI(n))++;
|
||||
* }
|
||||
* }
|
||||
*/
|
||||
|
||||
#endif // __Q_STRUCT_HH__
|
67
src/gpu-compute/rr_scheduling_policy.cc
Normal file
|
@ -0,0 +1,67 @@
|
|||
/*
|
||||
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Sooraj Puthoor
|
||||
*/
|
||||
|
||||
#include "gpu-compute/rr_scheduling_policy.hh"
|
||||
|
||||
#include "gpu-compute/wavefront.hh"
|
||||
|
||||
Wavefront*
|
||||
RRSchedulingPolicy::chooseWave()
|
||||
{
|
||||
Wavefront *selectedWave = nullptr;
|
||||
|
||||
// Check to make sure the ready list has at least one schedulable wave
|
||||
if (scheduleList->size()) {
|
||||
// For RR policy, select the wave which is at the
|
||||
// front of the list. The selected wave is popped
|
||||
// out from the schedule list immediately after selection
|
||||
// to avoid starvation. It is the responsibility of the
|
||||
// module invoking the RR scheduler to make sure scheduling
|
||||
// eligible waves are added to the back of the schedule
|
||||
// list
|
||||
selectedWave = scheduleList->front();
|
||||
scheduleList->erase(scheduleList->begin() + 0);
|
||||
} else {
|
||||
panic("Empty ready list");
|
||||
}
|
||||
|
||||
return selectedWave;
|
||||
}
|
||||
|
||||
void
|
||||
RRSchedulingPolicy::bindList(std::vector<Wavefront*> *list)
|
||||
{
|
||||
scheduleList = list;
|
||||
}
|
65
src/gpu-compute/rr_scheduling_policy.hh
Normal file
|
@ -0,0 +1,65 @@
|
|||
/*
|
||||
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Sooraj Puthoor
|
||||
*/
|
||||
|
||||
#ifndef __RR_SCHEDULING_POLICY_HH__
|
||||
#define __RR_SCHEDULING_POLICY_HH__
|
||||
|
||||
#include <inttypes.h>
|
||||
|
||||
#include <cstddef>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "base/misc.hh"
|
||||
|
||||
class Wavefront;
|
||||
|
||||
// Round-Robin pick among the list of ready waves
|
||||
class RRSchedulingPolicy
|
||||
{
|
||||
public:
|
||||
RRSchedulingPolicy() : scheduleList(nullptr) { }
|
||||
|
||||
Wavefront* chooseWave();
|
||||
void bindList(std::vector<Wavefront*> *list);
|
||||
|
||||
private:
|
||||
// List of waves which are participating in scheduling.
|
||||
// This scheduler selects one wave from this list based on
|
||||
// round robin policy
|
||||
std::vector<Wavefront*> *scheduleList;
|
||||
};
|
||||
|
||||
#endif // __RR_SCHEDULING_POLICY_HH__
|
151
src/gpu-compute/schedule_stage.cc
Normal file
|
@ -0,0 +1,151 @@
|
|||
/*
|
||||
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Sooraj Puthoor
|
||||
*/
|
||||
|
||||
#include "gpu-compute/schedule_stage.hh"
|
||||
|
||||
#include "gpu-compute/compute_unit.hh"
|
||||
#include "gpu-compute/gpu_static_inst.hh"
|
||||
#include "gpu-compute/vector_register_file.hh"
|
||||
#include "gpu-compute/wavefront.hh"
|
||||
|
||||
ScheduleStage::ScheduleStage(const ComputeUnitParams *p)
|
||||
: numSIMDs(p->num_SIMDs),
|
||||
numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes)
|
||||
{
|
||||
for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
|
||||
Scheduler newScheduler(p);
|
||||
scheduler.push_back(newScheduler);
|
||||
}
|
||||
}
|
||||
|
||||
ScheduleStage::~ScheduleStage()
|
||||
{
|
||||
scheduler.clear();
|
||||
waveStatusList.clear();
|
||||
}
|
||||
|
||||
void
|
||||
ScheduleStage::init(ComputeUnit *cu)
|
||||
{
|
||||
computeUnit = cu;
|
||||
_name = computeUnit->name() + ".ScheduleStage";
|
||||
|
||||
for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
|
||||
scheduler[j].bindList(&computeUnit->readyList[j]);
|
||||
}
|
||||
|
||||
for (int j = 0; j < numSIMDs; ++j) {
|
||||
waveStatusList.push_back(&computeUnit->waveStatusList[j]);
|
||||
}
|
||||
|
||||
dispatchList = &computeUnit->dispatchList;
|
||||
}
|
||||
|
||||
void
|
||||
ScheduleStage::arbitrate()
|
||||
{
|
||||
// iterate over all Memory pipelines
|
||||
for (int j = numSIMDs; j < numSIMDs + numMemUnits; ++j) {
|
||||
if (dispatchList->at(j).first) {
|
||||
Wavefront *waveToMemPipe = dispatchList->at(j).first;
|
||||
// iterate over all execution pipelines
|
||||
for (int i = 0; i < numSIMDs + numMemUnits; ++i) {
|
||||
if ((i != j) && (dispatchList->at(i).first)) {
|
||||
Wavefront *waveToExePipe = dispatchList->at(i).first;
|
||||
// if the two selected wavefronts are mapped to the same
|
||||
// SIMD unit then they share the VRF
|
||||
if (waveToMemPipe->simdId == waveToExePipe->simdId) {
|
||||
int simdId = waveToMemPipe->simdId;
|
||||
// Read VRF port arbitration:
|
||||
// If there are read VRF port conflicts between the
|
||||
// a memory and another instruction we drop the other
|
||||
// instruction. We don't need to check for write VRF
|
||||
// port conflicts because the memory instruction either
|
||||
// does not need to write to the VRF (store) or will
|
||||
// write to the VRF when the data comes back (load) in
|
||||
// which case the arbiter of the memory pipes will
|
||||
// resolve any conflicts
|
||||
if (computeUnit->vrf[simdId]->
|
||||
isReadConflict(waveToMemPipe->wfSlotId,
|
||||
waveToExePipe->wfSlotId)) {
|
||||
// FIXME: The "second" member variable is never
|
||||
// used in the model. I am setting it to READY
|
||||
// simply to follow the protocol of setting it
|
||||
// when the WF has an instruction ready to issue
|
||||
waveStatusList[simdId]->at(waveToExePipe->wfSlotId)
|
||||
.second = READY;
|
||||
|
||||
dispatchList->at(i).first = nullptr;
|
||||
dispatchList->at(i).second = EMPTY;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ScheduleStage::exec()
|
||||
{
|
||||
for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
|
||||
uint32_t readyListSize = computeUnit->readyList[j].size();
|
||||
|
||||
// If no wave is ready to be scheduled on the execution resource
|
||||
// then skip scheduling for this execution resource
|
||||
if (!readyListSize) {
|
||||
continue;
|
||||
}
|
||||
|
||||
Wavefront *waveToBeDispatched = scheduler[j].chooseWave();
|
||||
dispatchList->at(j).first = waveToBeDispatched;
|
||||
waveToBeDispatched->updateResources();
|
||||
dispatchList->at(j).second = FILLED;
|
||||
|
||||
waveStatusList[waveToBeDispatched->simdId]->at(
|
||||
waveToBeDispatched->wfSlotId).second = BLOCKED;
|
||||
|
||||
assert(computeUnit->readyList[j].size() == readyListSize - 1);
|
||||
}
|
||||
// arbitrate over all shared resources among instructions being issued
|
||||
// simultaneously
|
||||
arbitrate();
|
||||
}
|
||||
|
||||
void
|
||||
ScheduleStage::regStats()
|
||||
{
|
||||
}
|
95
src/gpu-compute/schedule_stage.hh
Normal file
95
src/gpu-compute/schedule_stage.hh
Normal file
|
@ -0,0 +1,95 @@
|
|||
/*
|
||||
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Sooraj Puthoor
|
||||
*/
|
||||
|
||||
#ifndef __SCHEDULE_STAGE_HH__
|
||||
#define __SCHEDULE_STAGE_HH__
|
||||
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "gpu-compute/exec_stage.hh"
|
||||
#include "gpu-compute/scheduler.hh"
|
||||
#include "gpu-compute/scoreboard_check_stage.hh"
|
||||
|
||||
// Schedule or execution arbitration stage.
|
||||
// From the pool of ready waves in the ready list,
|
||||
// one wave is selected for each execution resource.
|
||||
// The selection is made based on a scheduling policy
|
||||
|
||||
class ComputeUnit;
|
||||
class Wavefront;
|
||||
|
||||
struct ComputeUnitParams;
|
||||
|
||||
class ScheduleStage
|
||||
{
|
||||
public:
|
||||
ScheduleStage(const ComputeUnitParams *params);
|
||||
~ScheduleStage();
|
||||
void init(ComputeUnit *cu);
|
||||
void exec();
|
||||
void arbitrate();
|
||||
// Stats related variables and methods
|
||||
std::string name() { return _name; }
|
||||
void regStats();
|
||||
|
||||
private:
|
||||
ComputeUnit *computeUnit;
|
||||
uint32_t numSIMDs;
|
||||
uint32_t numMemUnits;
|
||||
|
||||
// Each execution resource will have its own
|
||||
// scheduler and a dispatch list
|
||||
std::vector<Scheduler> scheduler;
|
||||
|
||||
// Stores the status of waves. A READY implies the
|
||||
// wave is ready to be scheduled this cycle and
|
||||
// is already present in the readyList
|
||||
std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>*>
|
||||
waveStatusList;
|
||||
|
||||
// List of waves which will be dispatched to
|
||||
// each execution resource. A FILLED implies
|
||||
// dispatch list is non-empty and
|
||||
// execution unit has something to execute
|
||||
// this cycle. Currently, the dispatch list of
|
||||
// an execution resource can hold only one wave because
|
||||
// an execution resource can execute only one wave in a cycle.
|
||||
std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList;
|
||||
|
||||
std::string _name;
|
||||
};
|
||||
|
||||
#endif // __SCHEDULE_STAGE_HH__
|
71
src/gpu-compute/scheduler.cc
Normal file
71
src/gpu-compute/scheduler.cc
Normal file
|
@ -0,0 +1,71 @@
|
|||
/*
|
||||
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Sooraj Puthoor
|
||||
*/
|
||||
|
||||
#include "gpu-compute/scheduler.hh"
|
||||
|
||||
Scheduler::Scheduler(const ComputeUnitParams *p)
|
||||
{
|
||||
if (p->execPolicy == "OLDEST-FIRST") {
|
||||
schedPolicy = SCHED_POLICY::OF_POLICY;
|
||||
} else if (p->execPolicy == "ROUND-ROBIN") {
|
||||
schedPolicy = SCHED_POLICY::RR_POLICY;
|
||||
} else {
|
||||
fatal("Unimplemented scheduling policy");
|
||||
}
|
||||
}
|
||||
|
||||
Wavefront*
|
||||
Scheduler::chooseWave()
|
||||
{
|
||||
if (schedPolicy == SCHED_POLICY::OF_POLICY) {
|
||||
return OFSchedPolicy.chooseWave();
|
||||
} else if (schedPolicy == SCHED_POLICY::RR_POLICY) {
|
||||
return RRSchedPolicy.chooseWave();
|
||||
} else {
|
||||
fatal("Unimplemented scheduling policy");
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
Scheduler::bindList(std::vector<Wavefront*> *list)
|
||||
{
|
||||
if (schedPolicy == SCHED_POLICY::OF_POLICY) {
|
||||
OFSchedPolicy.bindList(list);
|
||||
} else if (schedPolicy == SCHED_POLICY::RR_POLICY) {
|
||||
RRSchedPolicy.bindList(list);
|
||||
} else {
|
||||
fatal("Unimplemented scheduling policy");
|
||||
}
|
||||
}
|
63
src/gpu-compute/scheduler.hh
Normal file
63
src/gpu-compute/scheduler.hh
Normal file
|
@ -0,0 +1,63 @@
|
|||
/*
|
||||
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Sooraj Puthoor
|
||||
*/
|
||||
|
||||
#ifndef __SCHEDULER_HH__
|
||||
#define __SCHEDULER_HH__
|
||||
|
||||
#include "gpu-compute/of_scheduling_policy.hh"
|
||||
#include "gpu-compute/rr_scheduling_policy.hh"
|
||||
#include "gpu-compute/scheduling_policy.hh"
|
||||
#include "params/ComputeUnit.hh"
|
||||
|
||||
enum SCHED_POLICY
|
||||
{
|
||||
OF_POLICY = 0,
|
||||
RR_POLICY
|
||||
};
|
||||
|
||||
class Scheduler
|
||||
{
|
||||
public:
|
||||
Scheduler(const ComputeUnitParams *params);
|
||||
Wavefront *chooseWave();
|
||||
void bindList(std::vector<Wavefront*> *list);
|
||||
|
||||
private:
|
||||
SCHED_POLICY schedPolicy;
|
||||
SchedulingPolicy<RRSchedulingPolicy> RRSchedPolicy;
|
||||
SchedulingPolicy<OFSchedulingPolicy> OFSchedPolicy;
|
||||
};
|
||||
|
||||
#endif // __SCHEDULER_HH__
|
57
src/gpu-compute/scheduling_policy.hh
Normal file
57
src/gpu-compute/scheduling_policy.hh
Normal file
|
@ -0,0 +1,57 @@
|
|||
/*
|
||||
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Sooraj Puthoor
|
||||
*/
|
||||
|
||||
#ifndef __SCHEDULING_POLICY_HH__
|
||||
#define __SCHEDULING_POLICY_HH__
|
||||
|
||||
#include <vector>
|
||||
|
||||
template<typename Impl>
|
||||
class SchedulingPolicy
|
||||
{
|
||||
public:
|
||||
Wavefront* chooseWave() { return policyImpl.chooseWave(); }
|
||||
|
||||
void
|
||||
bindList(std::vector<Wavefront*> *list)
|
||||
{
|
||||
return policyImpl.bindList(list);
|
||||
}
|
||||
|
||||
private:
|
||||
Impl policyImpl;
|
||||
};
|
||||
|
||||
#endif // __SCHEDULING_POLICY_HH__
|
173
src/gpu-compute/scoreboard_check_stage.cc
Normal file
173
src/gpu-compute/scoreboard_check_stage.cc
Normal file
|
@ -0,0 +1,173 @@
|
|||
/*
|
||||
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Sooraj Puthoor
|
||||
*/
|
||||
|
||||
#include "gpu-compute/scoreboard_check_stage.hh"
|
||||
|
||||
#include "gpu-compute/compute_unit.hh"
|
||||
#include "gpu-compute/gpu_static_inst.hh"
|
||||
#include "gpu-compute/shader.hh"
|
||||
#include "gpu-compute/wavefront.hh"
|
||||
#include "params/ComputeUnit.hh"
|
||||
|
||||
ScoreboardCheckStage::ScoreboardCheckStage(const ComputeUnitParams *p)
|
||||
: numSIMDs(p->num_SIMDs),
|
||||
numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes),
|
||||
numGlbMemPipes(p->num_global_mem_pipes),
|
||||
numShrMemPipes(p->num_shared_mem_pipes),
|
||||
vectorAluInstAvail(nullptr),
|
||||
lastGlbMemSimd(-1),
|
||||
lastShrMemSimd(-1), glbMemInstAvail(nullptr),
|
||||
shrMemInstAvail(nullptr)
|
||||
{
|
||||
}
|
||||
|
||||
ScoreboardCheckStage::~ScoreboardCheckStage()
|
||||
{
|
||||
readyList.clear();
|
||||
waveStatusList.clear();
|
||||
shrMemInstAvail = nullptr;
|
||||
glbMemInstAvail = nullptr;
|
||||
}
|
||||
|
||||
void
|
||||
ScoreboardCheckStage::init(ComputeUnit *cu)
|
||||
{
|
||||
computeUnit = cu;
|
||||
_name = computeUnit->name() + ".ScoreboardCheckStage";
|
||||
|
||||
for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) {
|
||||
readyList.push_back(&computeUnit->readyList[unitId]);
|
||||
}
|
||||
|
||||
for (int unitId = 0; unitId < numSIMDs; ++unitId) {
|
||||
waveStatusList.push_back(&computeUnit->waveStatusList[unitId]);
|
||||
}
|
||||
|
||||
vectorAluInstAvail = &computeUnit->vectorAluInstAvail;
|
||||
glbMemInstAvail= &computeUnit->glbMemInstAvail;
|
||||
shrMemInstAvail= &computeUnit->shrMemInstAvail;
|
||||
}
|
||||
|
||||
void
|
||||
ScoreboardCheckStage::initStatistics()
|
||||
{
|
||||
lastGlbMemSimd = -1;
|
||||
lastShrMemSimd = -1;
|
||||
*glbMemInstAvail = 0;
|
||||
*shrMemInstAvail = 0;
|
||||
|
||||
for (int unitId = 0; unitId < numSIMDs; ++unitId)
|
||||
vectorAluInstAvail->at(unitId) = false;
|
||||
}
|
||||
|
||||
void
|
||||
ScoreboardCheckStage::collectStatistics(Wavefront *curWave, int unitId)
|
||||
{
|
||||
if (curWave->instructionBuffer.empty())
|
||||
return;
|
||||
|
||||
// track which vector SIMD unit has at least one WV with a vector
|
||||
// ALU as the oldest instruction in its Instruction buffer
|
||||
vectorAluInstAvail->at(unitId) = vectorAluInstAvail->at(unitId) ||
|
||||
curWave->isOldestInstALU();
|
||||
|
||||
// track how many vector SIMD units have at least one WV with a
|
||||
// vector Global memory instruction as the oldest instruction
|
||||
// in its Instruction buffer
|
||||
if ((curWave->isOldestInstGMem() || curWave->isOldestInstPrivMem() ||
|
||||
curWave->isOldestInstFlatMem()) && lastGlbMemSimd != unitId &&
|
||||
*glbMemInstAvail <= 1) {
|
||||
(*glbMemInstAvail)++;
|
||||
lastGlbMemSimd = unitId;
|
||||
}
|
||||
|
||||
// track how many vector SIMD units have at least one WV with a
|
||||
// vector shared memory (LDS) instruction as the oldest instruction
|
||||
// in its Instruction buffer
|
||||
// TODO: parametrize the limit of the LDS units
|
||||
if (curWave->isOldestInstLMem() && (*shrMemInstAvail <= numShrMemPipes) &&
|
||||
lastShrMemSimd != unitId) {
|
||||
(*shrMemInstAvail)++;
|
||||
lastShrMemSimd = unitId;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ScoreboardCheckStage::exec()
|
||||
{
|
||||
initStatistics();
|
||||
|
||||
// reset the ready list for all execution units; it will be
|
||||
// constructed every cycle since resource availability may change
|
||||
for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) {
|
||||
readyList[unitId]->clear();
|
||||
}
|
||||
|
||||
// iterate over the Wavefronts of all SIMD units
|
||||
for (int unitId = 0; unitId < numSIMDs; ++unitId) {
|
||||
for (int wvId = 0; wvId < computeUnit->shader->n_wf; ++wvId) {
|
||||
// reset the ready status of each wavefront
|
||||
waveStatusList[unitId]->at(wvId).second = BLOCKED;
|
||||
Wavefront *curWave = waveStatusList[unitId]->at(wvId).first;
|
||||
collectStatistics(curWave, unitId);
|
||||
|
||||
if (curWave->ready(Wavefront::I_ALU)) {
|
||||
readyList[unitId]->push_back(curWave);
|
||||
waveStatusList[unitId]->at(wvId).second = READY;
|
||||
} else if (curWave->ready(Wavefront::I_GLOBAL)) {
|
||||
if (computeUnit->cedeSIMD(unitId, wvId)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
|
||||
waveStatusList[unitId]->at(wvId).second = READY;
|
||||
} else if (curWave->ready(Wavefront::I_SHARED)) {
|
||||
readyList[computeUnit->ShrMemUnitId()]->push_back(curWave);
|
||||
waveStatusList[unitId]->at(wvId).second = READY;
|
||||
} else if (curWave->ready(Wavefront::I_FLAT)) {
|
||||
readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
|
||||
waveStatusList[unitId]->at(wvId).second = READY;
|
||||
} else if (curWave->ready(Wavefront::I_PRIVATE)) {
|
||||
readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
|
||||
waveStatusList[unitId]->at(wvId).second = READY;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ScoreboardCheckStage::regStats()
|
||||
{
|
||||
}
|
106
src/gpu-compute/scoreboard_check_stage.hh
Normal file
106
src/gpu-compute/scoreboard_check_stage.hh
Normal file
|
@ -0,0 +1,106 @@
|
|||
/*
|
||||
* Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Sooraj Puthoor
|
||||
*/
|
||||
|
||||
#ifndef __SCOREBOARD_CHECK_STAGE_HH__
|
||||
#define __SCOREBOARD_CHECK_STAGE_HH__
|
||||
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
class ComputeUnit;
|
||||
class Wavefront;
|
||||
|
||||
struct ComputeUnitParams;
|
||||
|
||||
enum WAVE_STATUS
|
||||
{
|
||||
BLOCKED = 0,
|
||||
READY
|
||||
};
|
||||
|
||||
/*
|
||||
* Scoreboard check stage.
|
||||
* All wavefronts are analyzed to see if they are ready
|
||||
* to be executed this cycle. Both structural and data
|
||||
* hazards are considered while marking a wave "ready"
|
||||
* for execution. After analysis, the ready waves are
|
||||
* added to readyList.
|
||||
*/
|
||||
class ScoreboardCheckStage
|
||||
{
|
||||
public:
|
||||
ScoreboardCheckStage(const ComputeUnitParams* params);
|
||||
~ScoreboardCheckStage();
|
||||
void init(ComputeUnit *cu);
|
||||
void exec();
|
||||
|
||||
// Stats related variables and methods
|
||||
const std::string& name() const { return _name; }
|
||||
void regStats();
|
||||
|
||||
private:
|
||||
void collectStatistics(Wavefront *curWave, int unitId);
|
||||
void initStatistics();
|
||||
ComputeUnit *computeUnit;
|
||||
uint32_t numSIMDs;
|
||||
uint32_t numMemUnits;
|
||||
uint32_t numGlbMemPipes;
|
||||
uint32_t numShrMemPipes;
|
||||
|
||||
// flag per vector SIMD unit that is set when there is at least one
|
||||
// WF that has a vector ALU instruction as the oldest in its
|
||||
// Instruction Buffer
|
||||
std::vector<bool> *vectorAluInstAvail;
|
||||
int lastGlbMemSimd;
|
||||
int lastShrMemSimd;
|
||||
|
||||
int *glbMemInstAvail;
|
||||
int *shrMemInstAvail;
|
||||
// List of waves which are ready to be scheduled.
|
||||
// Each execution resource has a ready list
|
||||
std::vector<std::vector<Wavefront*>*> readyList;
|
||||
|
||||
// Stores the status of waves. A READY implies the
|
||||
// wave is ready to be scheduled this cycle and
|
||||
// is already present in the readyList
|
||||
std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>*>
|
||||
waveStatusList;
|
||||
|
||||
std::string _name;
|
||||
};
|
||||
|
||||
#endif // __SCOREBOARD_CHECK_STAGE_HH__
|
412
src/gpu-compute/shader.cc
Normal file
412
src/gpu-compute/shader.cc
Normal file
|
@ -0,0 +1,412 @@
|
|||
/*
|
||||
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Steve Reinhardt
|
||||
*/
|
||||
|
||||
#include "gpu-compute/shader.hh"
|
||||
|
||||
#include <limits>
|
||||
|
||||
#include "arch/x86/linux/linux.hh"
|
||||
#include "base/chunk_generator.hh"
|
||||
#include "debug/GPUDisp.hh"
|
||||
#include "debug/GPUMem.hh"
|
||||
#include "debug/HSAIL.hh"
|
||||
#include "gpu-compute/dispatcher.hh"
|
||||
#include "gpu-compute/gpu_static_inst.hh"
|
||||
#include "gpu-compute/qstruct.hh"
|
||||
#include "gpu-compute/wavefront.hh"
|
||||
#include "mem/packet.hh"
|
||||
#include "mem/ruby/system/RubySystem.hh"
|
||||
#include "sim/sim_exit.hh"
|
||||
|
||||
Shader::Shader(const Params *p) : SimObject(p),
|
||||
clock(p->clk_domain->clockPeriod()), cpuThread(nullptr), gpuTc(nullptr),
|
||||
cpuPointer(p->cpu_pointer), tickEvent(this), timingSim(p->timing),
|
||||
hsail_mode(SIMT), impl_kern_boundary_sync(p->impl_kern_boundary_sync),
|
||||
separate_acquire_release(p->separate_acquire_release), coissue_return(1),
|
||||
trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf),
|
||||
globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0),
|
||||
box_tick_cnt(0), start_tick_cnt(0)
|
||||
{
|
||||
|
||||
cuList.resize(n_cu);
|
||||
|
||||
for (int i = 0; i < n_cu; ++i) {
|
||||
cuList[i] = p->CUs[i];
|
||||
assert(i == cuList[i]->cu_id);
|
||||
cuList[i]->shader = this;
|
||||
}
|
||||
}
|
||||
|
||||
Addr
|
||||
Shader::mmap(int length)
|
||||
{
|
||||
|
||||
Addr start;
|
||||
|
||||
// round up length to the next page
|
||||
length = roundUp(length, TheISA::PageBytes);
|
||||
|
||||
if (X86Linux64::mmapGrowsDown()) {
|
||||
DPRINTF(HSAIL, "GROWS DOWN");
|
||||
start = gpuTc->getProcessPtr()->mmap_end -length;
|
||||
gpuTc->getProcessPtr()->mmap_end = start;
|
||||
} else {
|
||||
DPRINTF(HSAIL, "GROWS UP");
|
||||
start = gpuTc->getProcessPtr()->mmap_end;
|
||||
gpuTc->getProcessPtr()->mmap_end += length;
|
||||
|
||||
// assertion to make sure we don't overwrite the stack (it grows down)
|
||||
assert(gpuTc->getProcessPtr()->mmap_end <
|
||||
gpuTc->getProcessPtr()->stack_base -
|
||||
gpuTc->getProcessPtr()->max_stack_size);
|
||||
|
||||
}
|
||||
|
||||
DPRINTF(HSAIL,"Shader::mmap start= %#x, %#x\n", start, length);
|
||||
|
||||
gpuTc->getProcessPtr()->allocateMem(start,length);
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
void
|
||||
Shader::init()
|
||||
{
|
||||
// grab the threadContext of the thread running on the CPU
|
||||
assert(cpuPointer);
|
||||
gpuTc = cpuPointer->getContext(0);
|
||||
assert(gpuTc);
|
||||
}
|
||||
|
||||
Shader::~Shader()
|
||||
{
|
||||
for (int j = 0; j < n_cu; ++j)
|
||||
delete cuList[j];
|
||||
}
|
||||
|
||||
void
|
||||
Shader::updateThreadContext(int tid) {
|
||||
// thread context of the thread which dispatched work
|
||||
assert(cpuPointer);
|
||||
gpuTc = cpuPointer->getContext(tid);
|
||||
assert(gpuTc);
|
||||
}
|
||||
|
||||
void
|
||||
Shader::hostWakeUp(BaseCPU *cpu) {
|
||||
if (cpuPointer == cpu) {
|
||||
if (gpuTc->status() == ThreadContext::Suspended)
|
||||
cpu->activateContext(gpuTc->threadId());
|
||||
} else {
|
||||
//Make sure both dispatcher and shader are trying to
|
||||
//wakeup same host. Hack here to enable kernel launch
|
||||
//from multiple CPUs
|
||||
panic("Dispatcher wants to wakeup a different host");
|
||||
}
|
||||
}
|
||||
|
||||
Shader*
|
||||
ShaderParams::create()
|
||||
{
|
||||
return new Shader(this);
|
||||
}
|
||||
|
||||
void
|
||||
Shader::exec()
|
||||
{
|
||||
tick_cnt = curTick();
|
||||
box_tick_cnt = curTick() - start_tick_cnt;
|
||||
|
||||
// apply any scheduled adds
|
||||
for (int i = 0; i < sa_n; ++i) {
|
||||
if (sa_when[i] <= tick_cnt) {
|
||||
*sa_val[i] += sa_x[i];
|
||||
sa_val.erase(sa_val.begin() + i);
|
||||
sa_x.erase(sa_x.begin() + i);
|
||||
sa_when.erase(sa_when.begin() + i);
|
||||
--sa_n;
|
||||
--i;
|
||||
}
|
||||
}
|
||||
|
||||
// clock all of the cu's
|
||||
for (int i = 0; i < n_cu; ++i)
|
||||
cuList[i]->exec();
|
||||
}
|
||||
|
||||
bool
|
||||
Shader::dispatch_workgroups(NDRange *ndr)
|
||||
{
|
||||
bool scheduledSomething = false;
|
||||
int cuCount = 0;
|
||||
int curCu = nextSchedCu;
|
||||
|
||||
while (cuCount < n_cu) {
|
||||
//Every time we try a CU, update nextSchedCu
|
||||
nextSchedCu = (nextSchedCu + 1) % n_cu;
|
||||
|
||||
// dispatch workgroup iff the following two conditions are met:
|
||||
// (a) wg_rem is true - there are unassigned workgroups in the grid
|
||||
// (b) there are enough free slots in cu cuList[i] for this wg
|
||||
if (ndr->wg_disp_rem && cuList[curCu]->ReadyWorkgroup(ndr)) {
|
||||
scheduledSomething = true;
|
||||
DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d\n", curCu);
|
||||
|
||||
// ticks() member function translates cycles to simulation ticks.
|
||||
if (!tickEvent.scheduled()) {
|
||||
schedule(tickEvent, curTick() + this->ticks(1));
|
||||
}
|
||||
|
||||
cuList[curCu]->StartWorkgroup(ndr);
|
||||
ndr->wgId[0]++;
|
||||
ndr->globalWgId++;
|
||||
if (ndr->wgId[0] * ndr->q.wgSize[0] >= ndr->q.gdSize[0]) {
|
||||
ndr->wgId[0] = 0;
|
||||
ndr->wgId[1]++;
|
||||
|
||||
if (ndr->wgId[1] * ndr->q.wgSize[1] >= ndr->q.gdSize[1]) {
|
||||
ndr->wgId[1] = 0;
|
||||
ndr->wgId[2]++;
|
||||
|
||||
if (ndr->wgId[2] * ndr->q.wgSize[2] >= ndr->q.gdSize[2]) {
|
||||
ndr->wg_disp_rem = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
++cuCount;
|
||||
curCu = nextSchedCu;
|
||||
}
|
||||
|
||||
return scheduledSomething;
|
||||
}
|
||||
|
||||
void
|
||||
Shader::handshake(GpuDispatcher *_dispatcher)
|
||||
{
|
||||
dispatcher = _dispatcher;
|
||||
}
|
||||
|
||||
void
|
||||
Shader::doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data,
|
||||
bool suppress_func_errors, int cu_id)
|
||||
{
|
||||
unsigned block_size = RubySystem::getBlockSizeBytes();
|
||||
unsigned size = req->getSize();
|
||||
|
||||
Addr tmp_addr;
|
||||
BaseTLB::Mode trans_mode;
|
||||
|
||||
if (cmd == MemCmd::ReadReq) {
|
||||
trans_mode = BaseTLB::Read;
|
||||
} else if (cmd == MemCmd::WriteReq) {
|
||||
trans_mode = BaseTLB::Write;
|
||||
} else {
|
||||
fatal("unexcepted MemCmd\n");
|
||||
}
|
||||
|
||||
tmp_addr = req->getVaddr();
|
||||
Addr split_addr = roundDown(tmp_addr + size - 1, block_size);
|
||||
|
||||
assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);
|
||||
|
||||
// Misaligned access
|
||||
if (split_addr > tmp_addr) {
|
||||
RequestPtr req1, req2;
|
||||
req->splitOnVaddr(split_addr, req1, req2);
|
||||
|
||||
|
||||
PacketPtr pkt1 = new Packet(req2, cmd);
|
||||
PacketPtr pkt2 = new Packet(req1, cmd);
|
||||
|
||||
functionalTLBAccess(pkt1, cu_id, trans_mode);
|
||||
functionalTLBAccess(pkt2, cu_id, trans_mode);
|
||||
|
||||
PacketPtr new_pkt1 = new Packet(pkt1->req, cmd);
|
||||
PacketPtr new_pkt2 = new Packet(pkt2->req, cmd);
|
||||
|
||||
new_pkt1->dataStatic(data);
|
||||
new_pkt2->dataStatic((uint8_t*)data + req1->getSize());
|
||||
|
||||
if (suppress_func_errors) {
|
||||
new_pkt1->setSuppressFuncError();
|
||||
new_pkt2->setSuppressFuncError();
|
||||
}
|
||||
|
||||
// fixme: this should be cuList[cu_id] if cu_id != n_cu
|
||||
// The latter requires a memPort in the dispatcher
|
||||
cuList[0]->memPort[0]->sendFunctional(new_pkt1);
|
||||
cuList[0]->memPort[0]->sendFunctional(new_pkt2);
|
||||
|
||||
delete new_pkt1;
|
||||
delete new_pkt2;
|
||||
delete pkt1;
|
||||
delete pkt2;
|
||||
} else {
|
||||
PacketPtr pkt = new Packet(req, cmd);
|
||||
functionalTLBAccess(pkt, cu_id, trans_mode);
|
||||
PacketPtr new_pkt = new Packet(pkt->req, cmd);
|
||||
new_pkt->dataStatic(data);
|
||||
|
||||
if (suppress_func_errors) {
|
||||
new_pkt->setSuppressFuncError();
|
||||
};
|
||||
|
||||
// fixme: this should be cuList[cu_id] if cu_id != n_cu
|
||||
// The latter requires a memPort in the dispatcher
|
||||
cuList[0]->memPort[0]->sendFunctional(new_pkt);
|
||||
|
||||
delete new_pkt;
|
||||
delete pkt;
|
||||
}
|
||||
}
|
||||
|
||||
bool
|
||||
Shader::busy()
|
||||
{
|
||||
for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
|
||||
if (!cuList[i_cu]->isDone()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void
|
||||
Shader::ScheduleAdd(uint32_t *val,Tick when,int x)
|
||||
{
|
||||
sa_val.push_back(val);
|
||||
sa_when.push_back(tick_cnt + when);
|
||||
sa_x.push_back(x);
|
||||
++sa_n;
|
||||
}
|
||||
|
||||
Shader::TickEvent::TickEvent(Shader *_shader)
|
||||
: Event(CPU_Tick_Pri), shader(_shader)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
Shader::TickEvent::process()
|
||||
{
|
||||
if (shader->busy()) {
|
||||
shader->exec();
|
||||
shader->schedule(this, curTick() + shader->ticks(1));
|
||||
}
|
||||
}
|
||||
|
||||
const char*
|
||||
Shader::TickEvent::description() const
|
||||
{
|
||||
return "Shader tick";
|
||||
}
|
||||
|
||||
void
|
||||
Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
|
||||
MemCmd cmd, bool suppress_func_errors)
|
||||
{
|
||||
uint8_t *data_buf = (uint8_t*)ptr;
|
||||
|
||||
for (ChunkGenerator gen(address, size, RubySystem::getBlockSizeBytes());
|
||||
!gen.done(); gen.next()) {
|
||||
Request *req = new Request(0, gen.addr(), gen.size(), 0,
|
||||
cuList[0]->masterId(), 0, 0, 0);
|
||||
|
||||
doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id);
|
||||
data_buf += gen.size();
|
||||
delete req;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
|
||||
{
|
||||
AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false);
|
||||
}
|
||||
|
||||
void
|
||||
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
|
||||
bool suppress_func_errors)
|
||||
{
|
||||
AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, suppress_func_errors);
|
||||
}
|
||||
|
||||
void
|
||||
Shader::WriteMem(uint64_t address, void *ptr,uint32_t size, int cu_id)
|
||||
{
|
||||
AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false);
|
||||
}
|
||||
|
||||
void
|
||||
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
|
||||
bool suppress_func_errors)
|
||||
{
|
||||
AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq,
|
||||
suppress_func_errors);
|
||||
}
|
||||
|
||||
/*
|
||||
* Send a packet through the appropriate TLB functional port.
|
||||
* If cu_id=n_cu, then this is the dispatcher's TLB.
|
||||
* Otherwise it's the TLB of the cu_id compute unit.
|
||||
*/
|
||||
void
|
||||
Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
|
||||
{
|
||||
// update senderState. Need to know the gpuTc and the TLB mode
|
||||
pkt->senderState =
|
||||
new TheISA::GpuTLB::TranslationState(mode, gpuTc, false);
|
||||
|
||||
if (cu_id == n_cu) {
|
||||
dispatcher->tlbPort->sendFunctional(pkt);
|
||||
} else {
|
||||
// even when the perLaneTLB flag is turned on
|
||||
// it's ok tp send all accesses through lane 0
|
||||
// since the lane # is not known here,
|
||||
// This isn't important since these are functional accesses.
|
||||
cuList[cu_id]->tlbPort[0]->sendFunctional(pkt);
|
||||
}
|
||||
|
||||
/* safe_cast the senderState */
|
||||
TheISA::GpuTLB::TranslationState *sender_state =
|
||||
safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
|
||||
|
||||
delete sender_state->tlbEntry;
|
||||
delete pkt->senderState;
|
||||
}
|
212
src/gpu-compute/shader.hh
Normal file
212
src/gpu-compute/shader.hh
Normal file
|
@ -0,0 +1,212 @@
|
|||
/*
|
||||
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Steve Reinhardt
|
||||
*/
|
||||
|
||||
#ifndef __SHADER_HH__
|
||||
#define __SHADER_HH__
|
||||
|
||||
#include <functional>
|
||||
#include <string>
|
||||
|
||||
#include "arch/isa.hh"
|
||||
#include "arch/isa_traits.hh"
|
||||
#include "base/types.hh"
|
||||
#include "cpu/simple/atomic.hh"
|
||||
#include "cpu/simple/timing.hh"
|
||||
#include "cpu/simple_thread.hh"
|
||||
#include "cpu/thread_context.hh"
|
||||
#include "cpu/thread_state.hh"
|
||||
#include "enums/MemOpType.hh"
|
||||
#include "enums/MemType.hh"
|
||||
#include "gpu-compute/compute_unit.hh"
|
||||
#include "gpu-compute/gpu_tlb.hh"
|
||||
#include "gpu-compute/lds_state.hh"
|
||||
#include "gpu-compute/qstruct.hh"
|
||||
#include "mem/page_table.hh"
|
||||
#include "mem/port.hh"
|
||||
#include "mem/request.hh"
|
||||
#include "params/Shader.hh"
|
||||
#include "sim/faults.hh"
|
||||
#include "sim/process.hh"
|
||||
#include "sim/sim_object.hh"
|
||||
|
||||
class BaseTLB;
|
||||
class GpuDispatcher;
|
||||
|
||||
namespace TheISA
|
||||
{
|
||||
class GpuTLB;
|
||||
}
|
||||
|
||||
static const int LDS_SIZE = 65536;
|
||||
|
||||
// Class Shader: This describes a single shader instance. Most
|
||||
// configurations will only have a single shader.
|
||||
|
||||
class Shader : public SimObject
|
||||
{
|
||||
protected:
|
||||
// Shader's clock period in terms of number of ticks of curTime,
|
||||
// aka global simulation clock
|
||||
Tick clock;
|
||||
|
||||
public:
|
||||
typedef ShaderParams Params;
|
||||
enum hsail_mode_e {SIMT,VECTOR_SCALAR};
|
||||
|
||||
// clock related functions ; maps to-and-from
|
||||
// Simulation ticks and shader clocks.
|
||||
Tick frequency() const { return SimClock::Frequency / clock; }
|
||||
|
||||
Tick ticks(int numCycles) const { return (Tick)clock * numCycles; }
|
||||
|
||||
Tick getClock() const { return clock; }
|
||||
Tick curCycle() const { return curTick() / clock; }
|
||||
Tick tickToCycles(Tick val) const { return val / clock;}
|
||||
|
||||
|
||||
SimpleThread *cpuThread;
|
||||
ThreadContext *gpuTc;
|
||||
BaseCPU *cpuPointer;
|
||||
|
||||
class TickEvent : public Event
|
||||
{
|
||||
private:
|
||||
Shader *shader;
|
||||
|
||||
public:
|
||||
TickEvent(Shader*);
|
||||
void process();
|
||||
const char* description() const;
|
||||
};
|
||||
|
||||
TickEvent tickEvent;
|
||||
|
||||
// is this simulation going to be timing mode in the memory?
|
||||
bool timingSim;
|
||||
hsail_mode_e hsail_mode;
|
||||
|
||||
// If set, issue acq packet @ kernel launch
|
||||
int impl_kern_boundary_sync;
|
||||
// If set, generate a separate packet for acquire/release on
|
||||
// ld_acquire/st_release/atomic operations
|
||||
int separate_acquire_release;
|
||||
// If set, fetch returns may be coissued with instructions
|
||||
int coissue_return;
|
||||
// If set, always dump all 64 gprs to trace
|
||||
int trace_vgpr_all;
|
||||
// Number of cu units in the shader
|
||||
int n_cu;
|
||||
// Number of wavefront slots per cu
|
||||
int n_wf;
|
||||
// The size of global memory
|
||||
int globalMemSize;
|
||||
|
||||
/*
|
||||
* Bytes/work-item for call instruction
|
||||
* The number of arguments for an hsail function will
|
||||
* vary. We simply determine the maximum # of arguments
|
||||
* required by any hsail function up front before the
|
||||
* simulation (during parsing of the Brig) and record
|
||||
* that number here.
|
||||
*/
|
||||
int funcargs_size;
|
||||
|
||||
// Tracks CU that rr dispatcher should attempt scheduling
|
||||
int nextSchedCu;
|
||||
|
||||
// Size of scheduled add queue
|
||||
uint32_t sa_n;
|
||||
|
||||
// Pointer to value to be increments
|
||||
std::vector<uint32_t*> sa_val;
|
||||
// When to do the increment
|
||||
std::vector<uint64_t> sa_when;
|
||||
// Amount to increment by
|
||||
std::vector<int32_t> sa_x;
|
||||
|
||||
// List of Compute Units (CU's)
|
||||
std::vector<ComputeUnit*> cuList;
|
||||
|
||||
uint64_t tick_cnt;
|
||||
uint64_t box_tick_cnt;
|
||||
uint64_t start_tick_cnt;
|
||||
|
||||
GpuDispatcher *dispatcher;
|
||||
|
||||
Shader(const Params *p);
|
||||
~Shader();
|
||||
virtual void init();
|
||||
|
||||
// Run shader
|
||||
void exec();
|
||||
|
||||
// Check to see if shader is busy
|
||||
bool busy();
|
||||
|
||||
// Schedule a 32-bit value to be incremented some time in the future
|
||||
void ScheduleAdd(uint32_t *val, Tick when, int x);
|
||||
bool processTimingPacket(PacketPtr pkt);
|
||||
|
||||
void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
|
||||
MemCmd cmd, bool suppress_func_errors);
|
||||
|
||||
void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id);
|
||||
|
||||
void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id,
|
||||
bool suppress_func_errors);
|
||||
|
||||
void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id);
|
||||
|
||||
void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id,
|
||||
bool suppress_func_errors);
|
||||
|
||||
void doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data,
|
||||
bool suppress_func_errors, int cu_id);
|
||||
|
||||
void
|
||||
registerCU(int cu_id, ComputeUnit *compute_unit)
|
||||
{
|
||||
cuList[cu_id] = compute_unit;
|
||||
}
|
||||
|
||||
void handshake(GpuDispatcher *dispatcher);
|
||||
bool dispatch_workgroups(NDRange *ndr);
|
||||
Addr mmap(int length);
|
||||
void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode);
|
||||
void updateThreadContext(int tid);
|
||||
void hostWakeUp(BaseCPU *cpu);
|
||||
};
|
||||
|
||||
#endif // __SHADER_HH__
|
108
src/gpu-compute/simple_pool_manager.cc
Normal file
108
src/gpu-compute/simple_pool_manager.cc
Normal file
|
@ -0,0 +1,108 @@
|
|||
/*
|
||||
* Copyright (c) 2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: John Kalamatianos
|
||||
*/
|
||||
|
||||
#include "gpu-compute/simple_pool_manager.hh"
|
||||
|
||||
#include "base/misc.hh"
|
||||
|
||||
// return the min number of elements that the manager can reserve given
|
||||
// a request for "size" elements
|
||||
uint32_t
|
||||
SimplePoolManager::minAllocatedElements(uint32_t size)
|
||||
{
|
||||
fatal_if(size <= 0 || size > poolSize(), "Illegal VGPR region size=%d\n",
|
||||
size);
|
||||
|
||||
return size % minAllocation() > 0 ?
|
||||
(minAllocation() - (size % minAllocation())) + size : size;
|
||||
}
|
||||
|
||||
std::string
|
||||
SimplePoolManager::printRegion()
|
||||
{
|
||||
std::string _cout;
|
||||
if (_reservedGroups == 0)
|
||||
_cout = "VRF is empty\n";
|
||||
else if (_reservedGroups > 0) {
|
||||
uint32_t reservedEntries = _reservedGroups * _regionSize;
|
||||
_cout = "VRF reserves " + std::to_string(reservedEntries) + " VGPRs\n";
|
||||
}
|
||||
|
||||
return _cout;
|
||||
}
|
||||
|
||||
bool
|
||||
SimplePoolManager::canAllocate(uint32_t numRegions, uint32_t size)
|
||||
{
|
||||
assert(numRegions * minAllocatedElements(size) <= poolSize());
|
||||
|
||||
return _reservedGroups == 0;
|
||||
}
|
||||
|
||||
void
|
||||
SimplePoolManager::freeRegion(uint32_t firstIdx, uint32_t lastIdx)
|
||||
{
|
||||
assert(_reservedGroups > 0);
|
||||
--_reservedGroups;
|
||||
|
||||
if (!_reservedGroups)
|
||||
_nxtFreeIdx = 0;
|
||||
}
|
||||
|
||||
uint32_t
|
||||
SimplePoolManager::allocateRegion(const uint32_t size,
|
||||
uint32_t *reservedPoolSize)
|
||||
{
|
||||
uint32_t actualSize = minAllocatedElements(size);
|
||||
uint32_t startIdx = _nxtFreeIdx;
|
||||
_nxtFreeIdx += actualSize;
|
||||
_regionSize = actualSize;
|
||||
assert(_nxtFreeIdx < poolSize());
|
||||
*reservedPoolSize = actualSize;
|
||||
++_reservedGroups;
|
||||
|
||||
return startIdx;
|
||||
}
|
||||
|
||||
uint32_t
|
||||
SimplePoolManager::regionSize(std::pair<uint32_t, uint32_t> ®ion)
|
||||
{
|
||||
bool wrapAround = (region.first > region.second);
|
||||
if (!wrapAround) {
|
||||
return region.second - region.first + 1;
|
||||
} else {
|
||||
return region.second + poolSize() - region.first + 1;
|
||||
}
|
||||
}
|
72
src/gpu-compute/simple_pool_manager.hh
Normal file
72
src/gpu-compute/simple_pool_manager.hh
Normal file
|
@ -0,0 +1,72 @@
|
|||
/*
|
||||
* Copyright (c) 2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: John Kalamatianos
|
||||
*/
|
||||
|
||||
#ifndef __SIMPLE_POOL_MANAGER_HH__
|
||||
#define __SIMPLE_POOL_MANAGER_HH__
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
|
||||
#include "gpu-compute/pool_manager.hh"
|
||||
|
||||
// Simple Pool Manager: allows one region per pool. No region merging is
|
||||
// supported.
|
||||
class SimplePoolManager : public PoolManager
|
||||
{
|
||||
public:
|
||||
SimplePoolManager(uint32_t minAlloc, uint32_t poolSize)
|
||||
: PoolManager(minAlloc, poolSize), _regionSize(0), _nxtFreeIdx(0),
|
||||
_reservedGroups(0)
|
||||
{
|
||||
}
|
||||
|
||||
uint32_t minAllocatedElements(uint32_t size);
|
||||
std::string printRegion();
|
||||
bool canAllocate(uint32_t numRegions, uint32_t size);
|
||||
uint32_t allocateRegion(const uint32_t size, uint32_t *reservedPoolSize);
|
||||
void freeRegion(uint32_t firstIdx, uint32_t lastIdx);
|
||||
uint32_t regionSize(std::pair<uint32_t,uint32_t> ®ion);
|
||||
|
||||
private:
|
||||
// actual size of a region (normalized to the minimum size that can
|
||||
// be reserved)
|
||||
uint32_t _regionSize;
|
||||
// next index to allocate a region
|
||||
uint8_t _nxtFreeIdx;
|
||||
// number of groups that reserve a region
|
||||
uint32_t _reservedGroups;
|
||||
};
|
||||
|
||||
#endif // __SIMPLE_POOL_MANAGER_HH__
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue