config: Updates for distributed gem5 simulations
This commit is contained in:
parent
5dec4e07b8
commit
64ca31976f
7 changed files with 575 additions and 319 deletions
|
@ -654,3 +654,39 @@ def makeDualRoot(full_system, testSystem, driveSystem, dumpfile):
|
|||
self.etherlink.dump = Parent.etherdump
|
||||
|
||||
return self
|
||||
|
||||
|
||||
def makeDistRoot(testSystem,
                 rank,
                 size,
                 server_name,
                 server_port,
                 sync_repeat,
                 sync_start,
                 linkspeed,
                 linkdelay,
                 dumpfile):
    """Build the Root object for one node of a dist-gem5 simulation.

    The given test system is attached to a full-system Root as ``testsys``
    and its Ethernet device is wired to a DistEtherLink.

    Arguments:
        testSystem  -- the system simulated by this gem5 process
        rank, size  -- forwarded as dist_rank / dist_size of the link
        server_name, server_port -- forwarded to the link unchanged
        sync_repeat, sync_start  -- forwarded to the link unchanged
        linkspeed, linkdelay     -- forwarded as speed / delay of the link
        dumpfile    -- if truthy, an EtherDump writing to this file is
                       created and attached to the link

    Only systems exposing a 'realview' or a 'tsunami' attribute are
    supported; fatal() is called for anything else.
    """
    root = Root(full_system = True)
    root.testsys = testSystem

    root.etherlink = DistEtherLink(speed = linkspeed,
                                   delay = linkdelay,
                                   dist_rank = rank,
                                   dist_size = size,
                                   server_name = server_name,
                                   server_port = server_port,
                                   sync_start = sync_start,
                                   sync_repeat = sync_repeat)

    # Hook the link up to the Ethernet device of the platform in use
    if hasattr(testSystem, 'realview'):
        root.etherlink.int0 = Parent.testsys.realview.ethernet.interface
    elif hasattr(testSystem, 'tsunami'):
        root.etherlink.int0 = Parent.testsys.tsunami.ethernet.interface
    else:
        fatal("Don't know how to connect DistEtherLink to this system")

    # Packet dumping is optional
    if dumpfile:
        root.etherdump = EtherDump(file=dumpfile)
        root.etherlink.dump = Parent.etherdump

    return root
|
||||
|
|
|
@ -297,10 +297,41 @@ def addFSOptions(parser):
|
|||
# Benchmark options
|
||||
parser.add_option("--dual", action="store_true",
|
||||
help="Simulate two systems attached with an ethernet link")
|
||||
parser.add_option("--dist", action="store_true",
|
||||
help="Parallel distributed gem5 simulation.")
|
||||
parser.add_option("--is-switch", action="store_true",
|
||||
help="Select the network switch simulator process for a"\
|
||||
"distributed gem5 run")
|
||||
parser.add_option("--dist-rank", default=0, action="store", type="int",
|
||||
help="Rank of this system within the dist gem5 run.")
|
||||
parser.add_option("--dist-size", default=0, action="store", type="int",
|
||||
help="Number of gem5 processes within the dist gem5 run.")
|
||||
parser.add_option("--dist-server-name",
|
||||
default="127.0.0.1",
|
||||
action="store", type="string",
|
||||
help="Name of the message server host\nDEFAULT: localhost")
|
||||
parser.add_option("--dist-server-port",
|
||||
default=2200,
|
||||
action="store", type="int",
|
||||
help="Message server listen port\nDEFAULT: 2200")
|
||||
parser.add_option("--dist-sync-repeat",
|
||||
default="0us",
|
||||
action="store", type="string",
|
||||
help="Repeat interval for synchronisation barriers among dist-gem5 processes\nDEFAULT: --ethernet-linkdelay")
|
||||
parser.add_option("--dist-sync-start",
|
||||
default="5200000000000t",
|
||||
action="store", type="string",
|
||||
help="Time to schedule the first dist synchronisation barrier\nDEFAULT:5200000000000t")
|
||||
parser.add_option("-b", "--benchmark", action="store", type="string",
|
||||
dest="benchmark",
|
||||
help="Specify the benchmark to run. Available benchmarks: %s"\
|
||||
% DefinedBenchmarks)
|
||||
parser.add_option("--ethernet-linkspeed", default="10Gbps",
|
||||
action="store", type="string",
|
||||
help="Link speed in bps\nDEFAULT: 10Gbps")
|
||||
parser.add_option("--ethernet-linkdelay", default="10us",
|
||||
action="store", type="string",
|
||||
help="Link delay in seconds\nDEFAULT: 10us")
|
||||
|
||||
# Metafile options
|
||||
parser.add_option("--etherdump", action="store", type="string", dest="etherdump",
|
||||
|
|
|
@ -340,6 +340,18 @@ test_sys = build_test_system(np)
|
|||
if len(bm) == 2:
|
||||
drive_sys = build_drive_system(np)
|
||||
root = makeDualRoot(True, test_sys, drive_sys, options.etherdump)
|
||||
elif len(bm) == 1 and options.dist:
|
||||
# This system is part of a dist-gem5 simulation
|
||||
root = makeDistRoot(test_sys,
|
||||
options.dist_rank,
|
||||
options.dist_size,
|
||||
options.dist_server_name,
|
||||
options.dist_server_port,
|
||||
options.dist_sync_repeat,
|
||||
options.dist_sync_start,
|
||||
options.ethernet_linkspeed,
|
||||
options.ethernet_linkdelay,
|
||||
options.etherdump);
|
||||
elif len(bm) == 1:
|
||||
root = Root(full_system=True, system=test_sys)
|
||||
else:
|
||||
|
|
385
util/dist/gem5-dist.sh
vendored
Executable file
385
util/dist/gem5-dist.sh
vendored
Executable file
|
@ -0,0 +1,385 @@
|
|||
#! /bin/bash
|
||||
|
||||
#
|
||||
# Copyright (c) 2015 ARM Limited
|
||||
# All rights reserved
|
||||
#
|
||||
# The license below extends only to copyright in the software and shall
|
||||
# not be construed as granting a license to any other intellectual
|
||||
# property including but not limited to intellectual property relating
|
||||
# to a hardware implementation of the functionality of the software
|
||||
# licensed hereunder. You may use the software subject to the license
|
||||
# terms below provided that you ensure that this notice is replicated
|
||||
# unmodified and in its entirety in all distributions of the software,
|
||||
# modified or unmodified, in source code or in binary form.
|
||||
#
|
||||
# Copyright (c) 2015 University of Illinois Urbana Champaign
|
||||
# All rights reserved
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met: redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer;
|
||||
# redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution;
|
||||
# neither the name of the copyright holders nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# Authors: Gabor Dozsa
|
||||
# Mohammad Alian
|
||||
|
||||
|
||||
# This is a wrapper script to run dist gem5 simulations.
|
||||
# See the usage_func() below for hints on how to use it. Also,
|
||||
# there are some examples in the util/dist directory (e.g.
|
||||
# see util/dist/test/test-2nodes-AArch64.sh)
|
||||
#
|
||||
#
|
||||
# Allocated hosts/cores are assumed to be listed in the LSB_MCPU_HOSTS
|
||||
# environment variable (which is what LSF does by default).
|
||||
# E.g. LSB_MCPU_HOSTS=\"hname1 2 hname2 4\" means we have altogether 6 slots
|
||||
# allocated to launch the gem5 processes, 2 of them are on host hname1
|
||||
# and 4 of them are on host hname2.
|
||||
# If LSB_MCPU_HOSTS environment variable is not defined then we launch all
|
||||
# processes on the localhost.
|
||||
#
|
||||
# Each gem5 process is passed a unique rank ID [0..N-1] via the kernel
|
||||
# boot params. The total number of gem5 processes is also passed in.
|
||||
# These values can be used in the boot script to configure the MAC/IP
|
||||
# addresses - among other things (see util/dist/bootscript.rcS).
|
||||
#
|
||||
# Each gem5 process will create an m5out.$GEM5_RANK directory for
|
||||
# the usual output files. Furthermore, there will be a separate log file
|
||||
# for each ssh session (we use ssh to start gem5 processes) and one for
|
||||
# the server. These are called log.$GEM5_RANK and log.switch.
|
||||
#
|
||||
|
||||
|
||||
# print help
|
||||
usage_func ()
{
    # Print the command line synopsis followed by one line per argument.
    # The short option names shown here must match the ones accepted by the
    # option parsing loop below, i.e. -s (switch) and -f (fullsystem).
    echo "Usage:$0 [-debug] [-n nnodes] [-r rundir] [-c ckptdir] [-p port] [-s switch] [--sw-args sw_args] [-f fullsystem] [--fs-args fs_args] [--cf-args conf_args] [--m5-args m5_args] -x gem5_exe "
    echo " -debug : debug mode (start gem5 in gdb)"
    echo " nnodes : number of gem5 processes"
    echo " rundir : run simulation under this path. If not specified, current dir will be used"
    echo " ckptdir : dump/restore checkpoints to/from this path. If not specified, current dir will be used"

    echo " fullsystem: fullsystem config file"
    echo " fs_args : fullsystem config specific argument list: arg1 arg2 ..."
    echo " port : switch listen port"
    echo " switch : switch config file"
    echo " sw_args : switch config specific argument list: arg1 arg2 ..."
    echo " conf_args : common (for both fullsystem and switch) config argument list: arg1 arg2 ..."
    echo " gem5_exe : gem5 executable (full path required)"
    echo " m5_args : common m5 argument list (e.g. debug flags): arg1 arg2 ..."
    echo "Note: if no LSF slots allocation is found all processes are launched on the localhost."
}
|
||||
|
||||
# Process (optional) command line options
|
||||
# Argument lists collected for the fullsystem config (FS_ARGS), the switch
# config (SW_ARGS), both configs (CF_ARGS) and the gem5 binary (M5_ARGS).
FS_ARGS=" "
SW_ARGS=" "
CF_ARGS=" "
M5_ARGS=" "

# Abort with the usage message unless the current option has a value.
#   $1: option name, $2: number of command line words left (including $1)
# Without this check 'shift 2' on a trailing option fails without shifting
# anything and the parsing loop would never terminate.
require_value ()
{
    (($2 >= 2)) || { echo "Missing value for option: $1"; usage_func; exit 1; }
}

while (($# > 0))
do
    case "x$1" in
        x-debug)
            GEM5_DEBUG="-debug"
            shift 1
            ;;
        x-n|x-nodes)
            require_value "$1" $#
            NNODES=$2
            shift 2
            ;;
        x-r|x-rundir)
            require_value "$1" $#
            RUN_DIR=$2
            shift 2
            ;;
        x-c|x-ckptdir)
            require_value "$1" $#
            CKPT_DIR=$2
            shift 2
            ;;
        x-p|x-port)
            require_value "$1" $#
            SW_PORT=$2
            shift 2
            ;;
        x-s|x-switch)
            require_value "$1" $#
            SW_CONFIG=$2
            shift 2
            ;;
        x--sw-args)
            # Following unknown '-*' options are appended to SW_ARGS
            CUR_ARGS="SW_ARGS"
            shift 1
            ;;
        x-f|x-fullsystem)
            require_value "$1" $#
            FS_CONFIG=$2
            shift 2
            ;;
        x--fs-args)
            # Following unknown '-*' options are appended to FS_ARGS
            CUR_ARGS="FS_ARGS"
            shift 1
            ;;
        x--cf-args)
            # Following unknown '-*' options are appended to CF_ARGS
            CUR_ARGS="CF_ARGS"
            shift 1
            ;;
        x--m5-args)
            # Following unknown '-*' options are appended to M5_ARGS
            CUR_ARGS="M5_ARGS"
            shift 1
            ;;
        x-x)
            require_value "$1" $#
            GEM5_EXE=$2
            shift 2
            ;;
        x-*)
            # Pass-through option: only valid after one of the --*-args
            # selectors has chosen the list it belongs to.
            [ -n "$CUR_ARGS" ] || { echo "Unexpected arg: $1"; usage_func; exit -1; }
            # printf -v assigns through the variable name held in CUR_ARGS
            # without re-parsing the text the way eval would.
            case "x$2" in
                x-*|x)
                    # Next word is another option (or missing): flag only
                    printf -v "$CUR_ARGS" '%s' "${!CUR_ARGS} $1"
                    shift 1
                    ;;
                *)
                    # Next word is the value of this option
                    printf -v "$CUR_ARGS" '%s' "${!CUR_ARGS} $1 $2"
                    shift 2
                    ;;
            esac
            ;;
        *)
            echo "Unknown arg: $1"
            usage_func
            exit 1
            ;;
    esac
done
|
||||
|
||||
# Default values to use (in case they are not defined as command line options)
|
||||
DEFAULT_FS_CONFIG=$M5_PATH/configs/example/fs.py
DEFAULT_SW_CONFIG=$M5_PATH/configs/example/sw.py
DEFAULT_SW_PORT=2200

# Anything left unset (or empty) by the option parsing gets its fallback
FS_CONFIG=${FS_CONFIG:-$DEFAULT_FS_CONFIG}
SW_CONFIG=${SW_CONFIG:-$DEFAULT_SW_CONFIG}
SW_PORT=${SW_PORT:-$DEFAULT_SW_PORT}
NNODES=${NNODES:-2}
RUN_DIR=${RUN_DIR:-$(pwd)}
CKPT_DIR=${CKPT_DIR:-$(pwd)}

# Both config scripts must be regular files and gem5 must be executable
if [ ! -f "$FS_CONFIG" ]
then
    echo "FS config ${FS_CONFIG} not found"
    exit 1
fi
if [ ! -f "$SW_CONFIG" ]
then
    echo "Switch config ${SW_CONFIG} not found"
    exit 1
fi
if [ ! -x "$GEM5_EXE" ]
then
    echo "Executable ${GEM5_EXE} not found"
    exit 1
fi
# Create the run directory if needed; any mkdir output is discarded
mkdir -p $RUN_DIR > /dev/null 2>&1
|
||||
|
||||
declare -a SSH_PIDS
declare -a HOSTS
declare -a NCORES

# Find out which cluster hosts/slots are allocated or
# use localhost if there is no LSF allocation.
# We assume that allocated slots are listed in the LSB_MCPU_HOSTS
# environment variable in the form:
# host1 nslots1 host2 nslots2 ...
# (This is what LSF does by default.)
NH=0
[ "x$LSB_MCPU_HOSTS" != "x" ] || LSB_MCPU_HOSTS="127.0.0.1 $NNODES"
# The list alternates host names and slot counts: an empty $host means the
# next word is a host name, otherwise it is the slot count of that host.
host=""
for hc in $LSB_MCPU_HOSTS
do
    if [ "x$host" == "x" ]
    then
        host=$hc
        HOSTS+=($hc)
    else
        NCORES+=($hc)
        # NH accumulates the total number of slots over all hosts
        ((NH+=hc))
        host=""
    fi
done
# The number of requested gem5 processes must match the allocated slots.
# The message reports $NNODES; no variable named N is set at this point.
((NNODES==NH)) || { echo "(E) Number of cluster slots ($NH) and gem5 instances ($NNODES) differ"; exit -1; }
|
||||
|
||||
# function to clean up and abort if something goes wrong
|
||||
abort_func ()
{
    # Tear the whole run down: stop the switch, force-kill every gem5
    # process locally and on all hosts, stop the watchdog, then exit.
    echo
    echo "KILLED $(date)"
    # The switch goes first. That should trigger an exit for all connected
    # gem5 processes.
    if [ -n "$SW_PID" ]
    then
        kill $SW_PID 2>/dev/null
    fi
    sleep 20
    # Force-kill whatever gem5 processes are still around, in case the
    # switch-triggered exit did not reach them.
    bname=$(basename $GEM5_EXE)
    killall -q -s SIGKILL $bname
    for h in ${HOSTS[@]}
    do
        ssh $h killall -q -s SIGKILL $bname
    done
    sleep 5
    # Finally stop the watchdog, if one was started
    if [ -n "$WATCHDOG_PID" ]
    then
        kill $WATCHDOG_PID 2>/dev/null
    fi
    exit -1
}
|
||||
|
||||
# We need a watchdog to trigger full clean up if a gem5 process dies
|
||||
watchdog_func ()
{
    # Poll every 30 seconds whether the ssh sessions recorded in SSH_PIDS
    # and the switch process (SW_PID) are still alive. As soon as one of
    # them is gone, wait a grace period and run abort_func.
    while true
    do
        sleep 30
        ((NDEAD=0))
        for p in ${SSH_PIDS[*]}
        do
            kill -0 $p 2>/dev/null || ((NDEAD+=1))
        done
        # Probe the switch silently, same as the node probes above, so an
        # exited switch does not add kill error messages to the output.
        kill -0 $SW_PID 2>/dev/null || ((NDEAD+=1))
        if ((NDEAD>0))
        then
            # we may be in the middle of an orderly termination,
            # give it some time to complete before reporting abort
            sleep 60
            echo -n "(I) (some) gem5 process(es) exited"
            abort_func
        fi
    done
}
|
||||
|
||||
# This function launches the gem5 processes. The only purpose is to enable
|
||||
# launching gem5 processes under gdb control for debugging
|
||||
start_func ()
{
    # Start one gem5 process in the background.
    #   $1: log file suffix (output goes to $RUN_DIR/log.$1)
    #   $2: host to ssh to
    #   $3: environment assignments placed in front of the remote command
    #   remaining args: the gem5 command line
    # The process is started with '&', so the caller can read its PID
    # from $! right after this function returns.
    local N=$1
    local HOST=$2
    local ENV_ARGS=$3
    shift 3
    if [ -z "$GEM5_DEBUG" ]
    then
        # Normal run: launch through ssh, stdout and stderr go to the log
        ssh $HOST $ENV_ARGS "$@" &> $RUN_DIR/log.$N &
    else
        # Debug run: open an xterm running the command under gdb instead
        echo "DEBUG starting terminal..."
        MY_ARGS="$@"
        xterm -e "gdb --args $MY_ARGS" &
    fi
}
|
||||
|
||||
# block till the gem5 process starts
|
||||
connected ()
{
    # Block until a log file contains a given string.
    #   $1: log file to watch
    #   $2: string to look for in the log file
    #   $3: name of the process, used in the progress messages
    #   $4: PID that must stay alive while waiting
    # Exits the script if the PID is gone before the string shows up.
    FILE=$1
    STRING=$2
    echo -n "waiting for $3 to start "
    while true
    do
        # Give up as soon as the watched process has died
        if ! kill -0 $4
        then
            echo "Failed to start $3"
            exit -1
        fi
        if [[ -f "$FILE" ]] && grep -q "$STRING" "$FILE"
        then
            echo -e "\nnode #$3 started"
            break
        fi

        # Not there yet: print a progress dot every two seconds
        sleep 2
        echo -n "."
    done
}
|
||||
|
||||
# Trigger full clean up in case we are being killed by external signal
|
||||
trap 'abort_func' INT TERM

# Environment assignments handed to start_func for every gem5 process
ENV_ARGS="LD_LIBRARY_PATH=$LD_LIBRARY_PATH M5_PATH=$M5_PATH"

# Remove the switch log of a previous run, if any
rm $RUN_DIR/log.switch > /dev/null 2>&1

# Create the checkpoint directory of the switch process
mkdir -p $CKPT_DIR/m5out.switch > /dev/null 2>&1

# The switch process runs on the first host of the allocation
SW_HOST=${HOSTS[0]}
echo "launch switch gem5 process on $SW_HOST ..."
# Assemble the switch command line first; the unquoted expansions are
# split into words here exactly as they would be on a command line.
SW_CMD=($GEM5_EXE -d $RUN_DIR/m5out.switch
        $M5_ARGS
        $SW_CONFIG
        $SW_ARGS
        $CF_ARGS
        --checkpoint-dir=$CKPT_DIR/m5out.switch
        --is-switch
        --dist-size=$NNODES
        --dist-server-port=$SW_PORT)
start_func "switch" $SW_HOST "$ENV_ARGS" "${SW_CMD[@]}"
# Must directly follow start_func: $! is the process it put in background
SW_PID=$!

# Wait until the switch log reports the listening port
connected $RUN_DIR/log.switch "tcp_iface listening on port" "switch" $SW_PID
LINE=$(grep -r "tcp_iface listening on port" $RUN_DIR/log.switch)

# Split the log line on spaces and take its sixth word as the port. The
# switch may listen on a different port than requested if that was busy.
IFS=' ' read -ra ADDR <<< "$LINE"
SW_PORT=${ADDR[5]}
|
||||
|
||||
# Now launch all the gem5 processes with ssh.
|
||||
echo "START $(date)"
# n is the rank of the next gem5 process; ranks are handed out host by
# host, NCORES[i] processes on HOSTS[i].
n=0
for i in "${!HOSTS[@]}"
do
    h=${HOSTS[$i]}
    for ((j=0; j < ${NCORES[i]}; j++))
    do
        # Remove the log of a previous run, if any
        rm $RUN_DIR/log.$n > /dev/null 2>&1
        # Create the checkpoint directory of this rank
        mkdir -p $CKPT_DIR/m5out.$n > /dev/null 2>&1
        echo "starting gem5 on $h ..."
        # Assemble the command line of this rank; the unquoted expansions
        # are split into words here exactly as on a command line.
        NODE_CMD=($GEM5_EXE -d $RUN_DIR/m5out.$n
                  $M5_ARGS
                  $FS_CONFIG
                  $FS_ARGS
                  $CF_ARGS
                  --checkpoint-dir=$CKPT_DIR/m5out.$n
                  --dist
                  --dist-rank=$n
                  --dist-size=$NNODES
                  --dist-server-name=${HOSTS[0]}
                  --dist-server-port=$SW_PORT)
        start_func $n $h "$ENV_ARGS" "${NODE_CMD[@]}"
        # Must directly follow start_func: $! is the job it just started
        SSH_PIDS[$n]=$!
        ((n+=1))
    done
done
|
||||
|
||||
# Wait here if it is a debug session
|
||||
# In a debug session just wait for the switch process and leave
[ "x$GEM5_DEBUG" == "x" ] || { echo "DEBUG session"; wait $SW_PID; exit -1; }

# start watchdog to trigger complete abort (after a grace period) if any
# gem5 process dies
watchdog_func &
WATCHDOG_PID=$!

# Collect the exit statuses; NFAIL counts the processes that failed
NFAIL=0
for p in ${SSH_PIDS[*]}
do
    wait $p || ((NFAIL+=1))
done
wait $SW_PID || ((NFAIL+=1))

# all done, let's terminate the watchdog
kill $WATCHDOG_PID 2>/dev/null

if ((NFAIL==0))
then
    echo "EXIT $(date)"
else
    echo "ABORT $(date)"
    # Report the failure to the caller as well, not only in the output
    exit 1
fi
|
|
@ -40,33 +40,28 @@
|
|||
# Authors: Gabor Dozsa
|
||||
#
|
||||
#
|
||||
# This is an example boot script to use for muti gem5 runs. The important
|
||||
# task here is to extract the rank and size information from the kernel
|
||||
# boot args and use those to configure MAC/IP addresses and hostname.
|
||||
# Then we can kick off our (parallel) workload ...
|
||||
# This is an example boot script to use for dist-gem5 runs. The important
|
||||
# task here is to extract the rank and size information through the m5
|
||||
# initparam utility and use those to configure MAC/IP addresses and hostname.
|
||||
#
|
||||
# You are expected to costumize this scipt for your needs (e.g. change
|
||||
# You are expected to customize this script for your needs (e.g. change
|
||||
# the command at the end of the script to run your tests/workloads).
|
||||
|
||||
source /root/.bashrc
|
||||
echo "bootscript.rcS is running"
|
||||
|
||||
m='GEM5\_RANK=([0-9]+) GEM5\_SIZE=([0-9]+)'
|
||||
if [[ $(cat /proc/cmdline) =~ $m ]]
|
||||
then
|
||||
MY_RANK=${BASH_REMATCH[1]}
|
||||
MY_SIZE=${BASH_REMATCH[2]}
|
||||
else
|
||||
echo "(E) GEM5_RANK/GEM5_SIZE was not defined in bootargs, exiting ..."
|
||||
/sbin/m5 abort
|
||||
fi
|
||||
# Retrieve dist-gem5 rank and size parameters using the 'm5' utility
|
||||
MY_RANK=$(/sbin/m5 initparam dist-rank)
|
||||
[ $? = 0 ] || { echo "m5 initparam failed"; exit -1; }
|
||||
MY_SIZE=$(/sbin/m5 initparam dist-size)
|
||||
[ $? = 0 ] || { echo "m5 initparam failed"; exit -1; }
|
||||
|
||||
/bin/hostname node${MY_RANK}
|
||||
|
||||
# Keep MAC address assignment simple for now ...
|
||||
(($MY_RANK>97)) && { echo "(E) Rank must be less than 98"; /sbin/m5 abort; }
|
||||
((MY_ADDR=MY_RANK+2))
|
||||
if (($MY_ADDR<10))
|
||||
(($MY_RANK > 97)) && { echo "(E) Rank must be less than 98"; /sbin/m5 abort; }
|
||||
((MY_ADDR = MY_RANK + 2))
|
||||
if (($MY_ADDR < 10))
|
||||
then
|
||||
MY_ADDR_PADDED=0${MY_ADDR}
|
||||
else
|
||||
|
@ -78,45 +73,35 @@ fi
|
|||
|
||||
/sbin/ifconfig -a
|
||||
|
||||
# Prepare host lists for mpirun
|
||||
MY_MPI_HOSTS="192.168.0.2"
|
||||
for ((i=1; i<MY_SIZE; i++))
|
||||
do
|
||||
MY_MPI_HOSTS+=",192.168.0.$((i+2))"
|
||||
done
|
||||
echo "Hello from $MY_RANK of $MY_SIZE"
|
||||
|
||||
# Check that Ethernet links work, then take a checkpoint
|
||||
if [ "$MY_RANK" == "0" ]
|
||||
# Now that our network interface is configured we can use the usual commands to
|
||||
# contact the other systems, e.g. let's try to ping a "neighbour" system
|
||||
if ((MY_RANK < MY_SIZE - 1))
|
||||
then
|
||||
OLDIFS=$IFS
|
||||
IFS=","
|
||||
for i in $MY_MPI_HOSTS
|
||||
do
|
||||
ping -c 1 $i || { echo "ping $i failed, exiting ..."; exit -1; }
|
||||
ssh $i hostname || { echo "ssh $i failed, exiting ..."; exit -1; }
|
||||
done
|
||||
IFS=$OLDIFS
|
||||
/sbin/m5 checkpoint
|
||||
ping -c 1 192.168.0.$((MY_ADDR + 1))
|
||||
else
|
||||
ping -c 1 192.168.0.2
|
||||
fi
|
||||
|
||||
# --------------------------------------------
|
||||
# ------ Start your tests below ... ---------
|
||||
# --------------------------------------------
|
||||
|
||||
if [ "$MY_RANK" == "0" ]
|
||||
then
|
||||
echo "MPI test"
|
||||
#mpirun -H 192.168.0.3,192.168.0.2 hostname
|
||||
cd /benchmarks
|
||||
mpirun -H $MY_MPI_HOSTS lulesh/lulesh2.0-mpi -s 5
|
||||
# Trigger an immediate checkpoint at the next sync (by passing a non-zero
|
||||
# delay param to m5 ckpt)
|
||||
/sbin/m5 checkpoint 1
|
||||
echo "A real multi node workload might start here ..."
|
||||
# Trigger an immediate exit at the next sync (by passing a non-zero delay
|
||||
# param to m5 exit)
|
||||
/sbin/m5 exit 1
|
||||
else
|
||||
# This is to avoid other (rank!=0) gem5 processes exiting
|
||||
# before the test (started by rank 0) completes. When rank 0 completes the
|
||||
# test it will exit and that will trigger a notification to all the peer
|
||||
# gem5 peocesses to stop the simulation.
|
||||
# gem5 processes to stop the simulation.
|
||||
echo "sleep forever..."
|
||||
while /bin/true
|
||||
do
|
||||
sleep 5
|
||||
sleep 5
|
||||
done
|
||||
fi
|
82
util/dist/test/test-2nodes-AArch64.sh
vendored
Normal file
82
util/dist/test/test-2nodes-AArch64.sh
vendored
Normal file
|
@ -0,0 +1,82 @@
|
|||
#! /bin/bash
|
||||
|
||||
#
|
||||
# Copyright (c) 2015 ARM Limited
|
||||
# All rights reserved
|
||||
#
|
||||
# The license below extends only to copyright in the software and shall
|
||||
# not be construed as granting a license to any other intellectual
|
||||
# property including but not limited to intellectual property relating
|
||||
# to a hardware implementation of the functionality of the software
|
||||
# licensed hereunder. You may use the software subject to the license
|
||||
# terms below provided that you ensure that this notice is replicated
|
||||
# unmodified and in its entirety in all distributions of the software,
|
||||
# modified or unmodified, in source code or in binary form.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met: redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer;
|
||||
# redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution;
|
||||
# neither the name of the copyright holders nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# Authors: Gabor Dozsa
|
||||
#
|
||||
#
|
||||
# This is an example script to start a dist gem5 simulation using
|
||||
# two AArch64 systems. It also uses the example
|
||||
# dist gem5 bootscript util/dist/test/simple_bootscript.rcS that will
|
||||
# run the linux ping command to check if we can see the peer system
|
||||
# connected via the simulated Ethernet link.
|
||||
|
||||
# Root of the gem5 tree, three levels above the directory of this script.
# Resolved with cd/pwd so that it is correct no matter whether the script
# is invoked through a relative or an absolute path.
GEM5_DIR=$(cd "$(dirname "$0")/../../.." && pwd)

# Disk image, kernel and device tree are looked up under $M5_PATH
IMG=$M5_PATH/disks/aarch64-ubuntu-trusty-headless.img
VMLINUX=$M5_PATH/binaries/vmlinux.aarch64.20140821
DTB=$M5_PATH/binaries/vexpress.aarch64.20140821.dtb

FS_CONFIG=$GEM5_DIR/configs/example/fs.py
SW_CONFIG=$GEM5_DIR/configs/example/sw.py
GEM5_EXE=$GEM5_DIR/build/ARM/gem5.opt

BOOT_SCRIPT=$GEM5_DIR/util/dist/test/simple_bootscript.rcS
GEM5_DIST_SH=$GEM5_DIR/util/dist/gem5-dist.sh

DEBUG_FLAGS="--debug-flags=DistEthernet"
# Uncomment to pass -r1 to both config scripts via --cf-args
#CHKPT_RESTORE="-r1"

NNODES=2

# --m5-args, --fs-args and --cf-args select the argument list that the
# options following them are appended to by gem5-dist.sh.
$GEM5_DIST_SH -n $NNODES \
    -x $GEM5_EXE \
    -s $SW_CONFIG \
    -f $FS_CONFIG \
    --m5-args \
    $DEBUG_FLAGS \
    --fs-args \
    --cpu-type=atomic \
    --num-cpus=1 \
    --machine-type=VExpress_EMM64 \
    --disk-image=$IMG \
    --kernel=$VMLINUX \
    --dtb-filename=$DTB \
    --script=$BOOT_SCRIPT \
    --cf-args \
    $CHKPT_RESTORE
|
||||
|
|
@ -1,275 +0,0 @@
|
|||
#! /bin/bash
|
||||
|
||||
#
|
||||
# Copyright (c) 2015 ARM Limited
|
||||
# All rights reserved
|
||||
#
|
||||
# The license below extends only to copyright in the software and shall
|
||||
# not be construed as granting a license to any other intellectual
|
||||
# property including but not limited to intellectual property relating
|
||||
# to a hardware implementation of the functionality of the software
|
||||
# licensed hereunder. You may use the software subject to the license
|
||||
# terms below provided that you ensure that this notice is replicated
|
||||
# unmodified and in its entirety in all distributions of the software,
|
||||
# modified or unmodified, in source code or in binary form.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met: redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer;
|
||||
# redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution;
|
||||
# neither the name of the copyright holders nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# Authors: Gabor Dozsa
|
||||
|
||||
|
||||
# This is a wrapper script to run a multi gem5 simulations.
|
||||
# See the usage_func() below for hints on how to use it. Also,
|
||||
# there are some examples in the util/multi directory (e.g.
|
||||
# see util/multi/test-2nodes-AArch64.sh)
|
||||
#
|
||||
#
|
||||
# Allocated hosts/cores are assumed to be listed in the LSB_MCPU_HOSTS
|
||||
# environment variable (which is what LSF does by default).
|
||||
# E.g. LSB_MCPU_HOSTS=\"hname1 2 hname2 4\" means we have altogether 6 slots
|
||||
# allocated to launch the gem5 processes, 2 of them are on host hname1
|
||||
# and 4 of them are on host hname2.
|
||||
# If LSB_MCPU_HOSTS environment variable is not defined then we launch all
|
||||
# processes on the localhost.
|
||||
#
|
||||
# Each gem5 process are passed in a unique rank ID [0..N-1] via the kernel
|
||||
# boot params. The total number of gem5 processes is also passed in.
|
||||
# These values can be used in the boot script to configure the MAC/IP
|
||||
# addresses - among other things (see util/multi/bootscript.rcS).
|
||||
#
|
||||
# Each gem5 process will create an m5out.$GEM5_RANK directory for
|
||||
# the usual output files. Furthermore, there will be a separate log file
|
||||
# for each ssh session (we use ssh to start gem5 processes) and one for
|
||||
# the server. These are called log.$GEM5_RANK and log.server.
|
||||
#
|
||||
|
||||
|
||||
# print help
|
||||
usage_func ()
|
||||
{
|
||||
echo "Usage:$0 [-debug] [-n nnodes] [-s server] [-p port] gem5_exe gem5_args"
|
||||
echo " -debug : debug mode (start gem5 in gdb)"
|
||||
echo " nnodes : number of gem5 processes"
|
||||
echo " server : message server executable"
|
||||
echo " port : message server listen port"
|
||||
echo " gem5_exe : gem5 executable (full path required)"
|
||||
echo " gem5_args: usual gem5 arguments ( m5 options, config script options)"
|
||||
echo "Note: if no LSF slots allocation is found all proceses are launched on the localhost."
|
||||
}
|
||||
|
||||
|
||||
# Process (optional) command line options
|
||||
|
||||
while true
|
||||
do
|
||||
case "x$1" in
|
||||
x-n|x-nodes)
|
||||
NNODES=$2
|
||||
shift 2
|
||||
;;
|
||||
x-s|x-server)
|
||||
TCP_SERVER=$2
|
||||
shift 2
|
||||
;;
|
||||
x-p|x-port)
|
||||
SERVER_PORT=$2
|
||||
shift 2
|
||||
;;
|
||||
x-debug)
|
||||
GEM5_DEBUG="-debug"
|
||||
shift 1
|
||||
;;
|
||||
*)
|
||||
break
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# The remaining command line args must be the usual gem5 command
|
||||
(($# < 2)) && { usage_func; exit -1; }
|
||||
GEM5_EXE=$1
|
||||
shift
|
||||
GEM5_ARGS="$*"
|
||||
|
||||
# Default values to use (in case they are not defined as command line options)
|
||||
DEFAULT_TCP_SERVER=$(dirname $0)/../../util/multi/tcp_server
|
||||
DEFAULT_SERVER_PORT=2200
|
||||
|
||||
[ -z "$TCP_SERVER" ] && TCP_SERVER=$DEFAULT_TCP_SERVER
|
||||
[ -z "$SERVER_PORT" ] && SERVER_PORT=$DEFAULT_SERVER_PORT
|
||||
[ -z "$NNODES" ] && NNODES=2
|
||||
|
||||
|
||||
# Check if all the executables we need exist
|
||||
[ -x "$TCP_SERVER" ] || { echo "Executable ${TCP_SERVER} not found"; exit 1; }
|
||||
[ -x "$GEM5_EXE" ] || { echo "Executable ${GEM5_EXE} not found"; exit 1; }
|
||||
|
||||
|
||||
declare -a SSH_PIDS
|
||||
declare -a HOSTS
|
||||
declare -a NCORES
|
||||
|
||||
# Find out which cluster hosts/slots are allocated or
|
||||
# use localhost if there is no LSF allocation.
|
||||
# We assume that allocated slots are listed in the LSB_MCPU_HOSTS
|
||||
# environment variable in the form:
|
||||
# host1 nslots1 host2 nslots2 ...
|
||||
# (This is what LSF does by default.)
|
||||
NH=0
|
||||
[ "x$LSB_MCPU_HOSTS" != "x" ] || LSB_MCPU_HOSTS="localhost $NNODES"
|
||||
host=""
|
||||
for hc in $LSB_MCPU_HOSTS
|
||||
do
|
||||
if [ "x$host" == "x" ]
|
||||
then
|
||||
host=$hc
|
||||
HOSTS+=($hc)
|
||||
else
|
||||
NCORES+=($hc)
|
||||
((NH+=hc))
|
||||
host=""
|
||||
fi
|
||||
done
|
||||
((NNODES==NH)) || { echo "(E) Number of cluster slots ($NH) and gem5 instances ($N) differ"; exit -1; }
|
||||
#echo "hosts: ${HOSTS[@]}"
|
||||
#echo "hosts: ${NCORES[@]}"
|
||||
#echo ${#HOSTS[@]}
|
||||
|
||||
|
||||
# function to clean up and abort if something goes wrong
|
||||
abort_func ()
|
||||
{
|
||||
echo
|
||||
echo "KILLED $(date)"
|
||||
# (try to) kill all gem5 processes on all hosts
|
||||
bname=$(basename $GEM5_EXE)
|
||||
killall -q $bname
|
||||
for h in ${HOSTS[@]}
|
||||
do
|
||||
ssh $h killall -q $bname
|
||||
done
|
||||
sleep 3
|
||||
# kill the message server and the watchdog
|
||||
[ "x$SERVER_PID" != "x" ] && kill $SERVER_PID 2>/dev/null
|
||||
[ "x$WATCHDOG_PID" != "x" ] && kill $WATCHDOG_PID 2>/dev/null
|
||||
exit -1
|
||||
}
|
||||
|
||||
|
||||
# We need a watchdog to trigger full clean up if a gem5 process dies
|
||||
watchdog_func ()
|
||||
{
|
||||
while true
|
||||
do
|
||||
sleep 30
|
||||
((NDEAD=0))
|
||||
for p in ${SSH_PIDS[*]}
|
||||
do
|
||||
kill -0 $p 2>/dev/null || ((NDEAD+=1))
|
||||
done
|
||||
kill -0 $SERVER_PID || ((NDEAD+=1))
|
||||
if ((NDEAD>0))
|
||||
then
|
||||
# we may be in the middle of an orderly termination,
|
||||
# give it some time to complete before reporting abort
|
||||
sleep 60
|
||||
echo -n "(I) (some) gem5 process(es) exited"
|
||||
abort_func
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
# This function launches the gem5 processes. We use it only to allow launching
|
||||
# gem5 processes under gdb control (in the foreground) for debugging
|
||||
start_func ()
|
||||
{
|
||||
local N=$1
|
||||
local HOST=$2
|
||||
local ENV_ARGS=$3
|
||||
shift 3
|
||||
if [ "x$GEM5_DEBUG" != "x" ]
|
||||
then
|
||||
gdb --args "$@"
|
||||
else
|
||||
ssh $HOST $ENV_ARGS "$@" &>log.$N &
|
||||
fi
|
||||
}
|
||||
|
||||
|
||||
# Trigger full clean up in case we are being killed by external signal
|
||||
trap 'abort_func' INT TERM
|
||||
|
||||
# env args to be passed explicitly to gem5 processes started via ssh
|
||||
ENV_ARGS="LD_LIBRARY_PATH=$LD_LIBRARY_PATH M5_PATH=$M5_PATH"
|
||||
|
||||
# launch the mesage server and check if it has started okay
|
||||
$TCP_SERVER $GEM5_DEBUG $NNODES $SERVER_PORT &>log.server &
|
||||
SERVER_PID=$!
|
||||
sleep 2
|
||||
kill -0 $SERVER_PID || { echo "Failed to start message server"; exit -1; }
|
||||
|
||||
# Now launch all the gem5 processes with ssh.
|
||||
echo "START $(date)"
|
||||
n=0
|
||||
for ((i=0; i < ${#HOSTS[@]}; i++))
|
||||
do
|
||||
h=${HOSTS[$i]}
|
||||
for ((j=0; j < ${NCORES[i]}; j++))
|
||||
do
|
||||
echo "starting gem5 on $h ..."
|
||||
start_func $n $h "$ENV_ARGS" $GEM5_EXE -d $(pwd)/m5out.$n $GEM5_ARGS \
|
||||
--multi \
|
||||
--multi-rank=$n \
|
||||
--multi-server-name=${HOSTS[0]} \
|
||||
--multi-server-port=$SERVER_PORT \
|
||||
--testsys-toplevel-LinuxArmSystem.boot_osflags="\"GEM5_RANK=$n GEM5_SIZE=$NNODES\""
|
||||
SSH_PIDS[$n]=$!
|
||||
((n+=1))
|
||||
done
|
||||
done
|
||||
|
||||
[ "x$GEM5_DEBUG" == "x" ] || { kill $SERVER_PID; echo "DEBUG exit"; exit -1; }
|
||||
|
||||
# start watchdog to trigger complete abort (after a grace period) if any
|
||||
# gem5 process dies
|
||||
watchdog_func &
|
||||
WATCHDOG_PID=$!
|
||||
|
||||
# wait for exit statuses
|
||||
((NFAIL=0))
|
||||
for p in ${SSH_PIDS[*]}
|
||||
do
|
||||
wait $p || ((NFAIL+=1))
|
||||
done
|
||||
wait $SERVER_PID || ((NFAIL+=1))
|
||||
|
||||
# all done, let's terminate the watchdog
|
||||
kill $WATCHDOG_PID 2>/dev/null
|
||||
|
||||
if ((NFAIL==0))
|
||||
then
|
||||
echo "EXIT $(date)"
|
||||
else
|
||||
echo "ABORT $(date)"
|
||||
fi
|
Loading…
Reference in a new issue