#!/usr/bin/env python2
# Copyright (c) 2005 The Regents of The University of Michigan
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Authors: Ali Saidi
#          Nathan Binkert
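
# This script submits batch jobs to PBS: it reads a job description file
# (-j, default is <rootdir>/Test.py), selects the jobs whose names match the
# given regular expressions, and hands each one to qsub with job.py (located
# alongside this script) as the job script.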

import os, os.path, re, socket, sys

from os import environ as env, listdir
from os.path import basename, isdir, isfile, islink, join as joinpath, normpath
from filecmp import cmp as filecmp
from shutil import copy

def nfspath(dir):
    if dir.startswith('/.automount/'):
        dir = '/n/%s' % dir[12:]
    elif not dir.startswith('/n/'):
        dir = '/n/%s%s' % (socket.gethostname().split('.')[0], dir)
    return dir
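
# Illustrative examples (hostnames here are hypothetical):
#   nfspath('/.automount/zizzer/z/foo')  -> '/n/zizzer/z/foo'
#   nfspath('/z/foo') on host 'zeep'     -> '/n/zeep/z/foo'
#   nfspath('/n/zizzer/z/foo')           -> '/n/zizzer/z/foo'  (unchanged)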

def syncdir(srcdir, destdir):
    srcdir = normpath(srcdir)
    destdir = normpath(destdir)
    if not isdir(destdir):
        sys.exit('destination directory "%s" does not exist' % destdir)

    for root, dirs, files in os.walk(srcdir):
        root = normpath(root)
        prefix = os.path.commonprefix([root, srcdir])
        root = root[len(prefix):]
        if root.startswith('/'):
            root = root[1:]
        for rem in [ d for d in dirs if d.startswith('.') or d == 'SCCS']:
            dirs.remove(rem)

        for entry in dirs:
            newdir = joinpath(destdir, root, entry)
            if not isdir(newdir):
                os.mkdir(newdir)
                print 'mkdir', newdir

        for i,d in enumerate(dirs):
            if islink(joinpath(srcdir, root, d)):
                dirs[i] = joinpath(d, '.')

        for entry in files:
            dest = normpath(joinpath(destdir, root, entry))
            src = normpath(joinpath(srcdir, root, entry))
            if not isfile(dest) or not filecmp(src, dest):
                print 'copy %s %s' % (src, dest)
                copy(src, dest)
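
# syncdir() mirrors one tree into an existing destination, e.g. (paths are
# illustrative) syncdir('/n/host/linkdir', '/n/host/base') creates missing
# subdirectories and copies any file that is absent or differs, while
# skipping dot-directories and SCCS directories.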

progpath = nfspath(sys.path[0])
progname = basename(sys.argv[0])
usage = """\
Usage:
    %(progname)s [-c] [-C] [-d] [-e] [-f] [-j <jobfile>] [-n] [-q <queue>] [-R] [-t <type>] [-v] <regexp>
        -c           clean directory if job can be run
        -C           submit the checkpointing runs
        -d           make jobs depend on the completion of the checkpoint runs
        -e           only echo pbs command info, don't actually send the job
        -f           force the job to run regardless of state
        -q <queue>   submit job to the named queue
        -j <jobfile> specify the jobfile (default is <rootdir>/Test.py)
        -n           don't update the base directory from the link directory
        -R           when submitting checkpoint runs (-C), also submit the run jobs
        -t <type>    submit jobs to nodes of the given type (default is FAST)
        -v           be verbose

    %(progname)s [-j <jobfile>] -l [-v] <regexp>
        -j <jobfile> specify the jobfile (default is <rootdir>/Test.py)
        -l           list job names, don't submit
        -v           be verbose (list job parameters)

    %(progname)s -h
        -h           display this help
""" % locals()

try:
    import getopt
    opts, args = getopt.getopt(sys.argv[1:], '-Ccdefhj:lnq:Rt:v')
except getopt.GetoptError:
    sys.exit(usage)

depend = False
clean = False
onlyecho = False
exprs = []
force = False
listonly = False
queue = ''
verbose = False
jfile = 'Test.py'
docpts = False
doruns = True
runflag = False
node_type = 'FAST'
update = True

for opt,arg in opts:
    if opt == '-C':
        docpts = True
    if opt == '-c':
        clean = True
    if opt == '-d':
        depend = True
    if opt == '-e':
        onlyecho = True
    if opt == '-f':
        force = True
    if opt == '-h':
        print usage
        sys.exit(0)
    if opt == '-j':
        jfile = arg
    if opt == '-l':
        listonly = True
    if opt == '-n':
        update = False
    if opt == '-q':
        queue = arg
    if opt == '-R':
        runflag = True
    if opt == '-t':
        node_type = arg
    if opt == '-v':
        verbose = True

if docpts:
    doruns = runflag
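
# Note: with -C alone only the checkpoint jobs are submitted; adding -R
# submits the run jobs as well.  Without -C, the run jobs are always
# submitted (doruns defaults to True).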

for arg in args:
    exprs.append(re.compile(arg))

import jobfile, pbs
from job import JobDir, date

conf = jobfile.JobFile(jfile)
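
# The rest of the script relies on conf.rootdir, conf.linkdir and
# conf.basedir for the directory layout, and on conf.alljobs(),
# conf.checkpoints() and conf.jobs() to enumerate the job descriptions.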

if update and not listonly and not onlyecho and isdir(conf.linkdir):
    if verbose:
        print 'Checking for outdated files in Link directory'
    if not isdir(conf.basedir):
        os.mkdir(conf.basedir)
    syncdir(conf.linkdir, conf.basedir)

jobnames = {}
joblist = []

if docpts and doruns:
    gen = conf.alljobs()
elif docpts:
    gen = conf.checkpoints()
elif doruns:
    gen = conf.jobs()

for job in gen:
    if job.name in jobnames:
        continue
    # remember names we have already seen so each job is handled only once
    jobnames[job.name] = True

    if exprs:
        for expr in exprs:
            if expr.match(job.name):
                joblist.append(job)
                break
    else:
        joblist.append(job)

if listonly:
    if verbose:
        for job in joblist:
            job.printinfo()
    else:
        for job in joblist:
            print job.name
    sys.exit(0)

if not onlyecho:
    newlist = []
    for job in joblist:
        jobdir = JobDir(joinpath(conf.rootdir, job.name))
        if jobdir.exists():
            if not force:
                status = jobdir.getstatus()
                if status == 'queued':
                    continue

                if status == 'running':
                    continue

                if status == 'success':
                    continue

            if not clean:
                sys.exit('job directory %s not clean!' % jobdir)

            jobdir.clean()
        newlist.append(job)
    joblist = newlist

class NameHack(object):
    def __init__(self, host='pbs.pool', port=24465):
        self.host = host
        self.port = port
        self.socket = None

    def setname(self, jobid, jobname):
        try:
            jobid = int(jobid)
        except ValueError:
            jobid = int(jobid.strip().split('.')[0])

        jobname = jobname.strip()
        # since pbs can handle jobnames of 15 characters or less,
        # don't use the raj hack.
        if len(jobname) <= 15:
            return

        if self.socket is None:
            import socket
            self.socket = socket.socket()
            # Connect to pbs.pool and send the jobid/jobname pair to port
            # 24465 (Raj didn't realize that there are only 64k ports and
            # setup inetd to point to port 90001)
            self.socket.connect((self.host, self.port))

        self.socket.send("%s %s\n" % (jobid, jobname))
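
# Example (values are illustrative): namehack.setname('12345.simpool', name)
# reduces the jobid to 12345 and, only when name is longer than 15
# characters, sends "12345 <name>" to pbs.pool:24465, presumably so a
# listener there can record the full job name that PBS would truncate.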
namehack = NameHack()
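
# For each remaining job, build a qsub command that runs job.py from this
# script's directory with ROOTDIR and JOBNAME in the environment; unless -e
# was given, create the job directory, submit the command, and record the
# PBS jobid and queue time in the job directory.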
for job in joblist:
    jobdir = JobDir(joinpath(conf.rootdir, job.name))
    if depend:
        cptdir = JobDir(joinpath(conf.rootdir, job.checkpoint.name))
        cptjob = cptdir.readval('.pbs_jobid')

    if not onlyecho:
        jobdir.create()

    print 'Job name: %s' % job.name
    print 'Job directory: %s' % jobdir

    qsub = pbs.qsub()
    qsub.pbshost = 'simpool.eecs.umich.edu'
    qsub.stdout = jobdir.file('jobout')
    qsub.name = job.name[:15]
    qsub.join = True
    qsub.node_type = node_type
    qsub.env['ROOTDIR'] = conf.rootdir
    qsub.env['JOBNAME'] = job.name
    if depend:
        qsub.afterok = cptjob
    if queue:
        qsub.queue = queue
    qsub.build(joinpath(progpath, 'job.py'))

    if verbose:
        print 'PBS Command: %s' % qsub.command

    if not onlyecho:
        ec = qsub.do()
        if ec == 0:
            jobid = qsub.result
            print 'PBS Jobid: %s' % jobid
            namehack.setname(jobid, job.name)
            queued = date()
            jobdir.echofile('.pbs_jobid', jobid)
            jobdir.echofile('.pbs_jobname', job.name)
            jobdir.echofile('.queued', queued)
            jobdir.setstatus('queued on %s' % queued)
        else:
            print 'PBS Failed'