-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpysub
More file actions
103 lines (96 loc) · 5.5 KB
/
pysub
File metadata and controls
103 lines (96 loc) · 5.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/usr/bin/env python
import subprocess
import argparse
import sys
import os
def subwrap(call):
    '''
    Run a shell command and capture its standard output for python processing.
    Input:
    * call - the shell command to run and acquire the stdout of
    Returns the command's stdout, utf-8 decoded with trailing whitespace removed
    '''
    #shell=True is intentional here - callers pass full pipelines (e.g. "a | grep b")
    completed = subprocess.run(call, shell=True, stdout=subprocess.PIPE)
    decoded = completed.stdout.decode('utf-8')
    return decoded.rstrip()
def parse_args():
    '''
    Parse and sanity-check command line arguments for building a slurm script.
    The various slurm options are thankfully clear, so we match them pretty
    close - the only real differences are accepting a log file folder, along
    with gpus as a number and time as number of hours.
    Returns the argparse namespace with partition, account and log paths
    filled in.
    Raises ValueError if neither -c nor -g is provided.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument("-p", dest="p", default="icelake-himem", help="The partition to use, in practice icelake-himem for CPU or ampere for GPU. Default: icelake-himem")
    parser.add_argument("-A", dest="A", help="The account to use. Default: Your (POLONIUS-)TEICHMANN-SL3 account for CPU or GPU")
    parser.add_argument("-N", dest="N", type=int, default=1, help="Number of nodes to request. Default: 1")
    parser.add_argument("-n", dest="n", type=int, default=1, help="Number of tasks, relevant for MPI parallelisation. Default: 1")
    parser.add_argument("-c", dest="c", type=int, help="CPU jobs - number of CPUs to request (per task, you get n*c total cores). Translates to memory, with ~6.6GB of RAM per core on icelake-himem. Up to 72 cores per icelake-himem node.")
    parser.add_argument("-g", dest="g", type=int, help="GPU jobs - number of GPUs to request. A GPU comes with 32 cores and 250GB RAM. Up to four GPUs per node.")
    parser.add_argument("-t", dest="t", type=int, required=True, help="Number of hours of wall time to request for the job.")
    parser.add_argument("-J", dest="J", type=str, required=True, help="Job name.")
    parser.add_argument("-l", dest="l", type=str, default=".", help="Directory to write the logs to.")
    parser.add_argument("-e", dest="e", help="Logging error file. Default: JobName.eJobID in the log directory")
    parser.add_argument("-o", dest="o", help="Logging output file. Default: JobName.oJobID in the log directory")
    parser.add_argument("-conda", dest="conda", help="Optional, Conda environment to activate prior to running command")
    parser.add_argument("-postjob", dest="postjob", action="store_true", help="Flag, if provided will spawn follow-up job that will augment the output file with sacct usage statistics for job")
    parser.add_argument("command", nargs="+")
    args = parser.parse_args()
    #sanity check the arguments. are we GPU or not?
    if args.g is not None:
        #we are GPU. set partition to ampere
        args.p = "ampere"
    elif args.c is not None:
        #we're CPU. override partition with icelake-himem if set to ampere
        if args.p == "ampere":
            args.p = "icelake-himem"
    else:
        raise ValueError("one of -c or -g needs to be set")
    #set up default account if needed
    if args.A is None:
        #some folks are TEICHMANN-SL3 and some are POLONIUS-TEICHMANN-SL3
        #check which they are via mybalance and set the default accordingly
        queue_prefix = ""
        if subwrap("mybalance | grep POLONIUS | wc -l") != "0":
            queue_prefix = "POLONIUS-"
        #the queue suffix depends entirely on the partition
        if args.p == "ampere":
            queue_suffix = "GPU"
        else:
            queue_suffix = "CPU"
        args.A = queue_prefix+"TEICHMANN-SL3-"+queue_suffix
    #fill in log file defaults independently - previously both -e and -o had
    #to be missing for either default to apply, so providing just one of them
    #left the other as None and crashed main() when building the header
    if args.e is None:
        args.e = os.path.join(args.l, "%x.e%A")
    if args.o is None:
        args.o = os.path.join(args.l, "%x.o%A")
    return args
def main():
    '''
    Assemble a slurm submission script from the parsed arguments and write it
    to stdout, line by line.
    '''
    args = parse_args()
    #the #SBATCH header, mirroring the parsed options
    lines = [
        "#!/bin/bash",
        f"#SBATCH -p {args.p}",
        f"#SBATCH -A {args.A}",
        f"#SBATCH -N {args.N}",
        f"#SBATCH -n {args.n}",
    ]
    #GPU specifies GPUs, CPU specifies cores
    if args.p == "ampere":
        lines.append(f"#SBATCH --gres=gpu:{args.g}")
    else:
        lines.append(f"#SBATCH -c {args.c}")
    #args.t is number of hours, turn to hh:mm:ss for slurm to be happy
    lines.append(f"#SBATCH -t {args.t}:00:00")
    lines.append(f"#SBATCH -J {args.J}")
    lines.append(f"#SBATCH -e {args.e}")
    lines.append(f"#SBATCH -o {args.o}")
    #blank line between the header and the commands proper
    lines.append("")
    #there may or may not be a postjob
    if args.postjob:
        #park postjob's logs into dumpster location on private RDS
        lines.append("mkdir -p ~/rds/hpc-work/postjob_sacct")
        #the slurm job sees its own ID in SLURM_JOB_ID
        #the only reliable way to pull the stdout file is to use scontrol on a running job
        #direct the postjob to both of these
        lines.append('LOGFILE=$(scontrol show job ${SLURM_JOB_ID} | grep StdOut | cut -f 2 -d "=")')
        lines.append('sbatch -d afterany:${SLURM_JOB_ID} -p icelake-himem -A TEICHLAB-SL2-CPU -c 1 -t 0:05:00 -J postjob_sacct -o ~/rds/hpc-work/postjob_sacct/${SLURM_JOB_ID}.%j.out --wrap="bash /rfs/project/rfs-iCNyzSAaucw/ktp27/csd3-scripts/postjob_sacct.sh ${SLURM_JOB_ID} ${LOGFILE}"')
    #there may or may not be a conda
    if args.conda:
        lines.append('eval "$(conda shell.bash hook)"')
        lines.append(f"conda activate {args.conda}")
    lines.append(" ".join(args.command))
    for line in lines:
        sys.stdout.write(line + "\n")
if __name__ == "__main__":
    main()