-
Notifications
You must be signed in to change notification settings - Fork 21
/
Copy pathrestart_runs.py
170 lines (146 loc) · 5.93 KB
/
restart_runs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
#%% Imports
import os
import shutil
import subprocess
import argparse
import pandas as pd
from glob import glob
#%% Argument inputs
parser = argparse.ArgumentParser(description='Restart failed runs on the HPC')
parser.add_argument('batch_name', type=str, help='batch name (case prefix) to search for')
parser.add_argument('--copy_cplex', '-c', type=int, default=0,
help='Which cplex.opt file to copy (or 0 for none)')
parser.add_argument('--copy_srun_template', '-s', action='store_true',
help='Copy current srun_template.sh to sbatch file')
parser.add_argument('--force', '-f', action='store_true',
help='Proceed without double-checking')
parser.add_argument('--more_copyfiles', '-m', type=str, default='',
help=',-delimited list of additional files to copy from reeds_path')
parser.add_argument('--include_finished', '-i', action='store_true',
help='Also restart finished runs (e.g. to redo postprocessing)')
args = parser.parse_args()
batch_name = args.batch_name
copy_cplex = args.copy_cplex
copy_srun_template = args.copy_srun_template
force = args.force
more_copyfiles = [i for i in args.more_copyfiles.split(',') if len(i)]
include_finished = args.include_finished
# #%% Inputs for debugging
# batch_name = 'v20231113_yamM0'
# copy_cplex = 1
# copy_srun_template = True
# force = True
# more_copyfiles = ['e_report.gms']
# include_finished = False
###### Procedure
#%% Shared parameters
reeds_path = os.path.dirname(os.path.abspath(__file__))
#%% Get all runs
runs_all = sorted(glob(os.path.join(reeds_path,'runs',batch_name+'*')))
### Identify finished runs
runs_finished = [
i for i in runs_all
if os.path.exists(os.path.join(i, 'outputs', 'reeds-report', 'report.xlsx'))
]
### Keep unfinished runs
runs_unfinished = [i for i in runs_all if i not in runs_finished]
### Get failed runs by identifying and excluding active runs
sq = f'squeue -u {os.environ["USER"]} -o "%.200j"'
sqout = subprocess.run(sq, capture_output=True, shell=True)
runs_running = [
os.path.splitext(i.decode())[0] for i in sqout.stdout.split()
if i.decode().startswith(batch_name)
]
runs_failed = [
i for i in (runs_all if include_finished else runs_unfinished)
if os.path.basename(i) not in runs_running
]
### Take a look
print('unfinished:', len(runs_unfinished))
print('running:', len(runs_running))
print('failed:', len(runs_failed))
#%% Double check
if not force:
for i in runs_failed:
print(os.path.basename(i))
print(f'Restarting the {len(runs_failed)} runs listed above.')
confirm_local = str(input('Proceed? [y]/n: ') or 'y')
if confirm_local not in ['y','Y','yes','Yes','YES']:
quit()
#%% Get the cplex file to copy
if copy_cplex:
if copy_cplex == 1:
cplex_file = os.path.join(reeds_path,'cplex.opt')
else:
cplex_file = os.path.join(reeds_path,f'cplex.op{copy_cplex}')
else:
cplex_file = None
#%% Copy the header from the srun_template.sh file if desired
if copy_srun_template:
srun_template = os.path.join(reeds_path,'srun_template.sh')
writelines_srun = list()
with open(srun_template, 'r') as f:
for line in f:
writelines_srun.append(line.strip())
else:
writelines_srun = list()
#%%### Loop through runs, figure out when they failed, and restart
for case in runs_failed:
casename = os.path.basename(case)
#%% Copy the cplex file if desired
if copy_cplex:
shutil.copy(cplex_file, os.path.join(case,''))
#%% Copy additional files if desired
for f in more_copyfiles:
shutil.copy(os.path.join(reeds_path,f), os.path.join(case,f))
#%% Get solveyears
solveyears = pd.read_csv(
os.path.join(case,'inputs_case','modeledyears.csv')
).columns.astype(int).tolist()
#%% Make a backup copy of the original bash and sbatch scripts
callfile = os.path.join(case,f'call_{casename}.sh')
shutil.copy(callfile, os.path.join(case,f'ORIGINAL_call_{casename}.sh'))
sbatchfile = os.path.join(case,f'{casename}.sh')
shutil.copy(sbatchfile, os.path.join(case,f'ORIGINAL_{casename}.sh'))
#%% Get last .lst file and restart from there
lstfiles = sorted(glob(os.path.join(case,'lstfiles','*.lst')))
if any([os.path.basename(i).startswith('report') for i in lstfiles]):
restart_tag = '# Output processing'
elif len(lstfiles) == 2:
## if there are only 2 lst files, then one of them will be environment.csv and the other will be 1_inputs.lst so the run failed during the model compilation
restart_tag = '# Compile model'
else:
lastfile = lstfiles[-1]
restart_year = int(os.path.splitext(lastfile)[0].split('_')[-1].split('i')[0])
restart_tag = f'# Year: {restart_year}'
#%% Comment out the unnecessary lines
writelines = []
with open(callfile, 'r') as f:
comment = 0
for line in f:
## Start commenting at input processing
if '# Input processing' in line:
comment = 1
## Stop commenting at restart_tag
if line.startswith(restart_tag):
comment = 0
## Record it
writelines.append(('# ' if comment else '') + line.strip())
### Write it
with open(callfile, 'w') as f:
for line in writelines:
f.writelines(line + '\n')
#%% Write the sbatch file with new header, if desired
if copy_srun_template:
writelines_srun_case = writelines_srun.copy()
writelines_srun_case.append(f"\n#SBATCH --job-name={casename}\n")
writelines_srun_case.append(f"sh {callfile}")
with open(sbatchfile, 'w') as f:
for line in writelines_srun_case:
f.writelines(line + '\n')
#%% Run it
sbatch = f'sbatch {sbatchfile}'
sbatchout = subprocess.run(sbatch, capture_output=True, shell=True)
if len(sbatchout.stderr):
print(sbatchout.stderr.decode())
print(f"{casename}: {sbatchout.stdout.decode()}")