Skip to content

Commit

Permalink
Add support for performance archiving on Compy
Browse files Browse the repository at this point in the history
Added logic for compy to provenance.py, specified
the location of the performance archive in the
compy entry in config_machines.xml (enabled for
all projects) and added the job progress monitoring
script syslog.compy. Also removed performance
archiving support for edison, now that it has been
decommissioned.

[BFB] - Bit-For-Bit
  • Loading branch information
worleyph committed May 24, 2019
1 parent fe07217 commit bec2df0
Show file tree
Hide file tree
Showing 4 changed files with 125 additions and 98 deletions.
2 changes: 2 additions & 0 deletions cime/config/e3sm/machines/config_machines.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2000,6 +2000,8 @@
<OS>LINUX</OS>
<COMPILERS>intel,pgi</COMPILERS>
<MPILIBS>mvapich2</MPILIBS>
<SAVE_TIMING_DIR>/compyfs</SAVE_TIMING_DIR>
<SAVE_TIMING_DIR_PROJECTS>.*</SAVE_TIMING_DIR_PROJECTS>
<CIME_OUTPUT_ROOT>/compyfs/$USER/e3sm_scratch</CIME_OUTPUT_ROOT>
<DIN_LOC_ROOT>/compyfs/inputdata</DIN_LOC_ROOT>
<DIN_LOC_ROOT_CLMFORC>/compyfs/inputdata/atm/datm7</DIN_LOC_ROOT_CLMFORC>
Expand Down
116 changes: 116 additions & 0 deletions cime/config/e3sm/machines/syslog.compy
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#!/bin/csh -f
# compy syslog script:
# mach_syslog <sampling interval (in seconds)> <job identifier> <timestamp> <run directory> <timing directory> <output directory>
#
# Job progress monitor. After the e3sm log has grown past a target length it
# saves a copy of that log, then, for as long as the batch system reports
# wallclock time remaining for the job, it periodically:
#   - appends the most recent progress lines of each component log to
#     per-component .step files in the output directory,
#   - copies updated files from the timing directory to the output directory,
#   - saves snapshots of the batch queue,
#   - records the remaining wallclock time in Walltime.Remaining in the
#     run directory.

# Positional arguments, in the order given in the usage line above.
set sample_interval = $1
set jid = $2
set lid = $3
set run = $4
set timing = $5
set dir = $6

# Wait until job task-to-node mapping information is output before saving output file.
# Target length was determined empirically (maximum number of lines before job mapping
# information starts + number of nodes), and it may need to be adjusted in the future.
# (Note that calling script 'touch'es the e3sm log file before spawning this script, so that 'wc' does not fail.)
# Node count for the job as reported by squeue, with leading zeros stripped;
# falls back to 0 when squeue returns nothing.
set nnodes = `squeue --noheader -o '%D' --job $jid | sed 's/^0*\([0-9]*\)/\1/' `
if ("X$nnodes" == "X") set nnodes = 0
@ target_lines = 150 + $nnodes
sleep 10
# Poll the line count of the e3sm log once a minute until it reaches the target.
set outlth = `wc \-l $run/e3sm.log.$lid | sed 's/ *\([0-9]*\) *.*/\1/' `
while ($outlth < $target_lines)
sleep 60
set outlth = `wc \-l $run/e3sm.log.$lid | sed 's/ *\([0-9]*\) *.*/\1/' `
end

# Query the time left for the job and split it into days, hours, minutes and
# seconds. Three layouts are handled: days-hours:minutes:seconds (contains a
# dash), hours:minutes:seconds (two colons), and minutes:seconds otherwise.
# Each sed expression drops the leading zeros of every field; presumably this
# keeps the @ arithmetic below from treating the values as octal - TODO confirm.
set TimeLeft = `squeue --noheader -O 'timeleft' --job $jid `
set TimeLeftwday = `echo $TimeLeft | grep '-' `
if ("X$TimeLeftwday" == "X") then
# No dash in the value: there is no day field.
set left_days = 0
set TimeLeftwhour = `echo $TimeLeft | grep '.*:.*:.*' `
if ("X$TimeLeftwhour" == "X") then
# Fewer than two colons: minutes:seconds only.
set left_hours = 0
set left_mins = `echo $TimeLeft | sed 's/^0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set left_secs = `echo $TimeLeft | sed 's/^0*\([0-9]*\):0*\([0-9]*\)/\2/' `
else
# hours:minutes:seconds
set left_hours = `echo $TimeLeft | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set left_mins = `echo $TimeLeft | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\2/' `
set left_secs = `echo $TimeLeft | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\3/' `
endif
else
# days-hours:minutes:seconds
set left_days = `echo $TimeLeft | sed 's/^0*\([0-9]*\)-0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set left_hours = `echo $TimeLeft | sed 's/^0*\([0-9]*\)-0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\2/' `
set left_mins = `echo $TimeLeft | sed 's/^0*\([0-9]*\)-0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\3/' `
set left_secs = `echo $TimeLeft | sed 's/^0*\([0-9]*\)-0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\4/' `
endif

# A field that was all zeros (or an empty squeue reply) is left empty by the
# stripping above; treat every empty field as 0.
if ("X$left_days" == "X") set left_days = 0
if ("X$left_hours" == "X") set left_hours = 0
if ("X$left_mins" == "X") set left_mins = 0
if ("X$left_secs" == "X") set left_secs = 0
# Total remaining wallclock time in seconds.
@ remaining = 86400 * $left_days + 3600 * $left_hours + 60 * $left_mins + $left_secs
# Record the remaining time and the sampling interval in the run directory.
cat > $run/Walltime.Remaining <<EOF1
$remaining $sample_interval
EOF1
# Save a copy of the e3sm log, tagged with the remaining time at this point.
/bin/cp --preserve=timestamps $run/e3sm.log.$lid $dir/e3sm.log.$lid.$remaining
# The sampling loop below is skipped when no time remains, so take the queue
# snapshots here in that case.
if ($remaining <= 0) then
squeue -t R -o "%.10i %.10P %.15u %.20a %.2t %.6D %.8C %.12M %.12l %j" > $dir/squeuef.$lid.$remaining
squeue -s | grep -v -F extern > $dir/squeues.$lid.$remaining
endif

# Sampling loop: one pass per sampling interval while time remains.
while ($remaining > 0)
# For each component, append a header with the remaining time followed by the
# last four matching progress lines of its log to the matching .step file.
echo "Wallclock time remaining: $remaining" >> $dir/atm.log.$lid.step
grep -Fa -e "nstep" -e "model date" $run/*atm.log.$lid | tail -n 4 >> $dir/atm.log.$lid.step
echo "Wallclock time remaining: $remaining" >> $dir/lnd.log.$lid.step
grep -Fa -e "timestep" -e "model date" $run/*lnd.log.$lid | tail -n 4 >> $dir/lnd.log.$lid.step
echo "Wallclock time remaining: $remaining" >> $dir/ocn.log.$lid.step
grep -Fa -e "timestep" -e "Step number" -e "model date" $run/*ocn.log.$lid | tail -n 4 >> $dir/ocn.log.$lid.step
echo "Wallclock time remaining: $remaining" >> $dir/ice.log.$lid.step
grep -Fa -e "timestep" -e "istep" -e "model date" $run/*ice.log.$lid | tail -n 4 >> $dir/ice.log.$lid.step
echo "Wallclock time remaining: $remaining" >> $dir/rof.log.$lid.step
grep -Fa "model date" $run/*rof.log.$lid | tail -n 4 >> $dir/rof.log.$lid.step
# Coupler log: the step-all file is rewritten on every pass with all matching
# lines so far, and its last four lines are appended to the .step file.
grep -Fa "model date" $run/*cpl.log.$lid > $dir/cpl.log.$lid.step-all
echo "Wallclock time remaining: $remaining" >> $dir/cpl.log.$lid.step
tail -n 4 $dir/cpl.log.$lid.step-all >> $dir/cpl.log.$lid.step
# Copy timing files that are new or newer than the copies already saved.
/bin/cp --preserve=timestamps -u $timing/* $dir
# Queue snapshots, tagged with the remaining time.
squeue -t R -o "%.10i %.10P %.15u %.20a %.2t %.6D %.8C %.12M %.12l %j" > $dir/squeuef.$lid.$remaining
squeue -s | grep -v -F extern > $dir/squeues.$lid.$remaining
# Make everything saved so far readable by all users.
chmod a+r $dir/*
# sleep $sample_interval
# The single sleep above is replaced by sleeps of at most 120 seconds that add
# up to the sampling interval. NOTE(review): the reason for chunking the sleep
# is not stated here - confirm before changing.
set sleep_remaining = $sample_interval
while ($sleep_remaining > 120)
sleep 120
@ sleep_remaining = $sleep_remaining - 120
end
sleep $sleep_remaining
# query remaining time
# Same parsing as before the loop: split the value into days, hours, minutes
# and seconds according to which separators are present.
set TimeLeft = `squeue --noheader -O 'timeleft' --job $jid `
set TimeLeftwday = `echo $TimeLeft | grep '-' `
if ("X$TimeLeftwday" == "X") then
set left_days = 0
set TimeLeftwhour = `echo $TimeLeft | grep '.*:.*:.*' `
if ("X$TimeLeftwhour" == "X") then
set left_hours = 0
set left_mins = `echo $TimeLeft | sed 's/^0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set left_secs = `echo $TimeLeft | sed 's/^0*\([0-9]*\):0*\([0-9]*\)/\2/' `
else
set left_hours = `echo $TimeLeft | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set left_mins = `echo $TimeLeft | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\2/' `
set left_secs = `echo $TimeLeft | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\3/' `
endif
else
set left_days = `echo $TimeLeft | sed 's/^0*\([0-9]*\)-0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set left_hours = `echo $TimeLeft | sed 's/^0*\([0-9]*\)-0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\2/' `
set left_mins = `echo $TimeLeft | sed 's/^0*\([0-9]*\)-0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\3/' `
set left_secs = `echo $TimeLeft | sed 's/^0*\([0-9]*\)-0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\4/' `
endif
# Empty fields count as 0. When squeue returns nothing for the job, every
# field is empty, the total becomes 0 and the loop ends.
if ("X$left_days" == "X") set left_days = 0
if ("X$left_hours" == "X") set left_hours = 0
if ("X$left_mins" == "X") set left_mins = 0
if ("X$left_secs" == "X") set left_secs = 0
@ remaining = 86400 * $left_days + 3600 * $left_hours + 60 * $left_mins + $left_secs
# Refresh the remaining time recorded in the run directory.
cat > $run/Walltime.Remaining << EOF2
$remaining $sample_interval
EOF2

end
94 changes: 0 additions & 94 deletions cime/config/e3sm/machines/syslog.edison

This file was deleted.

11 changes: 7 additions & 4 deletions cime/scripts/lib/CIME/provenance.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def _get_batch_job_id_for_syslog(case):
try:
if mach in ['titan']:
return os.environ["PBS_JOBID"]
elif mach in ['anvil', 'edison', 'cori-haswell', 'cori-knl']:
elif mach in ['anvil', 'compy', 'cori-haswell', 'cori-knl']:
return os.environ["SLURM_JOB_ID"]
elif mach in ['mira', 'theta']:
return os.environ["COBALT_JOBID"]
Expand Down Expand Up @@ -169,7 +169,7 @@ def _save_prerun_timing_e3sm(case, lid):
filename = "%s.%s" % (filename, lid)
run_cmd_no_fail(cmd, arg_stdout=filename, from_dir=full_timing_dir)
gzip_existing_file(os.path.join(full_timing_dir, filename))
elif mach in ["edison", "cori-haswell", "cori-knl"]:
elif mach in ["cori-haswell", "cori-knl"]:
for cmd, filename in [("sinfo -a -l", "sinfol"), ("sqs -f %s" % job_id, "sqsf_jobid"),
# ("sqs -f", "sqsf"),
("squeue -o '%.10i %.15P %.20j %.10u %.7a %.2t %.6D %.8C %.10M %.10l %.20S %.20V'", "squeuef"),
Expand All @@ -186,7 +186,7 @@ def _save_prerun_timing_e3sm(case, lid):
full_cmd = cmd + " " + filename
run_cmd_no_fail(full_cmd + "." + lid, from_dir=full_timing_dir)
gzip_existing_file(os.path.join(full_timing_dir, filename + "." + lid))
elif mach == "anvil":
elif mach in ["anvil", "compy"]:
for cmd, filename in [("sinfo -l", "sinfol"),
("squeue -o '%all' --job {}".format(job_id), "squeueall_jobid"),
("squeue -o '%.10i %.10P %.15u %.20a %.2t %.6D %.8C %.12M %.12l %.20S %.20V %j'", "squeuef"),
Expand Down Expand Up @@ -360,11 +360,14 @@ def _save_postrun_timing_e3sm(case, lid):
globs_to_copy.append("%s*OU" % job_id)
elif mach == "anvil":
globs_to_copy.append("%s*run*%s" % (case.get_value("CASE"), job_id))
elif mach == "compy":
globs_to_copy.append("slurm.err")
globs_to_copy.append("slurm.out")
elif mach in ["mira", "theta"]:
globs_to_copy.append("%s*error" % job_id)
globs_to_copy.append("%s*output" % job_id)
globs_to_copy.append("%s*cobaltlog" % job_id)
elif mach in ["edison", "cori-haswell", "cori-knl"]:
elif mach in ["cori-haswell", "cori-knl"]:
globs_to_copy.append("%s*run*%s" % (case.get_value("CASE"), job_id))
elif mach == "summit":
globs_to_copy.append("e3sm.stderr.%s" % job_id)
Expand Down

0 comments on commit bec2df0

Please sign in to comment.