Skip to content

Commit

Permalink
Merge branch 'worleyph/cime/compy_perf_arch'
Browse files Browse the repository at this point in the history
Add support for performance archiving on Compy #2943

Added logic for compy to provenance.py, specified
the location of the performance archive in the
compy entry in config_machines.xml (enabled for
all projects) and added the job progress monitoring
script syslog.compy. Also removed performance
archiving support for edison, now that it has been
decommissioned.

[BFB] - Bit-For-Bit
  • Loading branch information
bibiraju committed May 30, 2019
2 parents e783325 + bec2df0 commit 9b9cfe4
Show file tree
Hide file tree
Showing 4 changed files with 125 additions and 98 deletions.
2 changes: 2 additions & 0 deletions cime/config/e3sm/machines/config_machines.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2003,6 +2003,8 @@
<OS>LINUX</OS>
<COMPILERS>intel,pgi</COMPILERS>
<MPILIBS>mvapich2</MPILIBS>
<SAVE_TIMING_DIR>/compyfs</SAVE_TIMING_DIR>
<SAVE_TIMING_DIR_PROJECTS>.*</SAVE_TIMING_DIR_PROJECTS>
<CIME_OUTPUT_ROOT>/compyfs/$USER/e3sm_scratch</CIME_OUTPUT_ROOT>
<DIN_LOC_ROOT>/compyfs/inputdata</DIN_LOC_ROOT>
<DIN_LOC_ROOT_CLMFORC>/compyfs/inputdata/atm/datm7</DIN_LOC_ROOT_CLMFORC>
Expand Down
116 changes: 116 additions & 0 deletions cime/config/e3sm/machines/syslog.compy
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#!/bin/csh -f
# compy syslog script:
# mach_syslog <sampling interval (in seconds)> <job identifier> <timestamp> <run directory> <timing directory> <output directory>
#
# Purpose: periodically sample the progress of a running batch job. Each
# sample appends the most recent progress lines of the component logs to
# per-component files in the output directory, copies timing files, saves
# squeue snapshots, and records the remaining wallclock time (in seconds) in
# <run directory>/Walltime.Remaining. Sampling stops once the computed
# remaining time is no longer positive.
#
# Positional arguments:
#   $1  sample_interval  seconds to sleep between samples
#   $2  jid              job id passed to squeue --job
#   $3  lid              suffix used in the log file names (e3sm.log.$lid, ...)
#   $4  run              directory holding the model log files
#   $5  timing           directory whose files are copied into the output directory
#   $6  dir              output directory for everything this script saves

set sample_interval = $1
set jid = $2
set lid = $3
set run = $4
set timing = $5
set dir = $6

# Wait until job task-to-node mapping information is output before saving output file.
# Target length was determined empirically (maximum number of lines before job mapping
# information starts + number of nodes), and it may need to be adjusted in the future.
# (Note that calling script 'touch'es the e3sm log file before spawning this script, so that 'wc' does not fail.)
# The squeue %D field is the node count; sed strips leading zeros and an empty
# reply falls back to 0 so that the arithmetic below still works.
set nnodes = `squeue --noheader -o '%D' --job $jid | sed 's/^0*\([0-9]*\)/\1/' `
if ("X$nnodes" == "X") set nnodes = 0
@ target_lines = 150 + $nnodes
sleep 10
# Line count of the e3sm log; sed keeps only the leading number of the wc output.
set outlth = `wc \-l $run/e3sm.log.$lid | sed 's/ *\([0-9]*\) *.*/\1/' `
# Poll once a minute until the log has at least target_lines lines.
# NOTE(review): there is no timeout here; if the log never grows that long this
# loop only ends when the script is terminated from outside - confirm that the
# caller does so.
while ($outlth < $target_lines)
sleep 60
set outlth = `wc \-l $run/e3sm.log.$lid | sed 's/ *\([0-9]*\) *.*/\1/' `
end

# Query the time left for the job and split it into days/hours/minutes/seconds.
# Three layouts are handled, chosen by the separators present in the reply:
#   contains a dash   -> D-HH:MM:SS
#   has two colons    -> HH:MM:SS
#   otherwise         -> MM:SS
# Every sed pattern also strips leading zeros from each field.
set TimeLeft = `squeue --noheader -O 'timeleft' --job $jid `
set TimeLeftwday = `echo $TimeLeft | grep '-' `
if ("X$TimeLeftwday" == "X") then
set left_days = 0
set TimeLeftwhour = `echo $TimeLeft | grep '.*:.*:.*' `
if ("X$TimeLeftwhour" == "X") then
set left_hours = 0
set left_mins = `echo $TimeLeft | sed 's/^0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set left_secs = `echo $TimeLeft | sed 's/^0*\([0-9]*\):0*\([0-9]*\)/\2/' `
else
set left_hours = `echo $TimeLeft | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set left_mins = `echo $TimeLeft | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\2/' `
set left_secs = `echo $TimeLeft | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\3/' `
endif
else
set left_days = `echo $TimeLeft | sed 's/^0*\([0-9]*\)-0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set left_hours = `echo $TimeLeft | sed 's/^0*\([0-9]*\)-0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\2/' `
set left_mins = `echo $TimeLeft | sed 's/^0*\([0-9]*\)-0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\3/' `
set left_secs = `echo $TimeLeft | sed 's/^0*\([0-9]*\)-0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\4/' `
endif

# A field that was all zeros (or an empty squeue reply) is an empty string at
# this point; replace it with 0 before doing arithmetic.
# NOTE(review): a non-numeric reply that matches none of the sed patterns is
# passed through unchanged and would break the arithmetic below - confirm
# whether squeue can return such a value here.
if ("X$left_days" == "X") set left_days = 0
if ("X$left_hours" == "X") set left_hours = 0
if ("X$left_mins" == "X") set left_mins = 0
if ("X$left_secs" == "X") set left_secs = 0
# Remaining wallclock time in seconds.
@ remaining = 86400 * $left_days + 3600 * $left_hours + 60 * $left_mins + $left_secs
# Record the remaining time and the sampling interval (overwritten on every sample).
cat > $run/Walltime.Remaining <<EOF1
$remaining $sample_interval
EOF1
# Save a copy of the e3sm log, tagged with the remaining time, keeping its timestamps.
/bin/cp --preserve=timestamps $run/e3sm.log.$lid $dir/e3sm.log.$lid.$remaining
# The sampling loop below is skipped when no time remains, so take the squeue
# snapshots (running jobs, and job steps without lines containing extern) here.
if ($remaining <= 0) then
squeue -t R -o "%.10i %.10P %.15u %.20a %.2t %.6D %.8C %.12M %.12l %j" > $dir/squeuef.$lid.$remaining
squeue -s | grep -v -F extern > $dir/squeues.$lid.$remaining
endif

# Main sampling loop: one pass per sample until the remaining time reaches 0.
while ($remaining > 0)
# For each component log, append a header line with the remaining time followed
# by the last 4 lines that contain any of the listed fixed strings
# (grep -F: fixed strings, -a: treat the file as text).
echo "Wallclock time remaining: $remaining" >> $dir/atm.log.$lid.step
grep -Fa -e "nstep" -e "model date" $run/*atm.log.$lid | tail -n 4 >> $dir/atm.log.$lid.step
echo "Wallclock time remaining: $remaining" >> $dir/lnd.log.$lid.step
grep -Fa -e "timestep" -e "model date" $run/*lnd.log.$lid | tail -n 4 >> $dir/lnd.log.$lid.step
echo "Wallclock time remaining: $remaining" >> $dir/ocn.log.$lid.step
grep -Fa -e "timestep" -e "Step number" -e "model date" $run/*ocn.log.$lid | tail -n 4 >> $dir/ocn.log.$lid.step
echo "Wallclock time remaining: $remaining" >> $dir/ice.log.$lid.step
grep -Fa -e "timestep" -e "istep" -e "model date" $run/*ice.log.$lid | tail -n 4 >> $dir/ice.log.$lid.step
echo "Wallclock time remaining: $remaining" >> $dir/rof.log.$lid.step
grep -Fa "model date" $run/*rof.log.$lid | tail -n 4 >> $dir/rof.log.$lid.step
# Coupler log: step-all is rewritten each sample with every matching line,
# and its last 4 lines are appended to the incremental step file.
grep -Fa "model date" $run/*cpl.log.$lid > $dir/cpl.log.$lid.step-all
echo "Wallclock time remaining: $remaining" >> $dir/cpl.log.$lid.step
tail -n 4 $dir/cpl.log.$lid.step-all >> $dir/cpl.log.$lid.step
# Copy timing files that are new or newer than the saved copy (-u), keeping timestamps.
/bin/cp --preserve=timestamps -u $timing/* $dir
# Snapshot of running jobs and of job steps, tagged with the remaining time.
squeue -t R -o "%.10i %.10P %.15u %.20a %.2t %.6D %.8C %.12M %.12l %j" > $dir/squeuef.$lid.$remaining
squeue -s | grep -v -F extern > $dir/squeues.$lid.$remaining
# Make everything saved so far world-readable.
chmod a+r $dir/*
# Sleep for sample_interval seconds in total, in slices of at most 120 seconds
# (this replaces a single sleep of the full interval).
# NOTE(review): the reason for slicing the sleep is not stated in this file.
set sleep_remaining = $sample_interval
while ($sleep_remaining > 120)
sleep 120
@ sleep_remaining = $sleep_remaining - 120
end
sleep $sleep_remaining
# Query the remaining time again, parsed exactly as before the loop.
# If squeue prints nothing, every field falls back to 0, remaining becomes 0
# and the loop ends (presumably the case once the job has left the queue).
set TimeLeft = `squeue --noheader -O 'timeleft' --job $jid `
set TimeLeftwday = `echo $TimeLeft | grep '-' `
if ("X$TimeLeftwday" == "X") then
set left_days = 0
set TimeLeftwhour = `echo $TimeLeft | grep '.*:.*:.*' `
if ("X$TimeLeftwhour" == "X") then
set left_hours = 0
set left_mins = `echo $TimeLeft | sed 's/^0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set left_secs = `echo $TimeLeft | sed 's/^0*\([0-9]*\):0*\([0-9]*\)/\2/' `
else
set left_hours = `echo $TimeLeft | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set left_mins = `echo $TimeLeft | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\2/' `
set left_secs = `echo $TimeLeft | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\3/' `
endif
else
set left_days = `echo $TimeLeft | sed 's/^0*\([0-9]*\)-0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set left_hours = `echo $TimeLeft | sed 's/^0*\([0-9]*\)-0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\2/' `
set left_mins = `echo $TimeLeft | sed 's/^0*\([0-9]*\)-0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\3/' `
set left_secs = `echo $TimeLeft | sed 's/^0*\([0-9]*\)-0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\4/' `
endif
# Empty fields become 0 before the arithmetic, as above.
if ("X$left_days" == "X") set left_days = 0
if ("X$left_hours" == "X") set left_hours = 0
if ("X$left_mins" == "X") set left_mins = 0
if ("X$left_secs" == "X") set left_secs = 0
@ remaining = 86400 * $left_days + 3600 * $left_hours + 60 * $left_mins + $left_secs
# Refresh the remaining-time record for this sample.
cat > $run/Walltime.Remaining << EOF2
$remaining $sample_interval
EOF2

end
94 changes: 0 additions & 94 deletions cime/config/e3sm/machines/syslog.edison

This file was deleted.

11 changes: 7 additions & 4 deletions cime/scripts/lib/CIME/provenance.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def _get_batch_job_id_for_syslog(case):
try:
if mach in ['titan']:
return os.environ["PBS_JOBID"]
elif mach in ['anvil', 'edison', 'cori-haswell', 'cori-knl']:
elif mach in ['anvil', 'compy', 'cori-haswell', 'cori-knl']:
return os.environ["SLURM_JOB_ID"]
elif mach in ['mira', 'theta']:
return os.environ["COBALT_JOBID"]
Expand Down Expand Up @@ -169,7 +169,7 @@ def _save_prerun_timing_e3sm(case, lid):
filename = "%s.%s" % (filename, lid)
run_cmd_no_fail(cmd, arg_stdout=filename, from_dir=full_timing_dir)
gzip_existing_file(os.path.join(full_timing_dir, filename))
elif mach in ["edison", "cori-haswell", "cori-knl"]:
elif mach in ["cori-haswell", "cori-knl"]:
for cmd, filename in [("sinfo -a -l", "sinfol"), ("sqs -f %s" % job_id, "sqsf_jobid"),
# ("sqs -f", "sqsf"),
("squeue -o '%.10i %.15P %.20j %.10u %.7a %.2t %.6D %.8C %.10M %.10l %.20S %.20V'", "squeuef"),
Expand All @@ -186,7 +186,7 @@ def _save_prerun_timing_e3sm(case, lid):
full_cmd = cmd + " " + filename
run_cmd_no_fail(full_cmd + "." + lid, from_dir=full_timing_dir)
gzip_existing_file(os.path.join(full_timing_dir, filename + "." + lid))
elif mach == "anvil":
elif mach in ["anvil", "compy"]:
for cmd, filename in [("sinfo -l", "sinfol"),
("squeue -o '%all' --job {}".format(job_id), "squeueall_jobid"),
("squeue -o '%.10i %.10P %.15u %.20a %.2t %.6D %.8C %.12M %.12l %.20S %.20V %j'", "squeuef"),
Expand Down Expand Up @@ -360,11 +360,14 @@ def _save_postrun_timing_e3sm(case, lid):
globs_to_copy.append("%s*OU" % job_id)
elif mach == "anvil":
globs_to_copy.append("%s*run*%s" % (case.get_value("CASE"), job_id))
elif mach == "compy":
globs_to_copy.append("slurm.err")
globs_to_copy.append("slurm.out")
elif mach in ["mira", "theta"]:
globs_to_copy.append("%s*error" % job_id)
globs_to_copy.append("%s*output" % job_id)
globs_to_copy.append("%s*cobaltlog" % job_id)
elif mach in ["edison", "cori-haswell", "cori-knl"]:
elif mach in ["cori-haswell", "cori-knl"]:
globs_to_copy.append("%s*run*%s" % (case.get_value("CASE"), job_id))
elif mach == "summit":
globs_to_copy.append("e3sm.stderr.%s" % job_id)
Expand Down

0 comments on commit 9b9cfe4

Please sign in to comment.