Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for performance archiving on Compy #2943

Merged
merged 1 commit into from
May 30, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cime/config/e3sm/machines/config_machines.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2000,6 +2000,8 @@
<OS>LINUX</OS>
<COMPILERS>intel,pgi</COMPILERS>
<MPILIBS>mvapich2</MPILIBS>
<SAVE_TIMING_DIR>/compyfs</SAVE_TIMING_DIR>
<SAVE_TIMING_DIR_PROJECTS>.*</SAVE_TIMING_DIR_PROJECTS>
<CIME_OUTPUT_ROOT>/compyfs/$USER/e3sm_scratch</CIME_OUTPUT_ROOT>
<DIN_LOC_ROOT>/compyfs/inputdata</DIN_LOC_ROOT>
<DIN_LOC_ROOT_CLMFORC>/compyfs/inputdata/atm/datm7</DIN_LOC_ROOT_CLMFORC>
Expand Down
116 changes: 116 additions & 0 deletions cime/config/e3sm/machines/syslog.compy
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#!/bin/csh -f
# compy syslog script:
# mach_syslog <sampling interval (in seconds)> <job identifier> <timestamp> <run directory> <timing directory> <output directory>
#
# Purpose: while a batch job is running, periodically save progress
# information into the output directory: the tail of each component log,
# updated files from the timing directory, and snapshots of the batch queue.
# On every pass the remaining wallclock time (in seconds) and the sampling
# interval are also written to <run directory>/Walltime.Remaining.
# The script ends once squeue reports no remaining time for the job.

# Positional arguments, in the order given in the usage line above.
set sample_interval = $1
set jid = $2
set lid = $3
set run = $4
set timing = $5
set dir = $6

# Wait until job task-to-node mapping information is output before saving output file.
# Target length was determined empirically (maximum number of lines before job mapping
# information starts + number of nodes), and it may need to be adjusted in the future.
# (Note that calling script 'touch'es the e3sm log file before spawning this script, so that 'wc' does not fail.)
# The node count comes from squeue (format field %D) with leading zeros removed;
# if squeue prints nothing, the node count falls back to 0.
set nnodes = `squeue --noheader -o '%D' --job $jid | sed 's/^0*\([0-9]*\)/\1/' `
if ("X$nnodes" == "X") set nnodes = 0
@ target_lines = 150 + $nnodes
sleep 10
# Poll the line count of the e3sm log once a minute until it reaches the target.
set outlth = `wc \-l $run/e3sm.log.$lid | sed 's/ *\([0-9]*\) *.*/\1/' `
while ($outlth < $target_lines)
sleep 60
set outlth = `wc \-l $run/e3sm.log.$lid | sed 's/ *\([0-9]*\) *.*/\1/' `
end

# Query the remaining wallclock time of the job and split it into fields.
# Three layouts are handled, selected by the two grep probes below:
#   contains a dash         -> days-hours:minutes:seconds
#   has two colons, no dash -> hours:minutes:seconds
#   otherwise               -> minutes:seconds
# Each sed expression drops leading zeros from every field
# (presumably so that the @ arithmetic below does not read them as octal - TODO confirm).
set TimeLeft = `squeue --noheader -O 'timeleft' --job $jid `
set TimeLeftwday = `echo $TimeLeft | grep '-' `
if ("X$TimeLeftwday" == "X") then
set left_days = 0
set TimeLeftwhour = `echo $TimeLeft | grep '.*:.*:.*' `
if ("X$TimeLeftwhour" == "X") then
set left_hours = 0
set left_mins = `echo $TimeLeft | sed 's/^0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set left_secs = `echo $TimeLeft | sed 's/^0*\([0-9]*\):0*\([0-9]*\)/\2/' `
else
set left_hours = `echo $TimeLeft | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set left_mins = `echo $TimeLeft | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\2/' `
set left_secs = `echo $TimeLeft | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\3/' `
endif
else
set left_days = `echo $TimeLeft | sed 's/^0*\([0-9]*\)-0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set left_hours = `echo $TimeLeft | sed 's/^0*\([0-9]*\)-0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\2/' `
set left_mins = `echo $TimeLeft | sed 's/^0*\([0-9]*\)-0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\3/' `
set left_secs = `echo $TimeLeft | sed 's/^0*\([0-9]*\)-0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\4/' `
endif

# A field that was all zeros (or a missing squeue reply) is empty at this
# point; treat every empty field as 0 so the arithmetic below is well formed.
if ("X$left_days" == "X") set left_days = 0
if ("X$left_hours" == "X") set left_hours = 0
if ("X$left_mins" == "X") set left_mins = 0
if ("X$left_secs" == "X") set left_secs = 0
# Remaining wallclock time in seconds.
@ remaining = 86400 * $left_days + 3600 * $left_hours + 60 * $left_mins + $left_secs
# Overwrite Walltime.Remaining in the run directory with the remaining time and sampling interval.
cat > $run/Walltime.Remaining <<EOF1
$remaining $sample_interval
EOF1
# Save a copy of the e3sm log (now containing the first target_lines lines),
# tagged with the remaining time at the moment of the copy.
/bin/cp --preserve=timestamps $run/e3sm.log.$lid $dir/e3sm.log.$lid.$remaining
# If no time is left the sampling loop below never runs, so take the
# queue snapshots once here instead.
if ($remaining <= 0) then
squeue -t R -o "%.10i %.10P %.15u %.20a %.2t %.6D %.8C %.12M %.12l %j" > $dir/squeuef.$lid.$remaining
squeue -s | grep -v -F extern > $dir/squeues.$lid.$remaining
endif

# Sampling loop: one pass per sampling interval while wallclock time remains.
while ($remaining > 0)
# For each component log, append a header with the remaining time followed by
# the last 4 lines that match the progress markers for that component.
echo "Wallclock time remaining: $remaining" >> $dir/atm.log.$lid.step
grep -Fa -e "nstep" -e "model date" $run/*atm.log.$lid | tail -n 4 >> $dir/atm.log.$lid.step
echo "Wallclock time remaining: $remaining" >> $dir/lnd.log.$lid.step
grep -Fa -e "timestep" -e "model date" $run/*lnd.log.$lid | tail -n 4 >> $dir/lnd.log.$lid.step
echo "Wallclock time remaining: $remaining" >> $dir/ocn.log.$lid.step
grep -Fa -e "timestep" -e "Step number" -e "model date" $run/*ocn.log.$lid | tail -n 4 >> $dir/ocn.log.$lid.step
echo "Wallclock time remaining: $remaining" >> $dir/ice.log.$lid.step
grep -Fa -e "timestep" -e "istep" -e "model date" $run/*ice.log.$lid | tail -n 4 >> $dir/ice.log.$lid.step
echo "Wallclock time remaining: $remaining" >> $dir/rof.log.$lid.step
grep -Fa "model date" $run/*rof.log.$lid | tail -n 4 >> $dir/rof.log.$lid.step
# Coupler log: the step-all file is rewritten on every pass with all matching
# lines, and only its last 4 lines are appended to the cumulative step file.
grep -Fa "model date" $run/*cpl.log.$lid > $dir/cpl.log.$lid.step-all
echo "Wallclock time remaining: $remaining" >> $dir/cpl.log.$lid.step
tail -n 4 $dir/cpl.log.$lid.step-all >> $dir/cpl.log.$lid.step
# Copy timing files that are newer than the saved copies (cp -u).
/bin/cp --preserve=timestamps -u $timing/* $dir
# Snapshot of running jobs and of job steps, tagged with the remaining time.
squeue -t R -o "%.10i %.10P %.15u %.20a %.2t %.6D %.8C %.12M %.12l %j" > $dir/squeuef.$lid.$remaining
squeue -s | grep -v -F extern > $dir/squeues.$lid.$remaining
# Make everything saved so far world-readable.
chmod a+r $dir/*
# Sleep for sample_interval seconds in total, split into pieces of at most
# 120 seconds followed by one final sleep for the remainder.
set sleep_remaining = $sample_interval
while ($sleep_remaining > 120)
sleep 120
@ sleep_remaining = $sleep_remaining - 120
end
sleep $sleep_remaining
# query remaining time
# (same parsing as before the loop: days-hours:minutes:seconds,
# hours:minutes:seconds, or minutes:seconds, with leading zeros dropped)
set TimeLeft = `squeue --noheader -O 'timeleft' --job $jid `
set TimeLeftwday = `echo $TimeLeft | grep '-' `
if ("X$TimeLeftwday" == "X") then
set left_days = 0
set TimeLeftwhour = `echo $TimeLeft | grep '.*:.*:.*' `
if ("X$TimeLeftwhour" == "X") then
set left_hours = 0
set left_mins = `echo $TimeLeft | sed 's/^0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set left_secs = `echo $TimeLeft | sed 's/^0*\([0-9]*\):0*\([0-9]*\)/\2/' `
else
set left_hours = `echo $TimeLeft | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set left_mins = `echo $TimeLeft | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\2/' `
set left_secs = `echo $TimeLeft | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\3/' `
endif
else
set left_days = `echo $TimeLeft | sed 's/^0*\([0-9]*\)-0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set left_hours = `echo $TimeLeft | sed 's/^0*\([0-9]*\)-0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\2/' `
set left_mins = `echo $TimeLeft | sed 's/^0*\([0-9]*\)-0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\3/' `
set left_secs = `echo $TimeLeft | sed 's/^0*\([0-9]*\)-0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\4/' `
endif
# Empty fields count as 0; when squeue prints nothing for the job all fields
# are empty, remaining becomes 0, and the loop ends.
if ("X$left_days" == "X") set left_days = 0
if ("X$left_hours" == "X") set left_hours = 0
if ("X$left_mins" == "X") set left_mins = 0
if ("X$left_secs" == "X") set left_secs = 0
@ remaining = 86400 * $left_days + 3600 * $left_hours + 60 * $left_mins + $left_secs
# Refresh Walltime.Remaining with the newly computed value.
cat > $run/Walltime.Remaining << EOF2
$remaining $sample_interval
EOF2

end
94 changes: 0 additions & 94 deletions cime/config/e3sm/machines/syslog.edison

This file was deleted.

11 changes: 7 additions & 4 deletions cime/scripts/lib/CIME/provenance.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def _get_batch_job_id_for_syslog(case):
try:
if mach in ['titan']:
return os.environ["PBS_JOBID"]
elif mach in ['anvil', 'edison', 'cori-haswell', 'cori-knl']:
elif mach in ['anvil', 'compy', 'cori-haswell', 'cori-knl']:
return os.environ["SLURM_JOB_ID"]
elif mach in ['mira', 'theta']:
return os.environ["COBALT_JOBID"]
Expand Down Expand Up @@ -169,7 +169,7 @@ def _save_prerun_timing_e3sm(case, lid):
filename = "%s.%s" % (filename, lid)
run_cmd_no_fail(cmd, arg_stdout=filename, from_dir=full_timing_dir)
gzip_existing_file(os.path.join(full_timing_dir, filename))
elif mach in ["edison", "cori-haswell", "cori-knl"]:
elif mach in ["cori-haswell", "cori-knl"]:
for cmd, filename in [("sinfo -a -l", "sinfol"), ("sqs -f %s" % job_id, "sqsf_jobid"),
# ("sqs -f", "sqsf"),
("squeue -o '%.10i %.15P %.20j %.10u %.7a %.2t %.6D %.8C %.10M %.10l %.20S %.20V'", "squeuef"),
Expand All @@ -186,7 +186,7 @@ def _save_prerun_timing_e3sm(case, lid):
full_cmd = cmd + " " + filename
run_cmd_no_fail(full_cmd + "." + lid, from_dir=full_timing_dir)
gzip_existing_file(os.path.join(full_timing_dir, filename + "." + lid))
elif mach == "anvil":
elif mach in ["anvil", "compy"]:
for cmd, filename in [("sinfo -l", "sinfol"),
("squeue -o '%all' --job {}".format(job_id), "squeueall_jobid"),
("squeue -o '%.10i %.10P %.15u %.20a %.2t %.6D %.8C %.12M %.12l %.20S %.20V %j'", "squeuef"),
Expand Down Expand Up @@ -360,11 +360,14 @@ def _save_postrun_timing_e3sm(case, lid):
globs_to_copy.append("%s*OU" % job_id)
elif mach == "anvil":
globs_to_copy.append("%s*run*%s" % (case.get_value("CASE"), job_id))
elif mach == "compy":
globs_to_copy.append("slurm.err")
globs_to_copy.append("slurm.out")
elif mach in ["mira", "theta"]:
globs_to_copy.append("%s*error" % job_id)
globs_to_copy.append("%s*output" % job_id)
globs_to_copy.append("%s*cobaltlog" % job_id)
elif mach in ["edison", "cori-haswell", "cori-knl"]:
elif mach in ["cori-haswell", "cori-knl"]:
globs_to_copy.append("%s*run*%s" % (case.get_value("CASE"), job_id))
elif mach == "summit":
globs_to_copy.append("e3sm.stderr.%s" % job_id)
Expand Down