From fa65c6a16b741bcd44012427eeafbf0a7233ad57 Mon Sep 17 00:00:00 2001 From: David Huber Date: Tue, 4 Feb 2025 14:21:47 -0600 Subject: [PATCH 1/8] Increase GDASApp build wallclock --- workflow/build_opts.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/build_opts.yaml b/workflow/build_opts.yaml index 464701c2f3..29bcc965ec 100644 --- a/workflow/build_opts.yaml +++ b/workflow/build_opts.yaml @@ -91,4 +91,4 @@ build: command: "./build_gdas.sh -j 12" log: "build_gdas.log" cores: 12 - walltime: "01:00:00" + walltime: "01:30:00" From 02f5d7b6e6edcd85ae33736610dc96bab8e00f78 Mon Sep 17 00:00:00 2001 From: David Huber Date: Wed, 5 Feb 2025 06:13:41 -0600 Subject: [PATCH 2/8] Increase CPU count --- workflow/build_opts.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/build_opts.yaml b/workflow/build_opts.yaml index 29bcc965ec..18911bbefb 100644 --- a/workflow/build_opts.yaml +++ b/workflow/build_opts.yaml @@ -90,5 +90,5 @@ build: gdas: command: "./build_gdas.sh -j 12" log: "build_gdas.log" - cores: 12 + cores: 24 walltime: "01:30:00" From f60a8e1734538bed3c4415ceb2e6b47fd5117e58 Mon Sep 17 00:00:00 2001 From: David Huber Date: Wed, 5 Feb 2025 06:30:59 -0600 Subject: [PATCH 3/8] Adjust the command as well --- workflow/build_opts.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/build_opts.yaml b/workflow/build_opts.yaml index 18911bbefb..a0cb1dd575 100644 --- a/workflow/build_opts.yaml +++ b/workflow/build_opts.yaml @@ -88,7 +88,7 @@ build: walltime: "00:10:00" gdas: - command: "./build_gdas.sh -j 12" + command: "./build_gdas.sh -j 24" log: "build_gdas.log" cores: 24 - walltime: "01:30:00" + walltime: "00:05:00" From ddac3cb201944484226ee94d70afd9c4658d4fec Mon Sep 17 00:00:00 2001 From: Rahul Mahajan Date: Wed, 5 Feb 2025 08:08:12 -0500 Subject: [PATCH 4/8] Update workflow/build_opts.yaml --- workflow/build_opts.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/build_opts.yaml b/workflow/build_opts.yaml index a0cb1dd575..f2074c7cb0 100644 --- a/workflow/build_opts.yaml +++ b/workflow/build_opts.yaml @@ -91,4 +91,4 @@ build: command: "./build_gdas.sh -j 24" log: "build_gdas.log" cores: 24 - walltime: "00:05:00" + walltime: "01:30:00" From c8467befec0f538c0e730d425f858ff5b4ffce93 Mon Sep 17 00:00:00 2001 From: David Huber Date: Wed, 5 Feb 2025 07:41:03 -0600 Subject: [PATCH 5/8] Create a log file for Jenkins to parse --- sorc/build_compute.sh | 28 ++++++++++++++++++++++------ workflow/build_opts.yaml | 4 ++-- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/sorc/build_compute.sh b/sorc/build_compute.sh index 794b4fa350..7bd33add4b 100755 --- a/sorc/build_compute.sh +++ b/sorc/build_compute.sh @@ -87,7 +87,7 @@ finished=false ${runcmd} echo "Running builds on compute nodes" while [[ "${finished}" == "false" ]]; do - sleep 3m + sleep 10s ${runcmd} state="$("${HOMEgfs}/ci/scripts/utils/rocotostat.py" -w "${build_xml}" -d "${build_db}")" if [[ "${verbose_opt}" == "true" ]]; then @@ -100,13 +100,29 @@ while [[ "${finished}" == "false" ]]; do finished=true elif [[ "${state}" == "RUNNING" ]]; then finished=false - elif [[ "${state}" == "DEAD" ]]; then - echo "FATAL ERROR: ${BASH_SOURCE[0]} one or more builds failed!" - # TODO add capability to determine which build(s) failed - exit 2 else echo "FATAL ERROR: ${BASH_SOURCE[0]} rocoto failed with state '${state}'" - exit 3 + # Determine which builds failed + echo "$(rocotostat -w "${build_xml}" -d "${build_db}")" > rocotostat.out + line_number=0 + rm -f logs/error.logs + set -x + while read -r line; do + (( line_number += 1 )) + # Skip the first two lines (header) + if [[ ${line_number} -lt 3 ]]; then + continue + fi + + if [[ "${line}" =~ "DEAD" || "${line}" =~ "UNKNOWN" || + "${line}" =~ "UNAVAILABLE" || "${line}" =~ "FAIL" ]]; then + job=$(echo "${line}" | awk '{ print $2 }') + log_file="logs/build_${job}" + echo "${log_file}" >> logs/error.logs + echo "Rocoto reported that the build failed for ${job}" + fi + done < rocotostat.out + exit 2 fi done diff --git a/workflow/build_opts.yaml b/workflow/build_opts.yaml index f2074c7cb0..eb6018e0cc 100644 --- a/workflow/build_opts.yaml +++ b/workflow/build_opts.yaml @@ -90,5 +90,5 @@ build: gdas: command: "./build_gdas.sh -j 24" log: "build_gdas.log" - cores: 24 - walltime: "01:30:00" + cores: 1 + walltime: "00:00:10" From 68c52c66a6f4b23c6e5f363ac1836e5e9c3a5478 Mon Sep 17 00:00:00 2001 From: David Huber Date: Wed, 5 Feb 2025 07:44:27 -0600 Subject: [PATCH 6/8] Fix whitespace --- sorc/build_compute.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sorc/build_compute.sh b/sorc/build_compute.sh index 7bd33add4b..3c9b28bc89 100755 --- a/sorc/build_compute.sh +++ b/sorc/build_compute.sh @@ -104,22 +104,22 @@ while [[ "${finished}" == "false" ]]; do echo "FATAL ERROR: ${BASH_SOURCE[0]} rocoto failed with state '${state}'" # Determine which builds failed echo "$(rocotostat -w "${build_xml}" -d "${build_db}")" > rocotostat.out - line_number=0 - rm -f logs/error.logs + line_number=0 + rm -f logs/error.logs set -x - while read -r line; do + while read -r line; do (( line_number += 1 )) - # Skip the first two lines (header) + # Skip the first two lines (header) if [[ ${line_number} -lt 3 ]]; then continue - fi + fi if [[ "${line}" =~ "DEAD" || "${line}" =~ "UNKNOWN" || "${line}" =~ "UNAVAILABLE" || "${line}" =~ "FAIL" ]]; then job=$(echo "${line}" | awk '{ print $2 }') log_file="logs/build_${job}" - echo "${log_file}" >> logs/error.logs - echo "Rocoto reported that the build failed for ${job}" + echo "${log_file}" >> logs/error.logs + echo "Rocoto reported that the build failed for ${job}" fi done < rocotostat.out exit 2 From c149e1174936ced06c0571e6782b76ffa2eb6686 Mon Sep 17 00:00:00 2001 From: David Huber Date: Wed, 5 Feb 2025 08:09:22 -0600 Subject: [PATCH 7/8] Correct log names --- sorc/build_compute.sh | 7 ++++--- workflow/build_opts.yaml | 12 ------------ 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/sorc/build_compute.sh b/sorc/build_compute.sh index 3c9b28bc89..cf43efc1f2 100755 --- a/sorc/build_compute.sh +++ b/sorc/build_compute.sh @@ -103,10 +103,11 @@ while [[ "${finished}" == "false" ]]; do else echo "FATAL ERROR: ${BASH_SOURCE[0]} rocoto failed with state '${state}'" # Determine which builds failed - echo "$(rocotostat -w "${build_xml}" -d "${build_db}")" > rocotostat.out + set -x + stat_out="$(rocotostat -w "${build_xml}" -d "${build_db}")" + echo "${stat_out}" > rocotostat.out line_number=0 rm -f logs/error.logs - set -x while read -r line; do (( line_number += 1 )) # Skip the first two lines (header) @@ -117,7 +118,7 @@ while [[ "${finished}" == "false" ]]; do if [[ "${line}" =~ "DEAD" || "${line}" =~ "UNKNOWN" || "${line}" =~ "UNAVAILABLE" || "${line}" =~ "FAIL" ]]; then job=$(echo "${line}" | awk '{ print $2 }') - log_file="logs/build_${job}" + log_file="logs/${job}.log" echo "${log_file}" >> logs/error.logs echo "Rocoto reported that the build failed for ${job}" fi diff --git a/workflow/build_opts.yaml b/workflow/build_opts.yaml index 39db9c2d9c..2643ba932b 100644 --- a/workflow/build_opts.yaml +++ b/workflow/build_opts.yaml @@ -23,72 +23,60 @@ systems: build: gfs_model: command: "./build_ufs.sh -e gfs_model.x -j 12" - log: "build_ufs_gfs.log" cores: 12 walltime: "00:30:00" gfs_ww3prepost: command: "./build_ww3prepost.sh -j 4" - log: "build_ww3prepost_gfs.log" cores: 4 walltime: "00:10:00" gefs_model: command: "./build_ufs.sh -w -e gefs_model.x -j 12" - log: "build_ufs_gefs.log" cores: 12 walltime: "00:30:00" gefs_ww3_prepost: command: "./build_ww3prepost.sh -w -j 4" - log: "build_ww3prepost_gefs.log" cores: 4 walltime: "00:10:00" sfs_model: command: "./build_ufs.sh -y -e sfs_model.x -j 12" - log: "build_ufs_sfs.log" cores: 12 walltime: "00:30:00" upp: command: "./build_upp.sh -j 8" - log: "build_upp.log" cores: 8 walltime: "00:10:00" gsi_enkf: command: "./build_gsi_enkf.sh -j 8" - log: "build_gsi_enkf.log" cores: 8 walltime: "00:15:00" gsi_monitor: command: "./build_gsi_monitor.sh -j 4" - log: "build_gsi_monitor.log" cores: 4 walltime: "00:10:00" gsi_utils: command: "./build_gsi_utils.sh -j 6" - log: "build_gsi_utils.log" cores: 6 walltime: "00:10:00" ufs_utils: command: "./build_ufs_utils.sh -j 8" - log: "build_ufs_utils.log" cores: 8 walltime: "00:10:00" gfs_utils: command: "./build_gfs_utils.sh -j 6" - log: "build_gfs_utils.log" cores: 6 walltime: "00:10:00" gdas: command: "./build_gdas.sh -j 1" - log: "build_gdas.log" cores: 1 walltime: "00:00:10" From 2687e73b395da5f179ceae4adbe2af04f326629b Mon Sep 17 00:00:00 2001 From: David Huber Date: Wed, 5 Feb 2025 08:24:19 -0600 Subject: [PATCH 8/8] Remove debug statements --- sorc/build_compute.sh | 7 +++---- workflow/build_opts.yaml | 6 +++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/sorc/build_compute.sh b/sorc/build_compute.sh index cf43efc1f2..0a56c1ba13 100755 --- a/sorc/build_compute.sh +++ b/sorc/build_compute.sh @@ -87,7 +87,7 @@ finished=false ${runcmd} echo "Running builds on compute nodes" while [[ "${finished}" == "false" ]]; do - sleep 10s + sleep 3m ${runcmd} state="$("${HOMEgfs}/ci/scripts/utils/rocotostat.py" -w "${build_xml}" -d "${build_db}")" if [[ "${verbose_opt}" == "true" ]]; then @@ -102,12 +102,11 @@ while [[ "${finished}" == "false" ]]; do finished=false else echo "FATAL ERROR: ${BASH_SOURCE[0]} rocoto failed with state '${state}'" - # Determine which builds failed - set -x + rm -f logs/error.logs + # Determine which build(s) failed stat_out="$(rocotostat -w "${build_xml}" -d "${build_db}")" echo "${stat_out}" > rocotostat.out line_number=0 - rm -f logs/error.logs while read -r line; do (( line_number += 1 )) # Skip the first two lines (header) diff --git a/workflow/build_opts.yaml b/workflow/build_opts.yaml index 2643ba932b..c17432d4a2 100644 --- a/workflow/build_opts.yaml +++ b/workflow/build_opts.yaml @@ -77,6 +77,6 @@ build: walltime: "00:10:00" gdas: - command: "./build_gdas.sh -j 1" - cores: 1 - walltime: "00:00:10" + command: "./build_gdas.sh -j 24" + cores: 24 + walltime: "01:30:00"