diff --git a/.cicd/Jenkinsfile b/.cicd/Jenkinsfile
index 095eecb200..79174c4d20 100644
--- a/.cicd/Jenkinsfile
+++ b/.cicd/Jenkinsfile
@@ -2,19 +2,21 @@ pipeline {
     agent none

     options {
+        disableConcurrentBuilds()
+        overrideIndexTriggers(false)
         skipDefaultCheckout(true)
     }

     parameters {
         // Allow job runner to filter based on platform
-        // choice(name: 'SRW_PLATFORM_FILTER', choices: ['all', 'cheyenne', 'gaea', 'hera', 'jet', 'orion', 'pcluster_noaa_v2_use1', 'azcluster_noaa', 'gcluster_noaa_v2_usc1'], description: 'Specify the platform(s) to use')
-        choice(name: 'SRW_PLATFORM_FILTER', choices: ['all', 'cheyenne', 'gaea', 'hera', 'jet', 'orion'], description: 'Specify the platform(s) to use')
+        // choice(name: 'SRW_PLATFORM_FILTER', choices: ['all', 'cheyenne', 'gaea', 'hera', 'jet', 'orion', 'pclusternoaav2use1', 'azclusternoaav2eus1', 'gclusternoaav2usc1'], description: 'Specify the platform(s) to use')
+        choice(name: 'SRW_PLATFORM_FILTER', choices: ['all', 'cheyenne', 'gaea', 'hera', 'jet', 'orion', 'pclusternoaav2use1'], description: 'Specify the platform(s) to use')
         // Allow job runner to filter based on compiler
         choice(name: 'SRW_COMPILER_FILTER', choices: ['all', 'gnu', 'intel'], description: 'Specify the compiler(s) to use to build')
+        booleanParam name: 'SRW_WE2E_COMPREHENSIVE_TESTS', defaultValue: false, description: 'Whether to execute the comprehensive end-to-end tests'
     }

     stages {
-        /*
         // Start the NOAA Parallel Works clusters, if necessary
         stage('Start Parallel Works Clusters') {
             matrix {
@@ -29,7 +31,7 @@ pipeline {
                 axes {
                     axis {
                         name 'SRW_PLATFORM'
-                        values 'pcluster_noaa_v2_use1', 'azcluster_noaa', 'gcluster_noaa_v2_usc1'
+                        values 'pclusternoaav2use1' //, 'azclusternoaav2eus1', 'gclusternoaav2usc1'
                     }
                 }

@@ -44,7 +46,6 @@ pipeline {
                 }
             }
         }
-        */

         // Build and test the SRW application on all supported platforms using the supported compilers for each platform
         stage('Build and Test') {
@@ -68,8 +69,7 @@ pipeline {
                 axes {
                     axis {
                         name 'SRW_PLATFORM'
-                        // values 'cheyenne', 'gaea', 'hera', 'jet', 'orion', 'pcluster_noaa_v2_use1', 'azcluster_noaa', 'gcluster_noaa_v2_usc1'
-                        values 'cheyenne', 'gaea', 'hera', 'jet', 'orion'
+                        values 'cheyenne', 'gaea', 'hera', 'jet', 'orion', 'pclusternoaav2use1' //, 'azclusternoaav2eus1', 'gclusternoaav2usc1'
                     }

                     axis {
@@ -83,7 +83,7 @@ pipeline {
                     exclude {
                         axis {
                             name 'SRW_PLATFORM'
-                            values 'gaea', 'hera', 'jet', 'orion'
+                            values 'gaea', 'hera', 'jet', 'orion', 'pclusternoaav2use1' //, 'azclusternoaav2eus1', 'gclusternoaav2usc1'
                         }

                         axis {
@@ -93,10 +93,6 @@ pipeline {
                     }
                 }

-                agent {
-                    label env.SRW_PLATFORM
-                }
-
                 environment {
                     BRANCH_NAME_ESCAPED = env.BRANCH_NAME.replace('/', '_')
                     BUILD_VERSION = "${env.SRW_PLATFORM}-${env.SRW_COMPILER}-${env.BRANCH_NAME_ESCAPED}-${env.BUILD_NUMBER}"
@@ -106,6 +102,10 @@ pipeline {
                 stages {
                     // Clean the workspace, checkout the repository, and run checkout_externals
                     stage('Initialize') {
+                        agent {
+                            label env.SRW_PLATFORM
+                        }
+
                         steps {
                             echo "Initializing SRW (${env.SRW_COMPILER}) build environment on ${env.SRW_PLATFORM}"
                             cleanWs()
@@ -116,6 +116,10 @@ pipeline {

                     // Run the unified build script; if successful create a tarball of the build and upload to S3
                     stage('Build') {
+                        agent {
+                            label env.SRW_PLATFORM
+                        }
+
                         steps {
                             echo "Building SRW (${env.SRW_COMPILER}) on ${env.SRW_PLATFORM}"
                             sh 'bash --login "${WORKSPACE}/.cicd/scripts/srw_build.sh"'
@@ -124,16 +128,49 @@ pipeline {
                         post {
                             success {
                                 sh 'tar --create --gzip --verbose --file "${WORKSPACE}/${BUILD_NAME}.tgz" bin include lib share'
-                                s3Upload consoleLogLevel: 'INFO', dontSetBuildResultOnFailure: false, dontWaitForConcurrentBuildCompletion: false, entries: [[bucket: 'woc-epic-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: true, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.BUILD_NAME}.tgz", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false]], pluginFailureResultConstraint: 'FAILURE', profileName: 'main', userMetadata: []
+                                s3Upload consoleLogLevel: 'INFO', dontSetBuildResultOnFailure: false, dontWaitForConcurrentBuildCompletion: false, entries: [[bucket: 'woc-epic-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: true, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.BUILD_NAME}.tgz", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false], [bucket: 'woc-epic-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: true, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "build/srw_build-${env.SRW_PLATFORM}-${env.SRW_COMPILER}.log", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false]], pluginFailureResultConstraint: 'FAILURE', profileName: 'main', userMetadata: []
                             }
                         }
                     }

                     // Run the unified test script
                     stage('Test') {
+                        agent {
+                            label env.SRW_PLATFORM
+                        }
+
+                        environment {
+                            SRW_WE2E_EXPERIMENT_BASE_DIR = "${env.WORKSPACE}/experiments"
+                        }
+
                         steps {
                             echo "Testing SRW (${env.SRW_COMPILER}) on ${env.SRW_PLATFORM}"
-                            sh 'bash --login "${WORKSPACE}/.cicd/scripts/srw_test.sh"'
+
+                            // If executing for a Pull Request, check for the run_we2e_comprehensive_tests label. If set,
+                            // override the value of the SRW_WE2E_COMPREHENSIVE_TESTS parameter
+                            script {
+                                def run_we2e_comprehensive_tests = params.SRW_WE2E_COMPREHENSIVE_TESTS
+                                def run_we2e_comprehensive_tests_label = 'run_we2e_comprehensive_tests'
+
+                                if (env.CHANGE_ID) {
+                                    pullRequest.labels.each {
+                                        if (it == run_we2e_comprehensive_tests_label) {
+                                            run_we2e_comprehensive_tests = true
+                                        }
+                                    }
+                                }
+
+                                sh "SRW_WE2E_COMPREHENSIVE_TESTS=${run_we2e_comprehensive_tests} bash --login ${env.WORKSPACE}/.cicd/scripts/srw_test.sh"
+                            }
+                        }
+
+                        post {
+                            always {
+                                // Archive the test log files and remove the experiments directory to conserve disk space
+                                sh 'cd "${SRW_WE2E_EXPERIMENT_BASE_DIR}" && tar --create --gzip --verbose --file "${WORKSPACE}/we2e_test_logs-${SRW_PLATFORM}-${SRW_COMPILER}.tgz" */log.generate_FV3LAM_wflow */log.launch_FV3LAM_wflow */log/*'
+                                sh 'rm -rf "${SRW_WE2E_EXPERIMENT_BASE_DIR}"'
+                                s3Upload consoleLogLevel: 'INFO', dontSetBuildResultOnFailure: false, dontWaitForConcurrentBuildCompletion: false, entries: [[bucket: 'woc-epic-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: 'we2e_test_results-*-*.txt', storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false], [bucket: 'woc-epic-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: 'we2e_test_logs-*-*.tgz', storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false]], pluginFailureResultConstraint: 'FAILURE', profileName: 'main', userMetadata: []
+                            }
                         }
                     }
                 }
@@ -141,12 +178,12 @@
             }
         }

-    /*
     post {
         always {
             // Stop any Parallel Works clusters that were started during the pipeline execution
             script {
-                def pw_clusters = ['pcluster_noaa_v2_use1', 'azcluster_noaa', 'gcluster_noaa_v2_usc1']
+                // def pw_clusters = ['pclusternoaav2use1', 'azclusternoaav2eus1', 'gclusternoaav2usc1']
+                def pw_clusters = ['pclusternoaav2use1']
                 def clusters = []

                 // Determine which clusters need to be stopped, if any
@@ -166,5 +203,4 @@ pipeline {
             }
         }
     }
-    */
 }
diff --git a/.cicd/scripts/srw_build.sh b/.cicd/scripts/srw_build.sh
index 638d538fa6..fed4170e3a 100755
--- a/.cicd/scripts/srw_build.sh
+++ b/.cicd/scripts/srw_build.sh
@@ -16,16 +16,25 @@ else
   workspace="$(cd -- "${script_dir}/../.." && pwd)"
 fi

+# Normalize Parallel Works cluster platform value.
+declare platform
+if [[ "${SRW_PLATFORM}" =~ ^(az|g|p)clusternoaa ]]; then
+  platform='noaacloud'
+else
+  platform="${SRW_PLATFORM}"
+fi
+
 build_dir="${workspace}/build"

 # Set build related environment variables and load required modules.
-source "${workspace}/etc/lmod-setup.sh" "${SRW_PLATFORM}"
+source "${workspace}/etc/lmod-setup.sh" "${platform}"
 module use "${workspace}/modulefiles"
-module load "build_${SRW_PLATFORM}_${SRW_COMPILER}"
+module load "build_${platform}_${SRW_COMPILER}"

 # Compile SRW application and install to repository root.
 mkdir "${build_dir}"
 pushd "${build_dir}"
-  cmake -DCMAKE_INSTALL_PREFIX="${workspace}" -DENABLE_RRFS=on "${workspace}"
-  make -j "${MAKE_JOBS}"
+  build_log_file="${build_dir}/srw_build-${platform}-${SRW_COMPILER}.log"
+  cmake -DCMAKE_INSTALL_PREFIX="${workspace}" "${workspace}" | tee "${build_log_file}"
+  make -j "${MAKE_JOBS}" | tee --append "${build_log_file}"
 popd
diff --git a/.cicd/scripts/srw_test.sh b/.cicd/scripts/srw_test.sh
index 43b9935888..0582815e74 100755
--- a/.cicd/scripts/srw_test.sh
+++ b/.cicd/scripts/srw_test.sh
@@ -17,5 +17,227 @@ else
   workspace="$(cd -- "${script_dir}/../.." && pwd)"
 fi

+# Normalize Parallel Works cluster platform value.
+declare platform
+if [[ "${SRW_PLATFORM}" =~ ^(az|g|p)clusternoaa ]]; then
+  platform='noaacloud'
+else
+  platform="${SRW_PLATFORM}"
+fi
+
+declare we2e_experiment_base_dir
+if [[ -n "${SRW_WE2E_EXPERIMENT_BASE_DIR}" ]]; then
+  we2e_experiment_base_dir="${SRW_WE2E_EXPERIMENT_BASE_DIR}"
+else
+  we2e_experiment_base_dir="${workspace}/experiments"
+fi
+
+we2e_test_dir="${workspace}/tests/WE2E"
+
+we2e_test_file="${we2e_test_dir}/experiments.txt"
+
+# The fundamental set of end-to-end tests to run.
+declare -a we2e_fundamental_tests
+we2e_fundamental_tests=('grid_RRFS_CONUS_25km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v16'
+                        'grid_RRFS_CONUS_25km_ics_FV3GFS_lbcs_FV3GFS_suite_RRFS_v1beta'
+                        'grid_RRFS_CONUS_25km_ics_FV3GFS_lbcs_RAP_suite_HRRR'
+                        'grid_RRFS_CONUS_25km_ics_NAM_lbcs_NAM_suite_HRRR'
+                        'grid_RRFS_CONUS_25km_ics_NAM_lbcs_NAM_suite_RRFS_v1beta'
+                        'grid_RRFS_CONUScompact_25km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v16'
+                        'grid_RRFS_CONUScompact_25km_ics_HRRR_lbcs_HRRR_suite_HRRR'
+                        'grid_RRFS_CONUScompact_25km_ics_HRRR_lbcs_RAP_suite_HRRR'
+                        'grid_RRFS_CONUScompact_25km_ics_HRRR_lbcs_RAP_suite_RRFS_v1beta'
+                        'grid_SUBCONUS_Ind_3km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v16'
+                        'grid_SUBCONUS_Ind_3km_ics_HRRR_lbcs_RAP_suite_HRRR'
+                        'grid_SUBCONUS_Ind_3km_ics_HRRR_lbcs_RAP_suite_RRFS_v1beta'
+                        'nco_grid_RRFS_CONUScompact_25km_ics_HRRR_lbcs_RAP_suite_HRRR'
+                        'community_ensemble_2mems'
+                        'custom_ESGgrid'
+                        'deactivate_tasks'
+                        'inline_post'
+                        'nco_ensemble'
+                        'specify_DOT_OR_USCORE'
+                        'specify_DT_ATMOS_LAYOUT_XY_BLOCKSIZE'
+                        'specify_RESTART_INTERVAL'
+                        'specify_template_filenames')
+
+if [[ "${platform}" != 'gaea' && "${platform}" != 'noaacloud' ]]; then
+  we2e_fundamental_tests+=('MET_ensemble_verification'
+                           'MET_verification'
+                           'pregen_grid_orog_sfc_climo')
+fi
+
+# The comprehensive set of end-to-end tests to run.
+declare -a we2e_comprehensive_tests
+we2e_comprehensive_tests=('community_ensemble_008mems'
+                          'custom_GFDLgrid'
+                          'custom_GFDLgrid__GFDLgrid_USE_NUM_CELLS_IN_FILENAMES_eq_FALSE'
+                          'custom_GFDLgrid__GFDLgrid_USE_NUM_CELLS_IN_FILENAMES_eq_TRUE'
+                          'get_from_HPSS_ics_FV3GFS_lbcs_FV3GFS_fmt_grib2_2019061200'
+                          'get_from_HPSS_ics_FV3GFS_lbcs_FV3GFS_fmt_grib2_2019101818'
+                          'get_from_HPSS_ics_FV3GFS_lbcs_FV3GFS_fmt_grib2_2020022518'
+                          'get_from_HPSS_ics_FV3GFS_lbcs_FV3GFS_fmt_grib2_2020022600'
+                          'get_from_HPSS_ics_FV3GFS_lbcs_FV3GFS_fmt_grib2_2021010100'
+                          'get_from_HPSS_ics_FV3GFS_lbcs_FV3GFS_fmt_nemsio'
+                          'get_from_HPSS_ics_FV3GFS_lbcs_FV3GFS_fmt_nemsio_2019061200'
+                          'get_from_HPSS_ics_FV3GFS_lbcs_FV3GFS_fmt_nemsio_2019101818'
+                          'get_from_HPSS_ics_FV3GFS_lbcs_FV3GFS_fmt_nemsio_2020022518'
+                          'get_from_HPSS_ics_FV3GFS_lbcs_FV3GFS_fmt_nemsio_2020022600'
+                          'get_from_HPSS_ics_FV3GFS_lbcs_FV3GFS_fmt_nemsio_2021010100'
+                          'get_from_HPSS_ics_FV3GFS_lbcs_FV3GFS_fmt_netcdf_2021062000'
+                          'get_from_HPSS_ics_GSMGFS_lbcs_GSMGFS'
+                          'get_from_HPSS_ics_HRRR_lbcs_RAP'
+                          'get_from_HPSS_ics_RAP_lbcs_RAP'
+                          'get_from_NOMADS_ics_FV3GFS_lbcs_FV3GFS_fmt_nemsio'
+                          'grid_CONUS_25km_GFDLgrid_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v16'
+                          'grid_CONUS_3km_GFDLgrid_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v16'
+                          'grid_RRFS_AK_13km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v16'
+                          'grid_RRFS_AK_3km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v16'
+                          'grid_RRFS_CONUS_13km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v15p2'
+                          'grid_RRFS_CONUS_13km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v16'
+                          'grid_RRFS_CONUS_13km_ics_FV3GFS_lbcs_FV3GFS_suite_HRRR'
+                          'grid_RRFS_CONUS_13km_ics_FV3GFS_lbcs_FV3GFS_suite_RRFS_v1beta'
+                          'grid_RRFS_CONUS_25km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_2017_gfdlmp'
+                          'grid_RRFS_CONUS_25km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_2017_gfdlmp_regional'
+                          'grid_RRFS_CONUS_25km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v15p2'
+                          'grid_RRFS_CONUS_25km_ics_FV3GFS_lbcs_FV3GFS_suite_HRRR'
+                          'grid_RRFS_CONUS_25km_ics_GSMGFS_lbcs_GSMGFS_suite_GFS_2017_gfdlmp'
+                          'grid_RRFS_CONUS_25km_ics_GSMGFS_lbcs_GSMGFS_suite_GFS_v15p2'
+                          'grid_RRFS_CONUS_25km_ics_GSMGFS_lbcs_GSMGFS_suite_GFS_v16'
+                          'grid_RRFS_CONUS_3km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v15_thompson_mynn_lam3km'
+                          'grid_RRFS_CONUS_3km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v15p2'
+                          'grid_RRFS_CONUS_3km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v16'
+                          'grid_RRFS_CONUS_3km_ics_FV3GFS_lbcs_FV3GFS_suite_HRRR'
+                          'grid_RRFS_CONUS_3km_ics_FV3GFS_lbcs_FV3GFS_suite_RRFS_v1beta'
+                          'grid_RRFS_CONUScompact_13km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v16'
+                          'grid_RRFS_CONUScompact_13km_ics_HRRR_lbcs_RAP_suite_HRRR'
+                          'grid_RRFS_CONUScompact_13km_ics_HRRR_lbcs_RAP_suite_RRFS_v1beta'
+                          'grid_RRFS_CONUScompact_25km_ics_HRRR_lbcs_HRRR_suite_RRFS_v1beta'
+                          'grid_RRFS_CONUScompact_3km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v16'
+                          'grid_RRFS_CONUScompact_3km_ics_HRRR_lbcs_RAP_suite_GFS_v15p2'
+                          'grid_RRFS_CONUScompact_3km_ics_HRRR_lbcs_RAP_suite_HRRR'
+                          'grid_RRFS_CONUScompact_3km_ics_HRRR_lbcs_RAP_suite_RRFS_v1beta'
+                          'grid_RRFS_NA_13km_ics_FV3GFS_lbcs_FV3GFS_suite_RRFS_v1beta'
+                          'grid_RRFS_NA_3km_ics_FV3GFS_lbcs_FV3GFS_suite_RRFS_v1beta'
+                          'grid_RRFS_SUBCONUS_3km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v16'
+                          'grid_RRFS_SUBCONUS_3km_ics_HRRR_lbcs_RAP_suite_GFS_v15p2'
+                          'nco_grid_RRFS_CONUS_13km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v16'
+                          'nco_grid_RRFS_CONUS_3km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v15_thompson_mynn_lam3km')
+
+declare -a we2e_tests
+we2e_tests=("${we2e_fundamental_tests[@]}")
+if "${SRW_WE2E_COMPREHENSIVE_TESTS}"; then
+  we2e_tests+=("${we2e_comprehensive_tests[@]}")
+
+  # Add additional tests for Hera.
+  if [[ "${platform}" == 'hera' ]]; then
+    we2e_tests+=('specify_EXTRN_MDL_SYSBASEDIR_ICS_LBCS')
+  fi
+fi
+
+# Parses the test log for the status of a specific test.
+function workflow_status() {
+  local test="$1"
+
+  local test_dir="${we2e_experiment_base_dir}/${test}"
+  local log_file="${test_dir}/log.launch_FV3LAM_wflow"
+
+  if [[ -f "${log_file}" ]]; then
+    local status
+    status="$(awk 'BEGIN {FS=":";} $1 ~ "^[[:space:]]+Workflow status" {print $2}' "${log_file}" |\
+      tail -1 |\
+      sed --regexp-extended --expression 's/^[[:space:]]*(.*)[[:space:]]*$/\1/')"
+    if [[ "${status}" == 'IN PROGRESS' || "${status}" == 'SUCCESS' || "${status}" == 'FAILURE' ]]; then
+      echo "${status}"
+    else
+      echo 'UNKNOWN'
+    fi
+  else
+    echo 'NOT FOUND'
+  fi
+}
+
+# Gets the status of all tests. Prints the number of tests that are running.
+# Returns a non-zero code when all tests reach a final state.
+function check_progress() {
+  local in_progress=false
+  local remaining=0
+
+  for test in "${we2e_tests[@]}"; do
+    local status
+    status="$(workflow_status "${test}")"
+    if [[ "${status}" == 'IN PROGRESS' ]]; then
+      in_progress=true
+      (( remaining++ ))
+    fi
+  done
+
+  if "${in_progress}"; then
+    echo "Tests remaining: ${remaining}"
+  else
+    return 1
+  fi
+}
+
+# Prints the status of all tests.
+function get_results() {
+  for test in "${we2e_tests[@]}"; do
+    local status
+    status="$(workflow_status "${test}")"
+    echo "${test} ${status}"
+  done
+}
+
 # Verify that there is a non-zero sized weather model executable.
 [[ -s "${workspace}/bin/ufs_model" ]] || [[ -s "${workspace}/bin/NEMS.exe" ]]
+
+# Set test related environment variables and load required modules.
+source "${workspace}/etc/lmod-setup.sh" "${platform}"
+module use "${workspace}/modulefiles"
+module load "build_${platform}_${SRW_COMPILER}"
+module load "wflow_${platform}"
+
+if [[ "${platform}" == 'cheyenne' ]]; then
+  export PATH="/glade/p/ral/jntp/UFS_CAM/ncar_pylib_20200427/bin:${PATH}"
+else
+  if [[ "${platform}" == 'noaacloud' && -z "${PROJ_LIB-}" ]]; then
+    PROJ_LIB=''
+  fi
+
+  conda activate regional_workflow
+fi
+
+# Create the experiments/tests base directory.
+mkdir "${we2e_experiment_base_dir}"
+
+# Generate the experiments/tests file.
+for test in "${we2e_tests[@]}"; do
+  echo "${test}" >> "${we2e_test_file}"
+done
+
+# Run the end-to-end tests.
+"${we2e_test_dir}/run_WE2E_tests.sh" \
+  tests_file="${we2e_test_file}" \
+  machine="${platform}" \
+  account="${SRW_PROJECT}" \
+  expt_basedir="${we2e_experiment_base_dir}" \
+  compiler="${SRW_COMPILER}"
+
+# Allow the tests to start before checking for status.
+# TODO: Create a parameter that sets the initial start delay.
+sleep 180
+
+# Wait for all tests to complete.
+while check_progress; do
+  # TODO: Create a parameter that sets the poll frequency.
+  sleep 60
+done
+
+# Get test results and write to a file.
+results="$(get_results |\
+  tee "${workspace}/we2e_test_results-${platform}-${SRW_COMPILER}.txt")"
+
+# Check that the number of tests equals the number of successes, otherwise
+# exit with a non-zero code that equals the difference.
+successes="$(awk '$2 == "SUCCESS" {print $1}' <<< "${results}" | wc -l)"
+exit "$(( ${#we2e_tests[@]} - ${successes} ))"