From 586f658d71ae4a2a8971bd67b6659547badc87fa Mon Sep 17 00:00:00 2001 From: Cory Martin Date: Fri, 21 Feb 2025 19:21:03 +0000 Subject: [PATCH] Allow for C1152 ATM-Aero cycled DA to run on WCOSS2 (#3309) This PR enables the ability to cycle C1152 atm+aerosol on WCOSS2 to support GFSv17. Wallclock times were extended because 1) model execution is now slower with GOCART (this should probably be configurable in the config.resources depending on APP but that is outside the scope of my PR) 2) the aerosol DA variational solver needs more time at this resolution and 3) it takes FOREVER to copy C1152 restart files with the GOCART aerosol species in them. Also, the prep job runs out of memory so this spreads it out over more nodes. There are also misc. things that were added to get things to run on WCOSS2 that may need to be altered/removed following the most recent RFC. Finally, GDAS hash is updated. Resolves #3275 --------- Co-authored-by: CatherineThomas-NOAA <59020064+CatherineThomas-NOAA@users.noreply.github.com> Co-authored-by: Rahul Mahajan --- parm/config/gfs/config.aeroanlgenb | 9 ++--- parm/config/gfs/config.resources | 51 +++++++++++++++++++++---- parm/config/gfs/config.resources.WCOSS2 | 1 + ush/load_ufsda_modules.sh | 1 - ush/module-setup.sh | 2 +- 5 files changed, 50 insertions(+), 14 deletions(-) diff --git a/parm/config/gfs/config.aeroanlgenb b/parm/config/gfs/config.aeroanlgenb index d1f0ed10bd..075b4be90b 100644 --- a/parm/config/gfs/config.aeroanlgenb +++ b/parm/config/gfs/config.aeroanlgenb @@ -12,17 +12,16 @@ export JEDI_CONFIG_YAML="${PARMgfs}/gdas/aero_bmat_jedi_config.yaml.j2" export JCB_BASE_YAML="${PARMgfs}/gdas/aero/jcb-base.yaml.j2" export AERO_BMATRIX_STAGE_TMPL="${PARMgfs}/gdas/aero_stage_bmatrix_bkg.yaml.j2" export AERO_BMATRIX_FINALIZE_TMPL="${PARMgfs}/gdas/aero_finalize_bmatrix_bkg.yaml.j2" -export aero_diffusion_iter=10 -export aero_diffusion_horiz_len=2500e3 -export aero_diffusion_fixed_val=1.0 +export aero_diffusion_iter=200 +export 
aero_diffusion_horiz_len=300e3 +export aero_diffusion_fixed_val=20.0 export npx_clim_b=97 export npy_clim_b=97 export aero_diagb_weight=0.9 export aero_staticb_rescaling_factor=2.0 -export aero_diagb_rescale=20.0 export aero_diagb_n_halo=4 export aero_diagb_n_neighbors=16 -export aero_diagb_smooth_horiz_iter=0 +export aero_diagb_smooth_horiz_iter=200 export aero_diagb_smooth_vert_iter=0 echo "END: config.aeroanlgenb" diff --git a/parm/config/gfs/config.resources b/parm/config/gfs/config.resources index 4cc77d7339..95d6ad117d 100644 --- a/parm/config/gfs/config.resources +++ b/parm/config/gfs/config.resources @@ -399,19 +399,23 @@ case ${step} in "C1152" | "C768") layout_x=8 layout_y=8 + walltime="00:30:00" ;; "C384") layout_x=6 layout_y=6 + walltime="00:20:00" ;; "C192" | "C96") layout_x=4 layout_y=4 + walltime="00:10:00" ;; "C48" ) # this case is for testing only layout_x=1 layout_y=1 + walltime="00:10:00" ;; *) echo "FATAL ERROR: Resources not defined for job ${step} at resolution ${CASE}" @@ -420,7 +424,6 @@ case ${step} in export layout_x export layout_y - walltime="00:10:00" ntasks=1 threads_per_task=1 tasks_per_node=$(( max_tasks_per_node / threads_per_task )) @@ -428,23 +431,34 @@ case ${step} in ;; "aeroanlvar") + + threads_per_task=1 + case ${CASE} in "C1152" | "C768") layout_x=8 layout_y=8 + walltime="00:45:00" + tasks_per_node=24 ;; "C384") layout_x=6 layout_y=6 + walltime="00:30:00" + tasks_per_node=24 ;; "C192" | "C96") layout_x=4 layout_y=4 + walltime="00:20:00" + tasks_per_node=$(( max_tasks_per_node / threads_per_task )) ;; "C48" ) # this case is for testing only layout_x=1 layout_y=1 + walltime="00:20:00" + tasks_per_node=$(( max_tasks_per_node / threads_per_task )) ;; *) echo "FATAL ERROR: Resources not defined for job ${step} at resolution ${CASE}" @@ -454,10 +468,7 @@ case ${step} in export layout_x export layout_y - walltime="00:30:00" ntasks=$(( layout_x * layout_y * 6 )) - threads_per_task=1 - tasks_per_node=$(( max_tasks_per_node / 
threads_per_task )) export is_exclusive=True ;; @@ -498,11 +509,36 @@ case ${step} in "aeroanlfinal") - walltime="00:10:00" + case ${CASE} in + "C1152" | "C768") + layout_x=8 + layout_y=8 + walltime="00:30:00" + ;; + "C384") + layout_x=6 + layout_y=6 + walltime="00:20:00" + ;; + "C192" | "C96") + layout_x=4 + layout_y=4 + walltime="00:10:00" + ;; + "C48" ) + # this case is for testing only + layout_x=1 + layout_y=1 + walltime="00:10:00" + ;; + *) + echo "FATAL ERROR: Resources not defined for job ${step} at resolution ${CASE}" + exit 4 + esac ntasks=1 threads_per_task=1 tasks_per_node=$(( max_tasks_per_node / threads_per_task )) - memory="3072M" + memory="13072M" ;; "marineanlinit") @@ -891,6 +927,7 @@ case ${step} in declare -x "threads_per_task"="${UFS_THREADS}" tasks_per_node=$(( max_tasks_per_node / threads_per_task )) + # TODO: make walltimes APP dependent case "${CASE}" in "C48" | "C96" | "C192") declare -x "walltime_gdas"="00:20:00" @@ -906,7 +943,7 @@ case ${step} in ;; "C768" | "C1152") # Not valid resolutions for ensembles - declare -x "walltime_gdas"="00:50:00" + declare -x "walltime_gdas"="01:20:00" declare -x "walltime_gfs"="06:00:00" ;; *) diff --git a/parm/config/gfs/config.resources.WCOSS2 b/parm/config/gfs/config.resources.WCOSS2 index be55214cac..5ca762b0a7 100644 --- a/parm/config/gfs/config.resources.WCOSS2 +++ b/parm/config/gfs/config.resources.WCOSS2 @@ -5,6 +5,7 @@ case ${step} in "prep") export is_exclusive=True + export tasks_per_node=5 export memory="480GB" ;; diff --git a/ush/load_ufsda_modules.sh b/ush/load_ufsda_modules.sh index 3af843f8fe..ab75af0253 100755 --- a/ush/load_ufsda_modules.sh +++ b/ush/load_ufsda_modules.sh @@ -41,7 +41,6 @@ case "${MACHINE_ID}" in # TODO: Add path to GDASApp libraries and cray-mpich as temporary patches # TODO: Remove LD_LIBRARY_PATH lines as soon as permanent solutions are available export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${HOMEgfs}/sorc/gdas.cd/build/lib" - export 
LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/cray/pe/mpich/8.1.19/ofi/intel/19.0/lib" fi module load "${MODS}/${MACHINE_ID}" ncdump=$( command -v ncdump ) diff --git a/ush/module-setup.sh b/ush/module-setup.sh index 2429963d70..158efd2569 100755 --- a/ush/module-setup.sh +++ b/ush/module-setup.sh @@ -52,7 +52,7 @@ elif [[ ${MACHINE_ID} = s4* ]] ; then elif [[ ${MACHINE_ID} = wcoss2 ]]; then # We are on WCOSS2 # Ignore default modules of the same version lower in the search path (req'd by spack-stack) - #export LMOD_TMOD_FIND_FIRST=yes #TODO: Uncomment this when using spack-stack + #export LMOD_TMOD_FIND_FIRST=yes #TODO: Uncomment this when using spack-stack for the entire workflow module reset elif [[ ${MACHINE_ID} = cheyenne* ]] ; then