From cae6b7311a71150fd198a25bcc8c8387ff8602e8 Mon Sep 17 00:00:00 2001 From: Joseph H Kennedy Date: Tue, 21 May 2019 13:20:36 -0400 Subject: [PATCH] Set default NTEST_PARALLEL_JOBS=MAX_MPITASKS_PER_NODE and limit E3SM machines This resets the default value of NTEST_PARALLEL_JOBS to MAX_MPITASKS_PER_NODE so as to not make any behavioral changes to CESM. Warning: This is not a safe value on machines with batch systems whose login nodes are more limited than the compute nodes, and therefore NTEST_PARALLEL_JOBS should be set explicitly on these systems. @jgfouca found via E3SM testing that limiting to 4 parallel jobs was required for many of the testing machines with batch systems to prevent hammering login nodes. Therefore, we set that value for these E3SM machines: * cori-haswell * cori-knl * blues * anvil * bebop * theta * titan * summit Warning: Non-test machines for E3SM that have a batch system may still oversubscribe parallel test jobs. --- config/e3sm/machines/config_machines.xml | 8 ++++++++ scripts/lib/CIME/test_scheduler.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/config/e3sm/machines/config_machines.xml b/config/e3sm/machines/config_machines.xml index f5a08077164..ba9f6f95cbb 100644 --- a/config/e3sm/machines/config_machines.xml +++ b/config/e3sm/machines/config_machines.xml @@ -218,6 +218,7 @@ /project/projectdirs/acme/tools/cprnc.cori/cprnc 8 e3sm_developer + 4 nersc_slurm e3sm 32 @@ -356,6 +357,7 @@ /project/projectdirs/acme/tools/cprnc.cori/cprnc 8 e3sm_developer + 4 nersc_slurm e3sm 128 @@ -1038,6 +1040,7 @@ /home/ccsm-data/tools/cprnc 4 e3sm_integration + 4 pbs acme 16 @@ -1141,6 +1144,7 @@ /lcrc/group/acme/tools/cprnc/cprnc 8 e3sm_integration + 4 slurm E3SM 36 @@ -1255,6 +1259,7 @@ /lcrc/group/acme/tools/cprnc/cprnc 8 e3sm_integration + 4 slurm E3SM 36 @@ -1570,6 +1575,7 @@ /projects/ccsm/acme/tools/cprnc/cprnc 8 e3sm_developer + 4 cobalt_theta E3SM 128 @@ -2177,6 +2183,7 @@ 
/lustre/atlas1/cli900/world-shared/cesm/tools/cprnc/cprnc.titan 8 e3sm_developer + 4 pbs TRUE E3SM @@ -3065,6 +3072,7 @@ /gpfs/alpine/cli115/world-shared/e3sm/tools/cprnc.summit/cprnc 32 e3sm_developer + 4 lsf e3sm 84 diff --git a/scripts/lib/CIME/test_scheduler.py b/scripts/lib/CIME/test_scheduler.py index e2f8bf1a830..bd22f97f27c 100644 --- a/scripts/lib/CIME/test_scheduler.py +++ b/scripts/lib/CIME/test_scheduler.py @@ -194,7 +194,7 @@ def __init__(self, test_names, test_data=None, if parallel_jobs is None: mach_parallel_jobs = self._machobj.get_value("NTEST_PARALLEL_JOBS") if mach_parallel_jobs is None: - mach_parallel_jobs = 3 + mach_parallel_jobs = self._machobj.get_value("MAX_MPITASKS_PER_NODE") self._parallel_jobs = min(len(test_names), mach_parallel_jobs) else: self._parallel_jobs = parallel_jobs