From c23ea1565fbd985c7dfeddc20054e1b1bb940f45 Mon Sep 17 00:00:00 2001 From: Joseph H Kennedy Date: Mon, 20 May 2019 15:54:47 -0400 Subject: [PATCH 1/2] Provide sane default for the number of create_test parallel jobs E3SM's `e3sm_developer` test suite will launch a large number of parallel builds on the login node unless create_test is explicitly passed the number of parallel jobs (-j/--parallel-jobs) it should use (see E3SM-Project/E3SM#2923). This is because the current default is set by the MAX_MPITASKS_PER_NODE machine/env config variable, which for Cori-knl is 64. This commit: * sets the default number of parallel jobs to 3 * adds a possible machine config (xml or env) variable, NTEST_PARALLEL_JOBS, which can be set to override the default number on a per-machine basis The parallel jobs setting priority is now (highest to lowest): 1. -j/--parallel-jobs command line argument 2. NTEST_PARALLEL_JOBS config_machines.xml or environment variable 3. the default value --- config/xml_schemas/config_machines.xsd | 3 +++ scripts/lib/CIME/test_scheduler.py | 6 ++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/config/xml_schemas/config_machines.xsd b/config/xml_schemas/config_machines.xsd index f59f2a5f264..e8af235035c 100644 --- a/config/xml_schemas/config_machines.xsd +++ b/config/xml_schemas/config_machines.xsd @@ -46,6 +46,7 @@ + @@ -140,6 +141,8 @@ + + diff --git a/scripts/lib/CIME/test_scheduler.py b/scripts/lib/CIME/test_scheduler.py index ab1abc3cd5b..e2f8bf1a830 100644 --- a/scripts/lib/CIME/test_scheduler.py +++ b/scripts/lib/CIME/test_scheduler.py @@ -192,8 +192,10 @@ def __init__(self, test_names, test_data=None, self._walltime = walltime if parallel_jobs is None: - self._parallel_jobs = min(len(test_names), - self._machobj.get_value("MAX_MPITASKS_PER_NODE")) + mach_parallel_jobs = self._machobj.get_value("NTEST_PARALLEL_JOBS") + if mach_parallel_jobs is None: + mach_parallel_jobs = 3 + self._parallel_jobs = min(len(test_names), 
mach_parallel_jobs) else: self._parallel_jobs = parallel_jobs From cae6b7311a71150fd198a25bcc8c8387ff8602e8 Mon Sep 17 00:00:00 2001 From: Joseph H Kennedy Date: Tue, 21 May 2019 13:20:36 -0400 Subject: [PATCH 2/2] Set default NTEST_PARALLEL_JOBS=MAX_MPITASKS_PER_NODE and limit E3SM machines This resets the default value of NTEST_PARALLEL_JOBS to MAX_MPITASKS_PER_NODE so as to not make any behavioral changes to CESM. Warning: This is not a safe value on machines with batch systems whose login nodes are more limited than the compute nodes and therefore NTEST_PARALLEL_JOBS should be set on these systems. @jgfouca found via E3SM testing that limiting to 4 parallel jobs was required for many of the testing machines with batch systems to prevent hammering login nodes. Therefore, we set that value for these E3SM machines: * cori-haswell * cori-knl * blues * anvil * bebop * theta * titan * summit Warning: Non-test machines for E3SM that have a batch system may still oversubscribe parallel test jobs. 
--- config/e3sm/machines/config_machines.xml | 8 ++++++++ scripts/lib/CIME/test_scheduler.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/config/e3sm/machines/config_machines.xml b/config/e3sm/machines/config_machines.xml index f5a08077164..ba9f6f95cbb 100644 --- a/config/e3sm/machines/config_machines.xml +++ b/config/e3sm/machines/config_machines.xml @@ -218,6 +218,7 @@ /project/projectdirs/acme/tools/cprnc.cori/cprnc 8 e3sm_developer + 4 nersc_slurm e3sm 32 @@ -356,6 +357,7 @@ /project/projectdirs/acme/tools/cprnc.cori/cprnc 8 e3sm_developer + 4 nersc_slurm e3sm 128 @@ -1038,6 +1040,7 @@ /home/ccsm-data/tools/cprnc 4 e3sm_integration + 4 pbs acme 16 @@ -1141,6 +1144,7 @@ /lcrc/group/acme/tools/cprnc/cprnc 8 e3sm_integration + 4 slurm E3SM 36 @@ -1255,6 +1259,7 @@ /lcrc/group/acme/tools/cprnc/cprnc 8 e3sm_integration + 4 slurm E3SM 36 @@ -1570,6 +1575,7 @@ /projects/ccsm/acme/tools/cprnc/cprnc 8 e3sm_developer + 4 cobalt_theta E3SM 128 @@ -2177,6 +2183,7 @@ /lustre/atlas1/cli900/world-shared/cesm/tools/cprnc/cprnc.titan 8 e3sm_developer + 4 pbs TRUE E3SM @@ -3065,6 +3072,7 @@ /gpfs/alpine/cli115/world-shared/e3sm/tools/cprnc.summit/cprnc 32 e3sm_developer + 4 lsf e3sm 84 diff --git a/scripts/lib/CIME/test_scheduler.py b/scripts/lib/CIME/test_scheduler.py index e2f8bf1a830..bd22f97f27c 100644 --- a/scripts/lib/CIME/test_scheduler.py +++ b/scripts/lib/CIME/test_scheduler.py @@ -194,7 +194,7 @@ def __init__(self, test_names, test_data=None, if parallel_jobs is None: mach_parallel_jobs = self._machobj.get_value("NTEST_PARALLEL_JOBS") if mach_parallel_jobs is None: - mach_parallel_jobs = 3 + mach_parallel_jobs = self._machobj.get_value("MAX_MPITASKS_PER_NODE") self._parallel_jobs = min(len(test_names), mach_parallel_jobs) else: self._parallel_jobs = parallel_jobs