From 00c106acd85b6c8daf1784c74bb33a3b4dd8227f Mon Sep 17 00:00:00 2001 From: Michael Deakin Date: Wed, 12 Jul 2017 17:17:10 -0600 Subject: [PATCH 1/5] Initial implementation of the NCR test with SystemTestsCompareTwo --- scripts/lib/CIME/SystemTests/ncr.py | 129 ++++++++-------------------- 1 file changed, 37 insertions(+), 92 deletions(-) diff --git a/scripts/lib/CIME/SystemTests/ncr.py b/scripts/lib/CIME/SystemTests/ncr.py index 9ad193fe83d..066c0c65aa7 100644 --- a/scripts/lib/CIME/SystemTests/ncr.py +++ b/scripts/lib/CIME/SystemTests/ncr.py @@ -9,103 +9,48 @@ from CIME.XML.standard_module_setup import * from CIME.case_setup import case_setup import CIME.utils -from CIME.SystemTests.system_tests_common import SystemTestsCommon +from CIME.SystemTests.system_tests_compare_two import SystemTestsCompareTwo from CIME.check_lockedfiles import * logger = logging.getLogger(__name__) -class NCR(SystemTestsCommon): +class NCR(SystemTestsCompareTwo): def __init__(self, case): """ - initialize a test object + initialize an NCR test """ - SystemTestsCommon.__init__(self, case) - - def build_phase(self, sharedlib_only=False, model_only=False): - exeroot = self._case.get_value("EXEROOT") - cime_model = CIME.utils.get_model() - - machpes1 = "env_mach_pes.NCR1.xml" - if is_locked(machpes1): - restore(machpes1, newname="env_mach_pes.xml") - - # Build two exectuables for this test, the first is a default build, the - # second halves the number of tasks and runs two instances for each component - # Lay all of the components out concurrently - for bld in range(1,3): - logging.warn("Starting bld {}".format(bld)) - machpes = "env_mach_pes.NCR{}.xml".format(bld) - ntasks_sum = 0 - for comp in ['ATM','OCN','WAV','GLC','ICE','ROF','LND']: - self._case.set_value("NINST_{}".format(comp), str(bld)) - ntasks = self._case.get_value("NTASKS_{}".format(comp)) - if(bld == 1): - self._case.set_value("ROOTPE_{}".format(comp), 0) - if ( ntasks > 1 ): - self._case.set_value("NTASKS_{}".format(comp), ntasks/2) - else: - self._case.set_value("ROOTPE_{}".format(comp), ntasks_sum) - ntasks_sum += ntasks*2 - self._case.set_value("NTASKS_{}".format(comp), ntasks*2) - self._case.flush() - - case_setup(self._case, test_mode=True, reset=True) - self.clean_build() - self.build_indv(sharedlib_only, model_only) - shutil.move("{}/{}.exe".format(exeroot,cime_model), - "{}/{}.exe.NCR{}".format(exeroot,cime_model,bld)) - lock_file("env_build.xml", newname="env_build.NCR{}.xml".format(bld)) - lock_file("env_mach_pes.xml", newname=machpes) - - # Because mira/cetus interprets its run script differently than - # other systems we need to copy the original env_mach_pes.xml back - restore(machpes1, newname="env_mach_pes.xml") - - def run_phase(self): - os.chdir(self._caseroot) - - exeroot = self._case.get_value("EXEROOT") - cime_model = CIME.utils.get_model() - - # Reset beginning test settings - expect(is_locked("env_mach_pes.NCR1.xml"), - "ERROR: LockedFiles/env_mach_pes.NCR1.xml does not exist\n" - " this would been produced in the build - must run case.test_build") - - restore("env_mach_pes.NCR1.xml", newname="env_mach_pes.xml") - restore("env_build.NCR1.xml", newname="env_build.xml") - shutil.copy("{}/{}.exe.NCR1".format(exeroot, cime_model), - "{}/{}.exe".format(exeroot, cime_model)) - - - stop_n = self._case.get_value("STOP_N") - stop_option = self._case.get_value("STOP_OPTION") - - self._case.set_value("HIST_N", stop_n) - self._case.set_value("HIST_OPTION", stop_option) - self._case.set_value("CONTINUE_RUN", False) - self._case.set_value("REST_OPTION", "none") - self._case.flush() - - #====================================================================== - # do an initial run test with NINST 1 - #====================================================================== - logger.info("default: doing a {} {} with NINST1".format(stop_n, stop_option)) - self.run_indv() - - #====================================================================== - # do an initial run test with NINST 2 - # want to run on same pe counts per instance and same cpl pe count - #====================================================================== - - os.remove("{}/{}.exe".format(exeroot, cime_model)) - shutil.copy("{}/{}.exe.NCR2".format(exeroot, cime_model), - "{}/{}.exe".format(exeroot, cime_model)) - restore("env_build.NCR2.xml", newname="env_build.xml") - - logger.info("default: doing a {} {} with NINST2".format(stop_n, stop_option)) - self.run_indv(suffix="multiinst") - - # Compare - self._component_compare_test("base", "multiinst") + SystemTestsCompareTwo.__init__(self, case, + separate_builds = True, + run_two_suffix = "multiinst", + run_one_description = "default build", + run_two_description = ("half the number of tasks, " + + "twice the number of instances")) + + def _common_setup(self): + pass + + def _case_one_setup(self): + # Set the number of instances, the ROOTPEs, and the number of tasks + # The first case should have mostly default settings; + # though we apparently halve the number of tasks if greater than 1 + for comp in ['ATM','OCN','WAV','GLC','ICE','ROF','LND']: + self._case.set_value("NINST_{}".format(comp), str(1)) + self._case.set_value("ROOTPE_{}".format(comp), 0) + ntasks = self._case.get_value("NTASKS_{}".format(comp)) + if ntasks > 1: + self._case.set_value("NTASKS_{}".format(comp), ntasks // 2) + case_setup(self._case, test_mode = True, reset = True) + + def _case_two_setup(self): + # Set the number of instances, the ROOTPEs, and the number of tasks + # The second case should have twice the number of instances and half the number of tasks + # All tasks should be running concurrently + ntasks_sum = 0 + for comp in ['ATM','OCN','WAV','GLC','ICE','ROF','LND']: + self._case.set_value("NINST_{}".format(comp), str(1)) + self._case.set_value("ROOTPE_{}".format(comp), ntasks_sum) + ntasks = self._case.get_value("NTASKS_{}".format(comp)) + ntasks_sum += ntasks * 2 + self._case.set_value("NTASKS_{}".format(comp), ntasks * 2) + case_setup(self._case, test_mode = True, reset = True) From 278baf7d3b3b51374b12e4f5185ed3a9c3baaafd Mon Sep 17 00:00:00 2001 From: Michael Deakin Date: Thu, 13 Jul 2017 12:23:00 -0600 Subject: [PATCH 2/5] Initial work on converting NCR to use the SystemTestsCompareTwo object --- config/config_tests.xml | 5 +++++ scripts/lib/CIME/SystemTests/ncr.py | 28 +++++++++++++++++++--------- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/config/config_tests.xml b/config/config_tests.xml index e59aebbecca..6c1e6a22eac 100644 --- a/config/config_tests.xml +++ b/config/config_tests.xml @@ -514,6 +514,11 @@ NODEFAIL Tests restart upon detected node failure. Generates fake failu multi-instance validation sequential vs concurrent (default length) 1 FALSE + + FALSE + none + $STOP_OPTION + $STOP_N diff --git a/scripts/lib/CIME/SystemTests/ncr.py b/scripts/lib/CIME/SystemTests/ncr.py index 066c0c65aa7..f5fc9a3939a 100644 --- a/scripts/lib/CIME/SystemTests/ncr.py +++ b/scripts/lib/CIME/SystemTests/ncr.py @@ -1,9 +1,10 @@ """ Implementation of the CIME NCR test. This class inherits from SystemTestsCommon -Build two exectuables for this test, the first is a default build the -second halves the number of tasks and runs two instances for each component -Lay all of the components out concurrently +Build two exectuables for this test: +The first is a default build +The second runs two instances for each component with the same total number of tasks, +and runs each of them concurrently """ import shutil from CIME.XML.standard_module_setup import * @@ -22,7 +23,7 @@ def __init__(self, case): """ SystemTestsCompareTwo.__init__(self, case, separate_builds = True, - run_two_suffix = "multiinst", + run_two_suffix = "singleinst", run_one_description = "default build", run_two_description = ("half the number of tasks, " + "twice the number of instances")) @@ -30,11 +31,15 @@ def __init__(self, case): def _common_setup(self): pass - def _case_one_setup(self): + def _case_two_setup(self): # Set the number of instances, the ROOTPEs, and the number of tasks # The first case should have mostly default settings; # though we apparently halve the number of tasks if greater than 1 - for comp in ['ATM','OCN','WAV','GLC','ICE','ROF','LND']: + comp_classes = self._case.get_values("COMP_CLASSES") + # Is this correct? + comp_classes.remove("CPL") + + for comp in comp_classes: self._case.set_value("NINST_{}".format(comp), str(1)) self._case.set_value("ROOTPE_{}".format(comp), 0) ntasks = self._case.get_value("NTASKS_{}".format(comp)) @@ -42,15 +47,20 @@ def _case_one_setup(self): self._case.set_value("NTASKS_{}".format(comp), ntasks // 2) case_setup(self._case, test_mode = True, reset = True) - def _case_two_setup(self): + def _case_one_setup(self): # Set the number of instances, the ROOTPEs, and the number of tasks # The second case should have twice the number of instances and half the number of tasks # All tasks should be running concurrently + comp_classes = self._case.get_values("COMP_CLASSES") + # Is this correct? + comp_classes.remove("CPL") + ntasks_sum = 0 - for comp in ['ATM','OCN','WAV','GLC','ICE','ROF','LND']: + # ['ATM','OCN','WAV','GLC','ICE','ROF','LND'] + for comp in comp_classes: self._case.set_value("NINST_{}".format(comp), str(1)) self._case.set_value("ROOTPE_{}".format(comp), ntasks_sum) ntasks = self._case.get_value("NTASKS_{}".format(comp)) ntasks_sum += ntasks * 2 self._case.set_value("NTASKS_{}".format(comp), ntasks * 2) - case_setup(self._case, test_mode = True, reset = True) + case_setup(self._case, test_mode = False, reset = True) From d74308f20c69e74e2a671220dbba4c83e25f9305 Mon Sep 17 00:00:00 2001 From: Michael Deakin Date: Fri, 14 Jul 2017 15:31:29 -0600 Subject: [PATCH 3/5] Fix pylint errors --- scripts/lib/CIME/SystemTests/ncr.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/lib/CIME/SystemTests/ncr.py b/scripts/lib/CIME/SystemTests/ncr.py index f5fc9a3939a..ec480678b0d 100644 --- a/scripts/lib/CIME/SystemTests/ncr.py +++ b/scripts/lib/CIME/SystemTests/ncr.py @@ -6,10 +6,8 @@ The second runs two instances for each component with the same total number of tasks, and runs each of them concurrently """ -import shutil from CIME.XML.standard_module_setup import * from CIME.case_setup import case_setup -import CIME.utils from CIME.SystemTests.system_tests_compare_two import SystemTestsCompareTwo from CIME.check_lockedfiles import * From 88476937235fad803193dc525ac18425a4b0839a Mon Sep 17 00:00:00 2001 From: Michael Deakin Date: Mon, 17 Jul 2017 13:26:29 -0600 Subject: [PATCH 4/5] Remove empty _common_setup. Also change order of case_one_setup and case_two_setup --- scripts/lib/CIME/SystemTests/ncr.py | 35 +++++++++++++---------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/scripts/lib/CIME/SystemTests/ncr.py b/scripts/lib/CIME/SystemTests/ncr.py index ec480678b0d..8ced125ecb1 100644 --- a/scripts/lib/CIME/SystemTests/ncr.py +++ b/scripts/lib/CIME/SystemTests/ncr.py @@ -26,25 +26,6 @@ def __init__(self, case): run_two_description = ("half the number of tasks, " + "twice the number of instances")) - def _common_setup(self): - pass - - def _case_two_setup(self): - # Set the number of instances, the ROOTPEs, and the number of tasks - # The first case should have mostly default settings; - # though we apparently halve the number of tasks if greater than 1 - comp_classes = self._case.get_values("COMP_CLASSES") - # Is this correct? - comp_classes.remove("CPL") - - for comp in comp_classes: - self._case.set_value("NINST_{}".format(comp), str(1)) - self._case.set_value("ROOTPE_{}".format(comp), 0) - ntasks = self._case.get_value("NTASKS_{}".format(comp)) - if ntasks > 1: - self._case.set_value("NTASKS_{}".format(comp), ntasks // 2) - case_setup(self._case, test_mode = True, reset = True) - def _case_one_setup(self): # Set the number of instances, the ROOTPEs, and the number of tasks # The second case should have twice the number of instances and half the number of tasks @@ -62,3 +43,19 @@ def _case_one_setup(self): ntasks_sum += ntasks * 2 self._case.set_value("NTASKS_{}".format(comp), ntasks * 2) case_setup(self._case, test_mode = False, reset = True) + + def _case_two_setup(self): + # Set the number of instances, the ROOTPEs, and the number of tasks + # The first case should have mostly default settings; + # though we apparently halve the number of tasks if greater than 1 + comp_classes = self._case.get_values("COMP_CLASSES") + # Is this correct? + comp_classes.remove("CPL") + + for comp in comp_classes: + self._case.set_value("NINST_{}".format(comp), str(1)) + self._case.set_value("ROOTPE_{}".format(comp), 0) + ntasks = self._case.get_value("NTASKS_{}".format(comp)) + if ntasks > 1: + self._case.set_value("NTASKS_{}".format(comp), ntasks // 2) + case_setup(self._case, test_mode = True, reset = True) From 8fbbb031a18388d4a1d1a20b2943ddc94eacb191 Mon Sep 17 00:00:00 2001 From: Michael Deakin Date: Mon, 17 Jul 2017 16:16:23 -0600 Subject: [PATCH 5/5] Fix number of instances and number of tasks for the test. Add comments warning that this test is currently unused and may not work Add clarifying comments --- config/config_tests.xml | 1 + scripts/lib/CIME/SystemTests/ncr.py | 62 ++++++++++++++++------------- 2 files changed, 36 insertions(+), 27 deletions(-) diff --git a/config/config_tests.xml b/config/config_tests.xml index 6c1e6a22eac..a4b146f7b44 100644 --- a/config/config_tests.xml +++ b/config/config_tests.xml @@ -511,6 +511,7 @@ NODEFAIL Tests restart upon detected node failure. Generates fake failu + multi-instance validation sequential vs concurrent (default length) 1 FALSE diff --git a/scripts/lib/CIME/SystemTests/ncr.py b/scripts/lib/CIME/SystemTests/ncr.py index 8ced125ecb1..1d6c2dab055 100644 --- a/scripts/lib/CIME/SystemTests/ncr.py +++ b/scripts/lib/CIME/SystemTests/ncr.py @@ -2,9 +2,11 @@ Implementation of the CIME NCR test. This class inherits from SystemTestsCommon Build two exectuables for this test: -The first is a default build -The second runs two instances for each component with the same total number of tasks, +The first runs two instances for each component with the same total number of tasks, and runs each of them concurrently +The second is a default build + +NOTE: This is currently untested, and may not be working properly """ from CIME.XML.standard_module_setup import * from CIME.case_setup import case_setup @@ -22,40 +24,46 @@ def __init__(self, case): SystemTestsCompareTwo.__init__(self, case, separate_builds = True, run_two_suffix = "singleinst", - run_one_description = "default build", - run_two_description = ("half the number of tasks, " + - "twice the number of instances")) + run_one_description = "two instances, each with the same number of tasks", + run_two_description = "default build") + + def _comp_classes(self): + # Return the components which we need to set things for + # ESP cannot have more than one instance, so don't set anything for it + comp_classes = self._case.get_values("COMP_CLASSES") + if "CPL" in comp_classes: + comp_classes.remove("CPL") + if "ESP" in comp_classes: + comp_classes.remove("ESP") + return comp_classes + + def _common_setup(self): + # Set the default number of tasks + for comp in self._comp_classes(): + ntasks = self._case.get_value("NTASKS_{}".format(comp)) + if ntasks > 1: + self._case.set_value("NTASKS_{}".format(comp), ntasks // 2) def _case_one_setup(self): # Set the number of instances, the ROOTPEs, and the number of tasks - # The second case should have twice the number of instances and half the number of tasks + # This case should have twice the number of instances and half the number of tasks # All tasks should be running concurrently - comp_classes = self._case.get_values("COMP_CLASSES") - # Is this correct? - comp_classes.remove("CPL") - + # Note that this case must be the multiinstance one + # to correctly set the required number of nodes and avoid crashing ntasks_sum = 0 - # ['ATM','OCN','WAV','GLC','ICE','ROF','LND'] - for comp in comp_classes: - self._case.set_value("NINST_{}".format(comp), str(1)) + + for comp in self._comp_classes(): + self._case.set_value("NINST_{}".format(comp), str(2)) self._case.set_value("ROOTPE_{}".format(comp), ntasks_sum) - ntasks = self._case.get_value("NTASKS_{}".format(comp)) - ntasks_sum += ntasks * 2 - self._case.set_value("NTASKS_{}".format(comp), ntasks * 2) + ntasks = self._case.get_value("NTASKS_{}".format(comp)) * 2 + ntasks_sum += ntasks + self._case.set_value("NTASKS_{}".format(comp), ntasks) + # test_mode must be False here so the case.test file is updated + # This ensures that the correct number of nodes are used in case it's larger than in case 2 case_setup(self._case, test_mode = False, reset = True) def _case_two_setup(self): - # Set the number of instances, the ROOTPEs, and the number of tasks - # The first case should have mostly default settings; - # though we apparently halve the number of tasks if greater than 1 - comp_classes = self._case.get_values("COMP_CLASSES") - # Is this correct? - comp_classes.remove("CPL") - - for comp in comp_classes: + for comp in self._comp_classes(): self._case.set_value("NINST_{}".format(comp), str(1)) self._case.set_value("ROOTPE_{}".format(comp), 0) - ntasks = self._case.get_value("NTASKS_{}".format(comp)) - if ntasks > 1: - self._case.set_value("NTASKS_{}".format(comp), ntasks // 2) case_setup(self._case, test_mode = True, reset = True)