E3SM-Project · jgfouca · May 30, 2019 · May 30, 2019 · May 30, 2019
diff --git a/cime/scripts/create_test b/cime/scripts/create_test
@@ -79,8 +79,9 @@ def parse_command_line(args, description):
     parser.add_argument("--single-exe", action="store_true",
                         default=False,
                         help="Use a single build for all cases. This can "
-                        "\ndrastically improve test throughput but is current use-at-your-own risk."
-                        "\nIt's up to the user to ensure that all cases are build-compatible")
+                        "\ndrastically improve test throughput but is currently use-at-your-own risk."
+                        "\nIt's up to the user to ensure that all cases are build-compatible."
+                        "\nE3SM tests belonging to a suite with share enabled will always share exes.")
 
     default = get_default_setting(config, "SINGLE_SUBMIT", False, check_main=False)
 
@@ -404,19 +405,6 @@ def parse_command_line(args, description):
         args.compiler = mach_obj.get_default_compiler() if args.compiler is None else args.compiler
 
         test_names = get_tests.get_full_test_names(args.testargs, mach_obj.get_machine_name(), args.compiler)
-        if len(args.testargs) == 1 and \
-           args.testargs[0] in get_tests.get_test_suites() and \
-           get_tests.get_test_data(args.testargs[0])[2] and \
-           not args.single_exe:
-            logging.info("Suite supports shared executables, setting --single-exe to True")
-            args.single_exe = True
-
-        if len(args.testargs) == 1 and \
-           args.testargs[0] in get_tests.get_test_suites() and \
-           get_tests.get_test_data(args.testargs[0])[2] and \
-           not args.single_exe:
-            logging.info("Suite supports shared executables, setting --single-exe to True")
-            args.single_exe = True
 
     expect(mach_obj.is_valid_compiler(args.compiler),
            "Compiler %s not valid for machine %s" % (args.compiler, mach_obj.get_machine_name()))

diff --git a/cime/scripts/lib/CIME/build.py b/cime/scripts/lib/CIME/build.py
@@ -331,7 +331,7 @@ def _build_model_thread(config_dir, compclass, compname, caseroot, libroot, bldr
             stat = 0
             run_sub_or_cmd(cmd, [caseroot, libroot, bldroot], "buildlib",
                            [bldroot, libroot, case], logfile=file_build)
-        except Exception as e:
+        except Exception:
             stat = 1
 
     else:

diff --git a/cime/scripts/lib/CIME/test_scheduler.py b/cime/scripts/lib/CIME/test_scheduler.py
@@ -13,7 +13,7 @@
 
 from CIME.XML.standard_module_setup import *
 import six
-from get_tests import get_recommended_test_time
+from get_tests import get_recommended_test_time, get_build_groups
 from CIME.utils import append_status, append_testlog, TESTS_FAILED_ERR_CODE, parse_test_name, get_full_test_name, get_model, \
     convert_to_seconds, get_cime_root, get_project, get_timestamp, get_python_libs_root
 from CIME.test_status import *
@@ -138,8 +138,7 @@ def __init__(self, test_names, test_data=None,
         self._allow_baseline_overwrite = allow_baseline_overwrite
         self._allow_pnl       = allow_pnl
         self._non_local       = non_local
-        self._single_exe      = single_exe
-        self._single_exe_root = None
+        self._build_groups    = []
 
         self._mail_user = mail_user
         self._mail_type = mail_type
@@ -290,6 +289,25 @@ def __init__(self, test_names, test_data=None,
                        " Pick a different test-id".format(self._get_test_dir(test)))
                 logger.info("Creating test directory {}".format(self._get_test_dir(test)))
 
+        # Setup build groups. Each group is a tuple of test names (not a list or
+        # the test dict itself) because groups are used as dictionary keys below.
+        if single_exe:
+            self._build_groups = [tuple(self._tests)]
+        elif self._cime_model == "e3sm":
+            # Any test that's in a shared-enabled suite with other tests should share exes
+            self._build_groups = get_build_groups(self._tests)
+        else:
+            self._build_groups = [(item,) for item in self._tests]
+
+        # Build group to exeroot map; the exeroot of a group stays None until
+        # the group's first test has gone through its XML phase.
+        self._build_group_exeroots = dict.fromkeys(self._build_groups)
+
+        logger.debug("Build groups are:")
+        for build_group in self._build_groups:
+            for test_name in build_group:
+                logger.debug("{}{}".format("  " if test_name == build_group[0] else "    ", test_name))
+
         # By the end of this constructor, this program should never hard abort,
         # instead, errors will be placed in the TestStatus files for the various
         # tests cases
@@ -648,14 +666,15 @@ def _xml_phase(self, test):
             case.set_value("SAVE_TIMING", self._save_timing)
 
             # handle single-exe here, all cases will use the EXEROOT from
-            # the first case.
-            first_test = self._first_test()
-            if self._single_exe:
-                if test == first_test:
-                    self._single_exe_root = case.get_value("EXEROOT")
-                else:
-                    expect(self._single_exe_root is not None, "Programming error for single_exe, missing root")
-                    case.set_value("EXEROOT", self._single_exe_root)
+            # the first case in the build group
+            is_first_test, _, my_build_group = self._get_build_group(test)
+            if is_first_test:
+                expect(self._build_group_exeroots[my_build_group] is None, "Should not already have exeroot")
+                self._build_group_exeroots[my_build_group] = case.get_value("EXEROOT")
+            else:
+                build_group_exeroot = self._build_group_exeroots[my_build_group]
+                expect(build_group_exeroot is not None, "Should already have exeroot")
+                case.set_value("EXEROOT", build_group_exeroot)
 
             # Scale back build parallelism on systems with few cores
             if self._model_build_cost > self._proc_pool:
@@ -680,8 +699,8 @@ def _setup_phase(self, test):
     ###########################################################################
     def _sharedlib_build_phase(self, test):
     ###########################################################################
-        first_test = self._first_test()
-        if self._single_exe and test != first_test:
+        is_first_test, first_test, _ = self._get_build_group(test)
+        if not is_first_test:
             if self._get_test_status(first_test, phase=SHAREDLIB_BUILD_PHASE) == TEST_PASS_STATUS:
                 return True, ""
             else:
@@ -691,17 +710,22 @@ def _sharedlib_build_phase(self, test):
         return self._shell_cmd_for_phase(test, "./case.build --sharedlib-only", SHAREDLIB_BUILD_PHASE, from_dir=test_dir)
 
     ###########################################################################
-    def _first_test(self):
+    def _get_build_group(self, test):
     ###########################################################################
-        return list(self._tests.keys())[0]
+        """Return (is_first_test, first_test, build_group) for the build group containing test."""
+        for group in self._build_groups:
+            if test in group:
+                return test == group[0], group[0], group
+        expect(False, "No build group for test '{}'".format(test))
 
     ###########################################################################
     def _model_build_phase(self, test):
     ###########################################################################
+        is_first_test, first_test, _ = self._get_build_group(test)
+
         test_dir = self._get_test_dir(test)
 
-        first_test = self._first_test()
-        if self._single_exe and test != first_test:
+        if not is_first_test:
             if self._get_test_status(first_test, phase=MODEL_BUILD_PHASE) == TEST_PASS_STATUS:
                 with Case(test_dir, read_only=False) as case:
                     post_build(case, [], build_complete=True, save_build_provenance=False)
@@ -752,12 +776,17 @@ def _run_catch_exceptions(self, test, phase, run):
     ###########################################################################
     def _get_procs_needed(self, test, phase, threads_in_flight=None, no_batch=False):
     ###########################################################################
-        # If in single_exe mode, we must wait for the first case to complete building
-        # before starting other cases.
-        first_test = self._first_test()
-        if self._single_exe and test != first_test and \
-           self._get_test_status(first_test, phase=MODEL_BUILD_PHASE) == TEST_PEND_STATUS:
-            return self._proc_pool + 1
+        # For build groups, we must wait for the first case to complete XML, SHAREDLIB,
+        # and MODEL_BUILD phases before the other cases can do those phases
+        is_first_test, first_test, _ = self._get_build_group(test)
+
+        if not is_first_test:
+            build_group_dep_phases = [XML_PHASE, SHAREDLIB_BUILD_PHASE, MODEL_BUILD_PHASE]
+            if phase in build_group_dep_phases:
+                if self._get_test_status(first_test, phase=phase) == TEST_PEND_STATUS:
+                    return self._proc_pool + 1
+                else:
+                    return 1
 
         if phase == RUN_PHASE and (self._no_batch or no_batch):
             test_dir = self._get_test_dir(test)
@@ -833,8 +862,10 @@ def _consumer(self, test, test_phase, phase_method):
 
         logger.info(status_str)
 
+        is_first_test = self._get_build_group(test)[0]
+
         if test_phase in [CREATE_NEWCASE_PHASE, XML_PHASE] or \
-           (self._single_exe and test != self._first_test() and test_phase in [SHAREDLIB_BUILD_PHASE, MODEL_BUILD_PHASE]):
+           (not is_first_test and test_phase in [SHAREDLIB_BUILD_PHASE, MODEL_BUILD_PHASE]):
             # These are the phases for which TestScheduler is reponsible for
             # updating the TestStatus file
             self._update_test_status_file(test, test_phase, status)

diff --git a/cime/scripts/lib/get_tests.py b/cime/scripts/lib/get_tests.py
@@ -21,7 +21,7 @@
 #     "inherit" : (suite1, suite2, ...), # Optional. Suites to inherit tests from. Default is None. Tuple, list, or str.
 #     "time"    : "HH:MM:SS",            # Optional. Recommended upper-limit on test time.
 #     "share"   : True|False,            # Optional. If True, all tests in this suite share a build. Default is False.
-#     "tests"   : (test1, test2, ...)    # Optional. The list of tests for this suite. See above for format. Tuple, list, or str.
+#     "tests"   : (test1, test2, ...)    # Optional. The list of tests for this suite. See above for format. Tuple, list, or str. This is the ONLY inheritable attribute.
 # }
 
 _CIME_TESTS = {
@@ -221,6 +221,61 @@ def get_test_suite(suite, machine=None, compiler=None, skip_inherit=False):
 
     return tests
 
+###############################################################################
+def suite_has_test(suite, test_full_name, skip_inherit=False):
+###############################################################################
+    """Return True if test_full_name is one of suite's tests for the machine and compiler parsed from that name."""
+    _, _, _, _, machine, compiler, _ = CIME.utils.parse_test_name(test_full_name)
+    expect(machine is not None, "{} is not a full test name".format(test_full_name))
+    suite_tests = get_test_suite(suite, machine=machine, compiler=compiler, skip_inherit=skip_inherit)
+    return test_full_name in suite_tests
+
+###############################################################################
+def get_build_groups(tests):
+###############################################################################
+    """
+    Given an iterable of full test names, return a list of tuples, with each
+    tuple representing a group of tests that can share executables.
+
+    >>> tests = ["SMS_P2.f19_g16_rx1.A.melvin_gnu", "SMS_P4.f19_g16_rx1.A.melvin_gnu", "SMS_P2.f19_g16_rx1.X.melvin_gnu", "SMS_P4.f19_g16_rx1.X.melvin_gnu", "TESTRUNSLOWPASS_P1.f19_g16_rx1.A.melvin_gnu", "TESTRUNSLOWPASS_P1.ne30_g16_rx1.A.melvin_gnu"]
+    >>> get_build_groups(tests)
+    [('SMS_P2.f19_g16_rx1.A.melvin_gnu', 'SMS_P4.f19_g16_rx1.A.melvin_gnu'), ('SMS_P2.f19_g16_rx1.X.melvin_gnu', 'SMS_P4.f19_g16_rx1.X.melvin_gnu'), ('TESTRUNSLOWPASS_P1.f19_g16_rx1.A.melvin_gnu',), ('TESTRUNSLOWPASS_P1.ne30_g16_rx1.A.melvin_gnu',)]
+    """
+    build_groups = [] # list of tuples ([tests], set(suites))
+
+    # Get a list of suites that share exes
+    share_suites = [suite for suite in get_test_suites() if get_test_data(suite)[2]]
+
+    # Divide tests up into build groups. Assumes that build-compatibility is
+    # transitive: groups are kept pairwise disjoint in their share suites, so
+    # tests of one share suite can never end up in different groups.
+    for test in tests:
+        my_share_suites = set()
+        for suite in share_suites:
+            if suite_has_test(suite, test, skip_inherit=True):
+                my_share_suites.add(suite)
+
+        # Find every existing build group that overlaps with this test's suites.
+        # Tests in no share suite never match (empty intersection).
+        matches = [group for group in build_groups if group[1] & my_share_suites]
+
+        if matches:
+            # Join the earliest match. A test in several share suites may bridge
+            # groups that were separate so far; fold those into the same group.
+            build_group_tests, build_group_suites = matches[0]
+            for extra in matches[1:]:
+                build_group_tests.extend(extra[0])
+                build_group_suites.update(extra[1])
+            build_groups = [group for group in build_groups
+                            if not any(group is extra for extra in matches[1:])]
+            build_group_tests.append(test)
+            build_group_suites.update(my_share_suites)
+        else:
+            # Nothing matched, this test is in a build group of its own
+            build_groups.append(([test], my_share_suites))
+
+    return [tuple(item[0]) for item in build_groups]
+
 ###############################################################################
 def infer_machine_name_from_tests(testargs):
 ###############################################################################
@@ -322,15 +377,11 @@ def get_recommended_test_time(test_full_name):
     >>> get_recommended_test_time("PET_Ln20.ne30_ne30.FC5.sandiatoss3_intel.cam-outfrq9s")
     >>>
     """
-    _, _, _, _, machine, compiler, _ = CIME.utils.parse_test_name(test_full_name)
-    expect(machine is not None, "{} is not a full test name".format(test_full_name))
-
     best_time = None
     suites = get_test_suites()
     for suite in suites:
-        tests    = get_test_suite(suite, machine=machine, compiler=compiler, skip_inherit=True)
         rec_time = get_test_data(suite)[1]
-        if test_full_name in tests and rec_time is not None and \
+        if suite_has_test(suite, test_full_name, skip_inherit=True) and rec_time is not None and \
            (best_time is None or convert_to_seconds(rec_time) < convert_to_seconds(best_time)):
             best_time = rec_time