Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement a more-advanced way of sharing builds within test suites #2961

Merged
merged 2 commits into from
May 30, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 3 additions & 15 deletions cime/scripts/create_test
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,9 @@ def parse_command_line(args, description):
parser.add_argument("--single-exe", action="store_true",
default=False,
help="Use a single build for all cases. This can "
"\ndrastically improve test throughput but is current use-at-your-own risk."
"\nIt's up to the user to ensure that all cases are build-compatible")
"\ndrastically improve test throughput but is currently use-at-your-own risk."
"\nIt's up to the user to ensure that all cases are build-compatible."
"\nE3SM tests belonging to a suite with share enabled will always share exes.")

default = get_default_setting(config, "SINGLE_SUBMIT", False, check_main=False)

Expand Down Expand Up @@ -404,19 +405,6 @@ def parse_command_line(args, description):
args.compiler = mach_obj.get_default_compiler() if args.compiler is None else args.compiler

test_names = get_tests.get_full_test_names(args.testargs, mach_obj.get_machine_name(), args.compiler)
if len(args.testargs) == 1 and \
args.testargs[0] in get_tests.get_test_suites() and \
get_tests.get_test_data(args.testargs[0])[2] and \
not args.single_exe:
logging.info("Suite supports shared executables, setting --single-exe to True")
args.single_exe = True

if len(args.testargs) == 1 and \
args.testargs[0] in get_tests.get_test_suites() and \
get_tests.get_test_data(args.testargs[0])[2] and \
not args.single_exe:
logging.info("Suite supports shared executables, setting --single-exe to True")
args.single_exe = True

expect(mach_obj.is_valid_compiler(args.compiler),
"Compiler %s not valid for machine %s" % (args.compiler, mach_obj.get_machine_name()))
Expand Down
2 changes: 1 addition & 1 deletion cime/scripts/lib/CIME/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,7 @@ def _build_model_thread(config_dir, compclass, compname, caseroot, libroot, bldr
stat = 0
run_sub_or_cmd(cmd, [caseroot, libroot, bldroot], "buildlib",
[bldroot, libroot, case], logfile=file_build)
except Exception as e:
except Exception:
stat = 1

else:
Expand Down
79 changes: 55 additions & 24 deletions cime/scripts/lib/CIME/test_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from CIME.XML.standard_module_setup import *
import six
from get_tests import get_recommended_test_time
from get_tests import get_recommended_test_time, get_build_groups
from CIME.utils import append_status, append_testlog, TESTS_FAILED_ERR_CODE, parse_test_name, get_full_test_name, get_model, \
convert_to_seconds, get_cime_root, get_project, get_timestamp, get_python_libs_root
from CIME.test_status import *
Expand Down Expand Up @@ -138,8 +138,7 @@ def __init__(self, test_names, test_data=None,
self._allow_baseline_overwrite = allow_baseline_overwrite
self._allow_pnl = allow_pnl
self._non_local = non_local
self._single_exe = single_exe
self._single_exe_root = None
self._build_groups = []

self._mail_user = mail_user
self._mail_type = mail_type
Expand Down Expand Up @@ -290,6 +289,25 @@ def __init__(self, test_names, test_data=None,
" Pick a different test-id".format(self._get_test_dir(test)))
logger.info("Creating test directory {}".format(self._get_test_dir(test)))

# Setup build groups
if single_exe:
self._build_groups = [self._tests]
elif self._cime_model == "e3sm":
# Any test that's in a shared-enabled suite with other tests should share exes
self._build_groups = get_build_groups(self._tests)
else:
self._build_groups = [ [item] for item in self._tests ]

# Build group to exeroot map
self._build_group_exeroots = {}
for build_group in self._build_groups:
self._build_group_exeroots[build_group] = None

logger.debug("Build groups are:")
for build_group in self._build_groups:
for test_name in build_group:
logger.debug("{}{}".format(" " if test_name == build_group[0] else " ", test_name))

# By the end of this constructor, this program should never hard abort,
# instead, errors will be placed in the TestStatus files for the various
# test cases
Expand Down Expand Up @@ -648,14 +666,15 @@ def _xml_phase(self, test):
case.set_value("SAVE_TIMING", self._save_timing)

# handle single-exe here, all cases will use the EXEROOT from
# the first case.
first_test = self._first_test()
if self._single_exe:
if test == first_test:
self._single_exe_root = case.get_value("EXEROOT")
else:
expect(self._single_exe_root is not None, "Programming error for single_exe, missing root")
case.set_value("EXEROOT", self._single_exe_root)
# the first case in the build group
is_first_test, _, my_build_group = self._get_build_group(test)
if is_first_test:
expect(self._build_group_exeroots[my_build_group] is None, "Should not already have exeroot")
self._build_group_exeroots[my_build_group] = case.get_value("EXEROOT")
else:
build_group_exeroot = self._build_group_exeroots[my_build_group]
expect(build_group_exeroot is not None, "Should already have exeroot")
case.set_value("EXEROOT", build_group_exeroot)

# Scale back build parallelism on systems with few cores
if self._model_build_cost > self._proc_pool:
Expand All @@ -680,8 +699,8 @@ def _setup_phase(self, test):
###########################################################################
def _sharedlib_build_phase(self, test):
###########################################################################
first_test = self._first_test()
if self._single_exe and test != first_test:
is_first_test, first_test, _ = self._get_build_group(test)
if not is_first_test:
if self._get_test_status(first_test, phase=SHAREDLIB_BUILD_PHASE) == TEST_PASS_STATUS:
return True, ""
else:
Expand All @@ -691,17 +710,22 @@ def _sharedlib_build_phase(self, test):
return self._shell_cmd_for_phase(test, "./case.build --sharedlib-only", SHAREDLIB_BUILD_PHASE, from_dir=test_dir)

###########################################################################
def _first_test(self):
def _get_build_group(self, test):
###########################################################################
return list(self._tests.keys())[0]
for build_group in self._build_groups:
if test in build_group:
return test == build_group[0], build_group[0], build_group

expect(False, "No build group for test '{}'".format(test))

###########################################################################
def _model_build_phase(self, test):
###########################################################################
is_first_test, first_test, _ = self._get_build_group(test)

test_dir = self._get_test_dir(test)

first_test = self._first_test()
if self._single_exe and test != first_test:
if not is_first_test:
if self._get_test_status(first_test, phase=MODEL_BUILD_PHASE) == TEST_PASS_STATUS:
with Case(test_dir, read_only=False) as case:
post_build(case, [], build_complete=True, save_build_provenance=False)
Expand Down Expand Up @@ -752,12 +776,17 @@ def _run_catch_exceptions(self, test, phase, run):
###########################################################################
def _get_procs_needed(self, test, phase, threads_in_flight=None, no_batch=False):
###########################################################################
# If in single_exe mode, we must wait for the first case to complete building
# before starting other cases.
first_test = self._first_test()
if self._single_exe and test != first_test and \
self._get_test_status(first_test, phase=MODEL_BUILD_PHASE) == TEST_PEND_STATUS:
return self._proc_pool + 1
# For build pools, we must wait for the first case to complete XML, SHAREDLIB,
# and MODEL_BUILD phases before the other cases can do those phases
is_first_test, first_test, _ = self._get_build_group(test)

if not is_first_test:
build_group_dep_phases = [XML_PHASE, SHAREDLIB_BUILD_PHASE, MODEL_BUILD_PHASE]
if phase in build_group_dep_phases:
if self._get_test_status(first_test, phase=phase) == TEST_PEND_STATUS:
return self._proc_pool + 1
else:
return 1

if phase == RUN_PHASE and (self._no_batch or no_batch):
test_dir = self._get_test_dir(test)
Expand Down Expand Up @@ -833,8 +862,10 @@ def _consumer(self, test, test_phase, phase_method):

logger.info(status_str)

is_first_test = self._get_build_group(test)[0]

if test_phase in [CREATE_NEWCASE_PHASE, XML_PHASE] or \
(self._single_exe and test != self._first_test() and test_phase in [SHAREDLIB_BUILD_PHASE, MODEL_BUILD_PHASE]):
(not is_first_test and test_phase in [SHAREDLIB_BUILD_PHASE, MODEL_BUILD_PHASE]):
# These are the phases for which TestScheduler is responsible for
# updating the TestStatus file
self._update_test_status_file(test, test_phase, status)
Expand Down
63 changes: 57 additions & 6 deletions cime/scripts/lib/get_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
# "inherit" : (suite1, suite2, ...), # Optional. Suites to inherit tests from. Default is None. Tuple, list, or str.
# "time" : "HH:MM:SS", # Optional. Recommended upper-limit on test time.
# "share" : True|False, # Optional. If True, all tests in this suite share a build. Default is False.
# "tests" : (test1, test2, ...) # Optional. The list of tests for this suite. See above for format. Tuple, list, or str.
# "tests" : (test1, test2, ...) # Optional. The list of tests for this suite. See above for format. Tuple, list, or str. This is the ONLY inheritable attribute.
# }

_CIME_TESTS = {
Expand Down Expand Up @@ -221,6 +221,61 @@ def get_test_suite(suite, machine=None, compiler=None, skip_inherit=False):

return tests

###############################################################################
def suite_has_test(suite, test_full_name, skip_inherit=False):
###############################################################################
    """
    Return True if test_full_name is among the tests that get_test_suite
    reports for suite, False otherwise.

    The machine and compiler used for the suite lookup are taken from the
    parsed test name; expect() is called with a "not a full test name"
    message when the parsed name carries no machine. skip_inherit is
    forwarded unchanged to get_test_suite.
    """
    _, _, _, _, machine, compiler, _ = CIME.utils.parse_test_name(test_full_name)
    expect(machine is not None, "{} is not a full test name".format(test_full_name))

    return test_full_name in get_test_suite(suite, machine=machine, compiler=compiler,
                                            skip_inherit=skip_inherit)

###############################################################################
def get_build_groups(tests):
###############################################################################
    """
    Partition tests into build groups and return them as a list of tuples,
    each tuple holding the names of tests that can share executables.

    Two tests land in the same group when they have a share-enabled suite in
    common with that group; a test with no share-enabled suite (or no overlap
    with any existing group) gets a group of its own. Groups and the tests
    inside them keep the order in which the tests were given.

    >>> tests = ["SMS_P2.f19_g16_rx1.A.melvin_gnu", "SMS_P4.f19_g16_rx1.A.melvin_gnu", "SMS_P2.f19_g16_rx1.X.melvin_gnu", "SMS_P4.f19_g16_rx1.X.melvin_gnu", "TESTRUNSLOWPASS_P1.f19_g16_rx1.A.melvin_gnu", "TESTRUNSLOWPASS_P1.ne30_g16_rx1.A.melvin_gnu"]
    >>> get_build_groups(tests)
    [('SMS_P2.f19_g16_rx1.A.melvin_gnu', 'SMS_P4.f19_g16_rx1.A.melvin_gnu'), ('SMS_P2.f19_g16_rx1.X.melvin_gnu', 'SMS_P4.f19_g16_rx1.X.melvin_gnu'), ('TESTRUNSLOWPASS_P1.f19_g16_rx1.A.melvin_gnu',), ('TESTRUNSLOWPASS_P1.ne30_g16_rx1.A.melvin_gnu',)]
    """
    # Only suites whose "share" entry (index 2 of the suite data) is truthy
    # can tie tests together
    sharing_suites = [suite for suite in get_test_suites() if get_test_data(suite)[2]]

    # Each entry pairs a group's test list with the set of share-enabled
    # suites accumulated from its members
    groups = []

    # Assumes that build-compatibility is transitive
    for test in tests:
        test_suites = {suite for suite in sharing_suites
                       if suite_has_test(suite, test, skip_inherit=True)}

        # NOTE(review): a test joins only the FIRST group it overlaps with; if
        # it bridges two already-existing groups they are not merged, so the
        # result can depend on test order - confirm this is intended.
        for group_tests, group_suites in groups:
            if group_suites & test_suites:
                group_tests.append(test)
                group_suites.update(test_suites)
                break
        else:
            # No overlap with any existing group (always the case when the
            # test belongs to no share-enabled suite): start a new group
            groups.append(([test], test_suites))

    return [tuple(group_tests) for group_tests, _ in groups]

###############################################################################
def infer_machine_name_from_tests(testargs):
###############################################################################
Expand Down Expand Up @@ -322,15 +377,11 @@ def get_recommended_test_time(test_full_name):
>>> get_recommended_test_time("PET_Ln20.ne30_ne30.FC5.sandiatoss3_intel.cam-outfrq9s")
>>>
"""
_, _, _, _, machine, compiler, _ = CIME.utils.parse_test_name(test_full_name)
expect(machine is not None, "{} is not a full test name".format(test_full_name))

best_time = None
suites = get_test_suites()
for suite in suites:
tests = get_test_suite(suite, machine=machine, compiler=compiler, skip_inherit=True)
rec_time = get_test_data(suite)[1]
if test_full_name in tests and rec_time is not None and \
if suite_has_test(suite, test_full_name, skip_inherit=True) and rec_time is not None and \
(best_time is None or convert_to_seconds(rec_time) < convert_to_seconds(best_time)):
best_time = rec_time

Expand Down