Skip to content

Commit

Permalink
Merge branch 'jgfouca/cime/create_test_share_upgrade' into next (PR #…
Browse files Browse the repository at this point in the history
…2961)

Implement a more-advanced way of sharing builds within test suites

Implements a concept of build groups within test_scheduler.

[BFB]

* origin/jgfouca/cime/create_test_share_upgrade:
  Switch build group output to debug
  Implement a more-advanced way of sharing builds within test suites
  • Loading branch information
jgfouca committed May 30, 2019
2 parents ea48d2c + c72f5c8 commit 6cd3435
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 46 deletions.
18 changes: 3 additions & 15 deletions cime/scripts/create_test
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,9 @@ def parse_command_line(args, description):
parser.add_argument("--single-exe", action="store_true",
default=False,
help="Use a single build for all cases. This can "
"\ndrastically improve test throughput but is current use-at-your-own risk."
"\nIt's up to the user to ensure that all cases are build-compatible")
"\ndrastically improve test throughput but is currently use-at-your-own risk."
"\nIt's up to the user to ensure that all cases are build-compatible."
"\nE3SM tests belonging to a suite with share enabled will always share exes.")

default = get_default_setting(config, "SINGLE_SUBMIT", False, check_main=False)

Expand Down Expand Up @@ -404,19 +405,6 @@ def parse_command_line(args, description):
args.compiler = mach_obj.get_default_compiler() if args.compiler is None else args.compiler

test_names = get_tests.get_full_test_names(args.testargs, mach_obj.get_machine_name(), args.compiler)
if len(args.testargs) == 1 and \
args.testargs[0] in get_tests.get_test_suites() and \
get_tests.get_test_data(args.testargs[0])[2] and \
not args.single_exe:
logging.info("Suite supports shared executables, setting --single-exe to True")
args.single_exe = True

if len(args.testargs) == 1 and \
args.testargs[0] in get_tests.get_test_suites() and \
get_tests.get_test_data(args.testargs[0])[2] and \
not args.single_exe:
logging.info("Suite supports shared executables, setting --single-exe to True")
args.single_exe = True

expect(mach_obj.is_valid_compiler(args.compiler),
"Compiler %s not valid for machine %s" % (args.compiler, mach_obj.get_machine_name()))
Expand Down
2 changes: 1 addition & 1 deletion cime/scripts/lib/CIME/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,7 @@ def _build_model_thread(config_dir, compclass, compname, caseroot, libroot, bldr
stat = 0
run_sub_or_cmd(cmd, [caseroot, libroot, bldroot], "buildlib",
[bldroot, libroot, case], logfile=file_build)
except Exception as e:
except Exception:
stat = 1

else:
Expand Down
79 changes: 55 additions & 24 deletions cime/scripts/lib/CIME/test_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from CIME.XML.standard_module_setup import *
import six
from get_tests import get_recommended_test_time
from get_tests import get_recommended_test_time, get_build_groups
from CIME.utils import append_status, append_testlog, TESTS_FAILED_ERR_CODE, parse_test_name, get_full_test_name, get_model, \
convert_to_seconds, get_cime_root, get_project, get_timestamp, get_python_libs_root
from CIME.test_status import *
Expand Down Expand Up @@ -138,8 +138,7 @@ def __init__(self, test_names, test_data=None,
self._allow_baseline_overwrite = allow_baseline_overwrite
self._allow_pnl = allow_pnl
self._non_local = non_local
self._single_exe = single_exe
self._single_exe_root = None
self._build_groups = []

self._mail_user = mail_user
self._mail_type = mail_type
Expand Down Expand Up @@ -290,6 +289,25 @@ def __init__(self, test_names, test_data=None,
" Pick a different test-id".format(self._get_test_dir(test)))
logger.info("Creating test directory {}".format(self._get_test_dir(test)))

# Setup build groups
if single_exe:
self._build_groups = [self._tests]
elif self._cime_model == "e3sm":
# Any test that's in a shared-enabled suite with other tests should share exes
self._build_groups = get_build_groups(self._tests)
else:
self._build_groups = [ [item] for item in self._tests ]

# Build group to exeroot map
self._build_group_exeroots = {}
for build_group in self._build_groups:
self._build_group_exeroots[build_group] = None

logger.debug("Build groups are:")
for build_group in self._build_groups:
for test_name in build_group:
logger.debug("{}{}".format(" " if test_name == build_group[0] else " ", test_name))

# By the end of this constructor, this program should never hard abort,
# instead, errors will be placed in the TestStatus files for the various
# tests cases
Expand Down Expand Up @@ -648,14 +666,15 @@ def _xml_phase(self, test):
case.set_value("SAVE_TIMING", self._save_timing)

# handle single-exe here, all cases will use the EXEROOT from
# the first case.
first_test = self._first_test()
if self._single_exe:
if test == first_test:
self._single_exe_root = case.get_value("EXEROOT")
else:
expect(self._single_exe_root is not None, "Programming error for single_exe, missing root")
case.set_value("EXEROOT", self._single_exe_root)
# the first case in the build group
is_first_test, _, my_build_group = self._get_build_group(test)
if is_first_test:
expect(self._build_group_exeroots[my_build_group] is None, "Should not already have exeroot")
self._build_group_exeroots[my_build_group] = case.get_value("EXEROOT")
else:
build_group_exeroot = self._build_group_exeroots[my_build_group]
expect(build_group_exeroot is not None, "Should already have exeroot")
case.set_value("EXEROOT", build_group_exeroot)

# Scale back build parallelism on systems with few cores
if self._model_build_cost > self._proc_pool:
Expand All @@ -680,8 +699,8 @@ def _setup_phase(self, test):
###########################################################################
def _sharedlib_build_phase(self, test):
###########################################################################
first_test = self._first_test()
if self._single_exe and test != first_test:
is_first_test, first_test, _ = self._get_build_group(test)
if not is_first_test:
if self._get_test_status(first_test, phase=SHAREDLIB_BUILD_PHASE) == TEST_PASS_STATUS:
return True, ""
else:
Expand All @@ -691,17 +710,22 @@ def _sharedlib_build_phase(self, test):
return self._shell_cmd_for_phase(test, "./case.build --sharedlib-only", SHAREDLIB_BUILD_PHASE, from_dir=test_dir)

###########################################################################
def _first_test(self):
def _get_build_group(self, test):
###########################################################################
return list(self._tests.keys())[0]
for build_group in self._build_groups:
if test in build_group:
return test == build_group[0], build_group[0], build_group

expect(False, "No build group for test '{}'".format(test))

###########################################################################
def _model_build_phase(self, test):
###########################################################################
is_first_test, first_test, _ = self._get_build_group(test)

test_dir = self._get_test_dir(test)

first_test = self._first_test()
if self._single_exe and test != first_test:
if not is_first_test:
if self._get_test_status(first_test, phase=MODEL_BUILD_PHASE) == TEST_PASS_STATUS:
with Case(test_dir, read_only=False) as case:
post_build(case, [], build_complete=True, save_build_provenance=False)
Expand Down Expand Up @@ -752,12 +776,17 @@ def _run_catch_exceptions(self, test, phase, run):
###########################################################################
def _get_procs_needed(self, test, phase, threads_in_flight=None, no_batch=False):
###########################################################################
# If in single_exe mode, we must wait for the first case to complete building
# before starting other cases.
first_test = self._first_test()
if self._single_exe and test != first_test and \
self._get_test_status(first_test, phase=MODEL_BUILD_PHASE) == TEST_PEND_STATUS:
return self._proc_pool + 1
# For build groups, we must wait for the first case to complete XML, SHAREDLIB,
# and MODEL_BUILD phases before the other cases can do those phases
is_first_test, first_test, _ = self._get_build_group(test)

if not is_first_test:
build_group_dep_phases = [XML_PHASE, SHAREDLIB_BUILD_PHASE, MODEL_BUILD_PHASE]
if phase in build_group_dep_phases:
if self._get_test_status(first_test, phase=phase) == TEST_PEND_STATUS:
return self._proc_pool + 1
else:
return 1

if phase == RUN_PHASE and (self._no_batch or no_batch):
test_dir = self._get_test_dir(test)
Expand Down Expand Up @@ -833,8 +862,10 @@ def _consumer(self, test, test_phase, phase_method):

logger.info(status_str)

is_first_test = self._get_build_group(test)[0]

if test_phase in [CREATE_NEWCASE_PHASE, XML_PHASE] or \
(self._single_exe and test != self._first_test() and test_phase in [SHAREDLIB_BUILD_PHASE, MODEL_BUILD_PHASE]):
(not is_first_test and test_phase in [SHAREDLIB_BUILD_PHASE, MODEL_BUILD_PHASE]):
# These are the phases for which TestScheduler is responsible for
# updating the TestStatus file
self._update_test_status_file(test, test_phase, status)
Expand Down
63 changes: 57 additions & 6 deletions cime/scripts/lib/get_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
# "inherit" : (suite1, suite2, ...), # Optional. Suites to inherit tests from. Default is None. Tuple, list, or str.
# "time" : "HH:MM:SS", # Optional. Recommended upper-limit on test time.
# "share" : True|False, # Optional. If True, all tests in this suite share a build. Default is False.
# "tests" : (test1, test2, ...) # Optional. The list of tests for this suite. See above for format. Tuple, list, or str.
# "tests" : (test1, test2, ...) # Optional. The list of tests for this suite. See above for format. Tuple, list, or str. This is the ONLY inheritable attribute.
# }

_CIME_TESTS = {
Expand Down Expand Up @@ -221,6 +221,61 @@ def get_test_suite(suite, machine=None, compiler=None, skip_inherit=False):

return tests

###############################################################################
def suite_has_test(suite, test_full_name, skip_inherit=False):
###############################################################################
    """
    Report whether test_full_name is one of the tests in the given suite.

    The machine and compiler used to expand the suite are taken from the
    test name itself, so test_full_name must be a full test name (one that
    includes a machine); expect() is invoked with a false condition if the
    parsed machine is None. skip_inherit is forwarded unchanged to
    get_test_suite.
    """
    name_parts = CIME.utils.parse_test_name(test_full_name)
    # Only the machine and compiler fields of the parsed name are needed here
    _, _, _, _, machine, compiler, _ = name_parts
    expect(machine is not None, "{} is not a full test name".format(test_full_name))

    suite_tests = get_test_suite(suite, machine=machine, compiler=compiler, skip_inherit=skip_inherit)
    return test_full_name in suite_tests

###############################################################################
def get_build_groups(tests):
###############################################################################
    """
    Given a list of tests, return a list of tuples, with each tuple holding
    the names of a group of tests that can share executables.

    Tests are visited in the order given. A test joins the first existing
    group whose accumulated share-enabled suites overlap with its own;
    otherwise it starts a new group. A test that belongs to no share-enabled
    suite therefore always ends up alone in its group. Groups are returned
    in the order in which their first member was encountered.
    >>> tests = ["SMS_P2.f19_g16_rx1.A.melvin_gnu", "SMS_P4.f19_g16_rx1.A.melvin_gnu", "SMS_P2.f19_g16_rx1.X.melvin_gnu", "SMS_P4.f19_g16_rx1.X.melvin_gnu", "TESTRUNSLOWPASS_P1.f19_g16_rx1.A.melvin_gnu", "TESTRUNSLOWPASS_P1.ne30_g16_rx1.A.melvin_gnu"]
    >>> get_build_groups(tests)
    [('SMS_P2.f19_g16_rx1.A.melvin_gnu', 'SMS_P4.f19_g16_rx1.A.melvin_gnu'), ('SMS_P2.f19_g16_rx1.X.melvin_gnu', 'SMS_P4.f19_g16_rx1.X.melvin_gnu'), ('TESTRUNSLOWPASS_P1.f19_g16_rx1.A.melvin_gnu',), ('TESTRUNSLOWPASS_P1.ne30_g16_rx1.A.melvin_gnu',)]
    """
    # Only suites whose "share" entry (index 2 of the suite data) is truthy
    # can pull tests into a common build
    sharing_suites = [name for name in get_test_suites() if get_test_data(name)[2]]

    # Each entry is a pair: ([member tests], set(share suites seen so far))
    groups = []

    # Divide tests up into build groups. Assumes that build-compatibility is transitive
    for test in tests:
        test_suites = set(name for name in sharing_suites
                          if suite_has_test(name, test, skip_inherit=True))

        # Look for the first existing group that has a share suite in common
        home = None
        if test_suites:
            for candidate in groups:
                if candidate[1] & test_suites:
                    home = candidate
                    break

        if home is None:
            # Nothing matched, this test is in a build group of its own
            groups.append(([test], test_suites))
        else:
            members, group_suites = home
            members.append(test)
            # Widen the group's suite set so later tests can match through
            # any suite contributed by this test
            group_suites.update(test_suites)

    return [tuple(members) for members, _ in groups]

###############################################################################
def infer_machine_name_from_tests(testargs):
###############################################################################
Expand Down Expand Up @@ -322,15 +377,11 @@ def get_recommended_test_time(test_full_name):
>>> get_recommended_test_time("PET_Ln20.ne30_ne30.FC5.sandiatoss3_intel.cam-outfrq9s")
>>>
"""
_, _, _, _, machine, compiler, _ = CIME.utils.parse_test_name(test_full_name)
expect(machine is not None, "{} is not a full test name".format(test_full_name))

best_time = None
suites = get_test_suites()
for suite in suites:
tests = get_test_suite(suite, machine=machine, compiler=compiler, skip_inherit=True)
rec_time = get_test_data(suite)[1]
if test_full_name in tests and rec_time is not None and \
if suite_has_test(suite, test_full_name, skip_inherit=True) and rec_time is not None and \
(best_time is None or convert_to_seconds(rec_time) < convert_to_seconds(best_time)):
best_time = rec_time

Expand Down

0 comments on commit 6cd3435

Please sign in to comment.