Merge pull request #5721 from wxtim/feature.sim_mode_at_runtime
Feature: Sim mode at runtime
hjoliver authored Mar 15, 2024
2 parents ae5709d + 9186ad1 commit a9becc2
Showing 19 changed files with 732 additions and 137 deletions.
1 change: 1 addition & 0 deletions changes.d/5721.feat.md
@@ -0,0 +1 @@
Allow task simulation mode settings to be changed dynamically using `cylc broadcast`.
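As a hedged illustration of the feature described above (the workflow, task, and cycle point names below are placeholders, not part of this change), such a broadcast might look like:

```console
# Override a task's simulation settings while the workflow is running.
# Any [simulation] item can be targeted, including the fail cycle points
# this change makes dynamically changeable.
cylc broadcast -p 2024 -n model \
    -s '[simulation]speedup factor = 60' \
    -s '[simulation]fail cycle points = 2024' \
    my_workflow
```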
15 changes: 14 additions & 1 deletion cylc/flow/broadcast_mgr.py
@@ -30,11 +30,12 @@
from cylc.flow.cfgspec.workflow import SPEC
from cylc.flow.cycling.loader import get_point, standardise_point_string
from cylc.flow.exceptions import PointParsingError
from cylc.flow.parsec.util import listjoin
from cylc.flow.parsec.util import listjoin, pdeepcopy, poverride
from cylc.flow.parsec.validate import BroadcastConfigValidator

if TYPE_CHECKING:
from cylc.flow.id import Tokens
from cylc.flow.task_proxy import TaskProxy


ALL_CYCLE_POINTS_STRS = ["*", "all-cycle-points", "all-cycles"]
@@ -179,6 +180,18 @@ def get_broadcast(self, tokens: 'Optional[Tokens]' = None) -> dict:
addict(ret, self.broadcasts[cycle][namespace])
return ret

def get_updated_rtconfig(self, itask: 'TaskProxy') -> dict:
"""Retrieve updated rtconfig for a single task proxy"""
overrides = self.get_broadcast(
itask.tokens
)
if overrides:
rtconfig = pdeepcopy(itask.tdef.rtconfig)
poverride(rtconfig, overrides, prepend=True)
else:
rtconfig = itask.tdef.rtconfig
return rtconfig

def load_db_broadcast_states(self, row_idx, row):
"""Load broadcast variables from runtime DB broadcast states row."""
if row_idx == 0:
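For readers unfamiliar with the parsec helpers, the merge that `get_updated_rtconfig` performs can be sketched with plain dicts. This is only an approximation of `pdeepcopy`/`poverride` (it ignores the `prepend` list handling) and the function name is illustrative:

```python
import copy
from typing import Any, Dict


def merge_broadcast(rtconfig: Dict[str, Any],
                    overrides: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of rtconfig with broadcast overrides merged in.

    The base config is never mutated; broadcast values win over defaults.
    """
    if not overrides:
        return rtconfig
    merged = copy.deepcopy(rtconfig)
    for key, value in overrides.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = merge_broadcast(merged[key], value)
        else:
            merged[key] = value
    return merged


# A broadcast that changes only the simulated speedup factor:
base = {'simulation': {'speedup factor': 1, 'fail cycle points': []}}
print(merge_broadcast(base, {'simulation': {'speedup factor': 60}}))
# -> {'simulation': {'speedup factor': 60, 'fail cycle points': []}}
```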
6 changes: 6 additions & 0 deletions cylc/flow/cfgspec/workflow.py
@@ -1288,6 +1288,12 @@ def get_script_common_text(this: str, example: Optional[str] = None):
Task instances must be set to fail by
:cylc:conf:`[..]fail cycle points`.
.. note::
This setting is designed for use with automatic
retries. Subsequent manual submissions will not
change the outcome of the task.
''')
Conf('disable task event handlers', VDR.V_BOOLEAN, True,
desc='''
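For reference, the simulation failure settings discussed above are configured in `flow.cylc` roughly as follows (the task name, cycle point, and retry delay are illustrative only):

```ini
[runtime]
    [[model]]
        execution retry delays = PT6S
        [[[simulation]]]
            default run length = PT20S
            # Simulated instances at this point fail...
            fail cycle points = 20240101T0000Z
            # ...but only on the first try, so the automatic retry succeeds.
            fail try 1 only = True
```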
4 changes: 3 additions & 1 deletion cylc/flow/network/resolvers.py
@@ -779,7 +779,9 @@ def broadcast(
cycle_points, namespaces, settings)
if mode == 'clear_broadcast':
return self.schd.task_events_mgr.broadcast_mgr.clear_broadcast(
cycle_points, namespaces, settings)
point_strings=cycle_points,
namespaces=namespaces,
cancel_settings=settings)
if mode == 'expire_broadcast':
return self.schd.task_events_mgr.broadcast_mgr.expire_broadcast(
cutoff)
1 change: 1 addition & 0 deletions cylc/flow/rundb.py
@@ -259,6 +259,7 @@ class CylcWorkflowDAO:
["flow_nums"],
["is_manual_submit", {"datatype": "INTEGER"}],
["try_num", {"datatype": "INTEGER"}],
# This is used to store simulation task start time across restarts.
["time_submit"],
["time_submit_exit"],
["submit_status", {"datatype": "INTEGER"}],
5 changes: 4 additions & 1 deletion cylc/flow/scheduler.py
@@ -1757,7 +1757,10 @@ async def main_loop(self) -> None:
if (
self.pool.config.run_mode('simulation')
and sim_time_check(
self.message_queue, self.pool.get_tasks())
self.task_events_mgr,
self.pool.get_tasks(),
self.workflow_db_mgr,
)
):
# A simulated task state change occurred.
self.reset_inactivity_timer()
195 changes: 147 additions & 48 deletions cylc/flow/simulation.py
@@ -16,25 +16,118 @@
"""Utilities supporting simulation and skip modes
"""

from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
from time import time

from cylc.flow import LOG
from cylc.flow.cycling.loader import get_point
from cylc.flow.network.resolvers import TaskMsg
from cylc.flow.exceptions import PointParsingError
from cylc.flow.platforms import FORBIDDEN_WITH_PLATFORM
from cylc.flow.task_state import (
TASK_STATUS_RUNNING,
TASK_STATUS_FAILED,
TASK_STATUS_SUCCEEDED,
)
from cylc.flow.wallclock import get_current_time_string
from cylc.flow.wallclock import get_unix_time_from_time_string

from metomi.isodatetime.parsers import DurationParser

if TYPE_CHECKING:
from queue import Queue
from cylc.flow.cycling import PointBase
from cylc.flow.task_events_mgr import TaskEventsManager
from cylc.flow.task_proxy import TaskProxy
from cylc.flow.workflow_db_mgr import WorkflowDatabaseManager
from cylc.flow.cycling import PointBase


@dataclass
class ModeSettings:
"""A store of state for simulation modes.
Used instead of modifying the runtime config.
Args:
itask:
The task proxy this submission relates to.
broadcast_mgr:
The broadcast manager is used to apply any runtime alterations
before a simulated submission.
db_mgr:
The database manager must be provided for simulated jobs
that are being resumed after a workflow restart. It is used to
recover the job's original start time, from which its scheduled
finish time is recalculated.
Attrs:
simulated_run_length:
The length of time, in seconds, that this simulated job will take to run.
timeout:
The wall-clock time at which this simulated job will finish as
a Unix epoch time.
sim_task_fails:
True if this job is intended to fail when it finishes, else False.
"""
simulated_run_length: float = 0.0
sim_task_fails: bool = False
timeout: float = 0.0

def __init__(
self,
itask: 'TaskProxy',
db_mgr: 'WorkflowDatabaseManager',
rtconfig: Dict[str, Any]
):

# itask.summary['started_time'] and mode_settings.timeout need
# repopulating from the DB on workflow restart:
started_time = itask.summary['started_time']
try_num = None
if started_time is None:
# Get DB info
db_info = db_mgr.pri_dao.select_task_job(
itask.tokens['cycle'],
itask.tokens['task'],
itask.tokens['job'],
)

# Get the started time:
if db_info['time_submit']:
started_time = get_unix_time_from_time_string(
db_info["time_submit"])
itask.summary['started_time'] = started_time
else:
started_time = time()

# Get the try number:
try_num = db_info["try_num"]

# Parse fail cycle points:
if rtconfig != itask.tdef.rtconfig:
try:
rtconfig["simulation"][
"fail cycle points"
] = parse_fail_cycle_points(
rtconfig["simulation"]["fail cycle points"]
)
except PointParsingError as exc:
# The broadcast fail cycle point didn't parse
LOG.warning(
'Broadcast fail cycle point was invalid:\n'
f' {exc.args[0]}'
)
rtconfig['simulation'][
'fail cycle points'
] = itask.tdef.rtconfig['simulation']['fail cycle points']

# Calculate simulation info:
self.simulated_run_length = (
get_simulated_run_len(rtconfig))
self.sim_task_fails = sim_task_failed(
rtconfig['simulation'],
itask.point,
try_num or itask.get_try_num()
)
self.timeout = started_time + self.simulated_run_length
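The arithmetic behind these last few lines is simple: the simulated run length is the execution time limit scaled down by the speedup factor (falling back to a default run length when no limit is set), and the timeout is just the start time plus that length. A minimal sketch, with assumed helper names and durations given directly in seconds:

```python
import time
from typing import Optional


def simulated_run_len_sec(
    time_limit_sec: Optional[float],
    speedup_factor: Optional[float],
    default_run_len_sec: float,
) -> float:
    """Sketch: scale the execution time limit, else fall back to the default."""
    if time_limit_sec and speedup_factor:
        return time_limit_sec / speedup_factor
    return default_run_len_sec


# A 1-hour execution time limit with a speedup factor of 60 "runs" for 60s;
# the simulated job finishes once the wall clock passes this timeout.
run_len = simulated_run_len_sec(3600.0, 60.0, 600.0)
timeout = time.time() + run_len
```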


def configure_sim_modes(taskdefs, sim_mode):
@@ -46,23 +139,17 @@ def configure_sim_modes(taskdefs, sim_mode):
for tdef in taskdefs:
# Compute simulated run time by scaling the execution limit.
rtc = tdef.rtconfig
sleep_sec = get_simulated_run_len(rtc)

rtc['execution time limit'] = (
sleep_sec + DurationParser().parse(str(
rtc['simulation']['time limit buffer'])).get_seconds()
)

rtc['simulation']['simulated run length'] = sleep_sec
rtc['submission retry delays'] = [1]

# Generate dummy scripting.
rtc['init-script'] = ""
rtc['env-script'] = ""
rtc['pre-script'] = ""
rtc['post-script'] = ""
rtc['script'] = build_dummy_script(
rtc, sleep_sec) if dummy_mode else ""
if dummy_mode:
# Generate dummy scripting.
rtc['init-script'] = ""
rtc['env-script'] = ""
rtc['pre-script'] = ""
rtc['post-script'] = ""
rtc['script'] = build_dummy_script(
rtc, get_simulated_run_len(rtc))

disable_platforms(rtc)

@@ -77,12 +164,13 @@


def get_simulated_run_len(rtc: Dict[str, Any]) -> int:
"""Get simulated run time.
"""Calculate simulation run time from a task's config.
rtc = run time config
"""
limit = rtc['execution time limit']
speedup = rtc['simulation']['speedup factor']

if limit and speedup:
sleep_sec = (DurationParser().parse(
str(limit)).get_seconds() / speedup)
@@ -145,19 +233,26 @@ def parse_fail_cycle_points(
True
>>> this([])
[]
>>> this(None) is None
True
"""
f_pts: 'Optional[List[PointBase]]'
if 'all' in f_pts_orig:
f_pts: 'Optional[List[PointBase]]' = []
if (
f_pts_orig is None
or f_pts_orig and 'all' in f_pts_orig
):
f_pts = None
else:
elif f_pts_orig:
f_pts = []
for point_str in f_pts_orig:
f_pts.append(get_point(point_str).standardise())
return f_pts


def sim_time_check(
message_queue: 'Queue[TaskMsg]', itasks: 'List[TaskProxy]'
task_events_manager: 'TaskEventsManager',
itasks: 'List[TaskProxy]',
db_mgr: 'WorkflowDatabaseManager',
) -> bool:
"""Check if sim tasks have been "running" for as long as required.
@@ -166,38 +261,42 @@
Returns:
True if _any_ simulated task state has changed.
"""
sim_task_state_changed = False
now = time()
sim_task_state_changed: bool = False
for itask in itasks:
if itask.state.status != TASK_STATUS_RUNNING:
continue
# Started time is not set on restart
if itask.summary['started_time'] is None:
itask.summary['started_time'] = now
timeout = (
itask.summary['started_time'] +
itask.tdef.rtconfig['simulation']['simulated run length']
)
if now > timeout:
job_d = itask.tokens.duplicate(job=str(itask.submit_num))
now_str = get_current_time_string()
if sim_task_failed(
itask.tdef.rtconfig['simulation'],
itask.point,
itask.get_try_num()
):
message_queue.put(
TaskMsg(job_d, now_str, 'CRITICAL', TASK_STATUS_FAILED)

# This occurs if the workflow has been restarted.
if itask.mode_settings is None:
rtconfig = task_events_manager.broadcast_mgr.get_updated_rtconfig(
itask)
itask.mode_settings = ModeSettings(
itask,
db_mgr,
rtconfig
)

if now > itask.mode_settings.timeout:
if itask.mode_settings.sim_task_fails:
task_events_manager.process_message(
itask, 'CRITICAL', TASK_STATUS_FAILED,
flag=task_events_manager.FLAG_RECEIVED
)
else:
# Simulate message outputs.
for msg in itask.tdef.rtconfig['outputs'].values():
message_queue.put(
TaskMsg(job_d, now_str, 'DEBUG', msg)
)
message_queue.put(
TaskMsg(job_d, now_str, 'DEBUG', TASK_STATUS_SUCCEEDED)
task_events_manager.process_message(
itask, 'DEBUG', TASK_STATUS_SUCCEEDED,
flag=task_events_manager.FLAG_RECEIVED
)
# Simulate message outputs.
for msg in itask.tdef.rtconfig['outputs'].values():
task_events_manager.process_message(
itask, 'DEBUG', msg,
flag=task_events_manager.FLAG_RECEIVED
)

# We've finished this pseudo job, so delete all the mode settings.
itask.mode_settings = None
sim_task_state_changed = True
return sim_task_state_changed
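Tying the loop above together: for each running simulated task the check reduces to a timeout comparison followed by a fail-or-succeed message. A simplified sketch, reusing attribute and method names from the diff but otherwise illustrative:

```python
from time import time


def check_one(itask, task_events_manager) -> bool:
    """Return True if this simulated task has just 'finished' (sketch only)."""
    settings = itask.mode_settings  # built at submission or on restart
    if time() <= settings.timeout:
        return False  # still within its simulated run length
    if settings.sim_task_fails:
        task_events_manager.process_message(
            itask, 'CRITICAL', 'failed',
            flag=task_events_manager.FLAG_RECEIVED)
    else:
        task_events_manager.process_message(
            itask, 'DEBUG', 'succeeded',
            flag=task_events_manager.FLAG_RECEIVED)
    itask.mode_settings = None  # pseudo job finished; drop its settings
    return True
```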
