Skip to content

Commit

Permalink
Improve log message for timeout and poll intervals
Browse files Browse the repository at this point in the history
Info on submission/execution timeout.
Info on polling schedule.
Info on issuing a poll, and estimated delay for next one.
  • Loading branch information
matthewrmshin committed Mar 9, 2018
1 parent fd9dc8c commit 808550f
Show file tree
Hide file tree
Showing 8 changed files with 101 additions and 60 deletions.
12 changes: 5 additions & 7 deletions lib/cylc/task_action_timer.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,11 @@ def __init__(self, ctx=None, delays=None, num=0, delay=None, timeout=None):
self.timeout = timeout
self.is_waiting = False

def delay_as_seconds(self):
"""Return the delay as PTnS, where n is number of seconds."""
return get_seconds_as_interval_string(self.delay)
def delay_timeout_as_str(self):
"""Return a string in the form "delay (after timeout)"."""
return r"%s (after %s)" % (
get_seconds_as_interval_string(self.delay),
get_time_string_from_unix_time(self.timeout))

def is_delay_done(self, now=None):
"""Is timeout done?"""
Expand Down Expand Up @@ -99,7 +101,3 @@ def set_waiting(self):
def unset_waiting(self):
"""Unset waiting flag after an action has completed."""
self.is_waiting = False

def timeout_as_str(self):
"""Return the timeout as an ISO8601 date-time string."""
return get_time_string_from_unix_time(self.timeout)
61 changes: 36 additions & 25 deletions lib/cylc/task_events_mgr.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@
TASK_OUTPUT_SUBMITTED, TASK_OUTPUT_STARTED, TASK_OUTPUT_SUCCEEDED,
TASK_OUTPUT_FAILED)
from cylc.wallclock import (
get_current_time_string, get_seconds_as_interval_string,
get_current_time_string,
get_seconds_as_interval_string as intvl_as_str,
RE_DATE_TIME_FORMAT_EXTENDED)


Expand Down Expand Up @@ -150,21 +151,15 @@ def check_poll_time(itask, now=None):
return False
if itask.poll_timer.num is None:
itask.poll_timer.num = 0
# With execution time limit, poll can exhaust.
delay = itask.poll_timer.next(no_exhaust=True)
if delay is not None:
LOG.info(
'next job poll in %s (after %s)' % (
itask.poll_timer.delay_as_seconds(),
itask.poll_timer.timeout_as_str()),
itask=itask)
itask.poll_timer.next(no_exhaust=True)
return True

def check_job_time(self, itask, now):
"""Check/handle job timeout and poll timer"""
can_poll = self.check_poll_time(itask, now)
if itask.timeout is None or now <= itask.timeout:
return can_poll
# Timeout reached for task, emit event and reset itask.timeout
if itask.state.status == TASK_STATUS_RUNNING:
time_ref = itask.summary['started_time']
event = 'execution timeout'
Expand All @@ -173,9 +168,9 @@ def check_job_time(self, itask, now):
event = 'submission timeout'
msg = event
try:
msg += ' after %s' % get_seconds_as_interval_string(
itask.timeout - time_ref)
msg += ' after %s' % intvl_as_str(itask.timeout - time_ref)
except (TypeError, ValueError):
# Badness in time_ref?
pass
itask.timeout = None # emit event only once
if msg and event:
Expand Down Expand Up @@ -264,14 +259,13 @@ def process_events(self, schd_ctx):
# Report retries and delayed 1st try
tmpl = None
if timer.num > 1:
tmpl = "%s/%s/%02d %s failed, retrying in %s (after %s)"
tmpl = "%s/%s/%02d %s failed, retrying in %s"
elif timer.delay:
tmpl = "%s/%s/%02d %s will run after %s (after %s)"
tmpl = "%s/%s/%02d %s will run after %s"
if tmpl:
LOG.debug(tmpl % (
point, name, submit_num, key1,
timer.delay_as_seconds(),
timer.timeout_as_str()))
timer.delay_timeout_as_str()))
# Ready to run?
if not timer.is_delay_done() or (
# Avoid flooding user's mail box with mail notification.
Expand Down Expand Up @@ -643,11 +637,9 @@ def _process_message_failed(self, itask, event_time, message):
itask.submit_num, "failed"), itask=itask)
else:
# There is a retry lined up
timeout_str = (
itask.try_timers[TASK_STATUS_RETRYING].timeout_as_str())
delay_msg = "retrying in %s" % (
itask.try_timers[TASK_STATUS_RETRYING].delay_as_seconds())
msg = "failed, %s (after %s)" % (delay_msg, timeout_str)
itask.try_timers[TASK_STATUS_RETRYING].delay_timeout_as_str())
msg = "failed, %s" % (delay_msg)
LOG.info("job(%02d) %s" % (itask.submit_num, msg), itask=itask)
itask.summary['latest_message'] = msg
self.setup_event_handlers(
Expand Down Expand Up @@ -717,10 +709,8 @@ def _process_message_submit_failed(self, itask, event_time):
else:
# There is a submission retry lined up.
timer = itask.try_timers[TASK_STATUS_SUBMIT_RETRYING]
timeout_str = timer.timeout_as_str()
delay_msg = "submit-retrying in %s" % timer.delay_as_seconds()
msg = "%s, %s (after %s)" % (
self.EVENT_SUBMIT_FAILED, delay_msg, timeout_str)
delay_msg = "submit-retrying in %s" % timer.delay_timeout_as_str()
msg = "%s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg)
LOG.info("job(%02d) %s" % (itask.submit_num, msg), itask=itask)
itask.summary['latest_message'] = msg
self.setup_event_handlers(
Expand Down Expand Up @@ -924,7 +914,8 @@ def _reset_job_timers(self, itask):
delays = [] # polling intervals
if itask.state.status == TASK_STATUS_RUNNING:
timeref = itask.summary['started_time']
timeout = self._get_events_conf(itask, 'execution timeout')
timeout_key = 'execution timeout'
timeout = self._get_events_conf(itask, timeout_key)
delays = self.get_host_conf(
itask, 'execution polling intervals', skey='job',
default=[900]) # Default 15 minute intervals
Expand All @@ -949,14 +940,34 @@ def _reset_job_timers(self, itask):
delays += time_limit_delays
else: # if itask.state.status == TASK_STATUS_SUBMITTED:
timeref = itask.summary['submitted_time']
timeout = self._get_events_conf(itask, 'submission timeout')
timeout_key = 'submission timeout'
timeout = self._get_events_conf(itask, timeout_key)
delays = self.get_host_conf(
itask, 'submission polling intervals', skey='job',
default=[900]) # Default 15 minute intervals
try:
itask.timeout = timeref + float(timeout)
timeout_str = intvl_as_str(timeout)
except (TypeError, ValueError):
itask.timeout = None
timeout_str = None
itask.poll_timer = TaskActionTimer(ctx=ctx, delays=delays)
# Log timeout and polling schedule
message = 'health check settings: %s=%s' % (timeout_key, timeout_str)
# Attempt to group idenitical consecutive delays as N*DELAY,...
if itask.poll_timer.delays:
items = [] # [(number of item - 1, item), ...]
for delay in itask.poll_timer.delays:
if items and items[-1][1] == delay:
items[-1][0] += 1
else:
items.append([0, delay])
message += ', polling intervals='
for num, item in items:
if num:
message += '%d*' % (num + 1)
message += '%s,' % intvl_as_str(item)
message += '...'
LOG.info(message, itask=itask)
# Set next poll time
self.check_poll_time(itask)
5 changes: 5 additions & 0 deletions lib/cylc/task_job_mgr.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,11 @@ def check_task_jobs(self, suite, task_pool):
for itask in task_pool.get_tasks():
if self.task_events_mgr.check_job_time(itask, now):
poll_tasks.add(itask)
if itask.poll_timer.delay is not None:
LOG.info(
'poll now, (next in %s)' % (
itask.poll_timer.delay_timeout_as_str()),
itask=itask)
if poll_tasks:
self.poll_task_jobs(suite, poll_tasks)

Expand Down
16 changes: 8 additions & 8 deletions tests/events/task-iso/bin/log-check.sh
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
#!/bin/bash
set -eu

echo "HELLO FROM log-check.sh shutdown handler for $CYLC_SUITE_NAME"
echo "HELLO FROM log-check.sh shutdown handler for ${CYLC_SUITE_NAME}"

# compare events.log with the reference version
# sorted so that event order doesn't matter

NEW_LOG=$EVNTLOG
REF_LOG=$CYLC_SUITE_DEF_PATH/events.log
sed -i 's/ (after .\+)$//' "${EVNTLOG}"
REF_LOG="${CYLC_SUITE_DEF_PATH}/events.log"

# difference with 'sort -u' (unique) because polling on timeouts may now
# result in multiple 'started' messages etc.
if ! diff -u <(sort -u $NEW_LOG) <(sort -u $REF_LOG); then
echo "ERROR: event handler output logs differ" >&2
diff -u <(sort -u $NEW_LOG) <(sort -u $REF_LOG) >&2
if ! diff -u <(sort -u "${EVNTLOG}") <(sort -u "${REF_LOG}") >&2; then
echo 'ERROR: event handler output logs differ' >&2
exit 1
else
echo "OK: event handler output logs agree"
echo 'OK: event handler output logs agree'
fi

echo "BYE FROM log-check.sh shutdown handler for $CYLC_SUITE_NAME"
echo "BYE FROM log-check.sh shutdown handler for ${CYLC_SUITE_NAME}"
16 changes: 8 additions & 8 deletions tests/events/task/bin/log-check.sh
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
#!/bin/bash
set -eu

echo "HELLO FROM log-check.sh shutdown handler for $CYLC_SUITE_NAME"
echo "HELLO FROM log-check.sh shutdown handler for ${CYLC_SUITE_NAME}"

# compare events.log with the reference version
# sorted so that event order doesn't matter

NEW_LOG=$EVNTLOG
REF_LOG=$CYLC_SUITE_DEF_PATH/events.log
sed -i 's/ (after .\+)$//' "${EVNTLOG}"
REF_LOG="${CYLC_SUITE_DEF_PATH}/events.log"

# difference with 'sort -u' (unique) because polling on timeouts may now
# result in multiple 'started' messages etc.
if ! diff -u <(sort -u $NEW_LOG) <(sort -u $REF_LOG); then
echo "ERROR: event handler output logs differ" >&2
diff -u <(sort -u $NEW_LOG) <(sort -u $REF_LOG) >&2
if ! diff -u <(sort -u "${EVNTLOG}") <(sort -u "${REF_LOG}") >&2; then
echo 'ERROR: event handler output logs differ' >&2
exit 1
else
echo "OK: event handler output logs agree"
echo 'OK: event handler output logs agree'
fi

echo "BYE FROM log-check.sh shutdown handler for $CYLC_SUITE_NAME"
echo "BYE FROM log-check.sh shutdown handler for ${CYLC_SUITE_NAME}"
16 changes: 13 additions & 3 deletions tests/job-poll/01-comm-method.t
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
# Test poll intervals is used from both global.rc and suite.rc
. "$(dirname "${0}")/test_header"
#-------------------------------------------------------------------------------
set_test_number 4
set_test_number 6
install_suite "${TEST_NAME_BASE}" "${TEST_NAME_BASE}"
#-------------------------------------------------------------------------------
run_ok "${TEST_NAME_BASE}-validate" cylc validate "${SUITE_NAME}"
Expand All @@ -34,8 +34,18 @@ suite_run_ok "${TEST_NAME_BASE}-run" \
#-------------------------------------------------------------------------------
TEST_NAME="${TEST_NAME_BASE}"
LOG_FILE="${SUITE_RUN_DIR}/log/suite/log"
run_ok "log" grep -Fq '[t1.1] -next job poll' "${LOG_FILE}"
run_ok "log" grep -Fq '[t2.1] -next job poll' "${LOG_FILE}"
grep_ok \
'\[t1\.1\] -health check settings: submission.*, polling intervals=10\*PT6S,...' \
"${LOG_FILE}"
grep_ok \
'\[t2\.1\] -health check settings: submission.*, polling intervals=10\*PT6S,...' \
"${LOG_FILE}"
grep_ok \
'\[t1\.1\] -health check settings: execution.*, polling intervals=10\*PT6S,...' \
"${LOG_FILE}"
grep_ok \
'\[t2\.1\] -health check settings: execution.*, polling intervals=10\*PT6S,...' \
"${LOG_FILE}"
#-------------------------------------------------------------------------------
purge_suite "${SUITE_NAME}"
exit
19 changes: 13 additions & 6 deletions tests/job-poll/02-intervals.t
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,19 @@ suite_run_ok "${TEST_NAME_BASE}-run" \
#-------------------------------------------------------------------------------
TEST_NAME="${TEST_NAME_BASE}"
LOG_FILE="${SUITE_RUN_DIR}/log/suite/log"
# t1.1 should get the submission polling intervals
run_ok "log" grep -Fq '[t1.1] -next job poll in PT2S' "${LOG_FILE}"
run_ok "log" grep -Fq '[t1.1] -next job poll in PT10S' "${LOG_FILE}"
# t2.1 should get the execution polling intervals
run_ok "log" grep -Fq '[t2.1] -next job poll in PT1S' "${LOG_FILE}"
run_ok "log" grep -Fq '[t2.1] -next job poll in PT6S' "${LOG_FILE}"
grep health "${LOG_FILE}" >&2
grep_ok \
'\[t1\.1\] -health check settings: submission.*, polling intervals=PT2S,6\*PT10S,...' \
"${LOG_FILE}"
grep_ok \
'\[t2\.1\] -health check settings: submission.*, polling intervals=PT2S,6\*PT10S,...' \
"${LOG_FILE}"
grep_ok \
'\[t1\.1\] -health check settings: execution.*, polling intervals=2\*PT1S,10\*PT6S,...' \
"${LOG_FILE}"
grep_ok \
'\[t2\.1\] -health check settings: execution.*, polling intervals=2\*PT1S,10\*PT6S,...' \
"${LOG_FILE}"
#-------------------------------------------------------------------------------
purge_suite "${SUITE_NAME}"
exit
16 changes: 13 additions & 3 deletions tests/jobscript/05-global-config.t
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
# Test that global config is used search for poll
. "$(dirname "${0}")/test_header"
#-------------------------------------------------------------------------------
set_test_number 4
set_test_number 6
install_suite "${TEST_NAME_BASE}" "${TEST_NAME_BASE}"
#-------------------------------------------------------------------------------
run_ok "${TEST_NAME_BASE}-validate" cylc validate "${SUITE_NAME}"
Expand All @@ -37,8 +37,18 @@ suite_run_ok "${TEST_NAME_BASE}-run" \
#-------------------------------------------------------------------------------
TEST_NAME="${TEST_NAME_BASE}"
LOG_FILE="${SUITE_RUN_DIR}/log/suite/log"
run_ok "log" grep -Fq '[bar.1] -next job poll' "${LOG_FILE}"
run_ok "log" grep -Fq '[foo.1] -next job poll' "${LOG_FILE}"
grep_ok \
'\[bar\.1\] -health check settings: submission.*, polling intervals=PT6S,...' \
"${LOG_FILE}"
grep_ok \
'\[foo\.1\] -health check settings: submission.*, polling intervals=PT12S,PT6S,...' \
"${LOG_FILE}"
grep_ok \
'\[bar\.1\] -health check settings: execution.*, polling intervals=PT18S,2\*PT12S,PT6S,...' \
"${LOG_FILE}"
grep_ok \
'\[foo\.1\] -health check settings: execution.*, polling intervals=PT12S,PT6S,...' \
"${LOG_FILE}"
#-------------------------------------------------------------------------------
purge_suite "${SUITE_NAME}"
exit

0 comments on commit 808550f

Please sign in to comment.