Skip to content

Commit

Permalink
Merge pull request #1948 from matthewrmshin/fix-stop-kill-fail
Browse files Browse the repository at this point in the history
Fix stop --kill on jobs-kill command fail
  • Loading branch information
arjclark authored Jul 25, 2016
2 parents 2275b49 + d96a59c commit 8f36630
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 34 deletions.
70 changes: 37 additions & 33 deletions lib/cylc/batch_sys_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,45 +380,49 @@ def job_kill(self, st_file_path):
"""
# SUITE_RUN_DIR/log/job/CYCLE/TASK/SUBMIT/job.status
self.configure_suite_run_dir(st_file_path.rsplit(os.sep, 6)[0])
st_file = open(st_file_path)
for line in st_file:
if line.startswith(self.CYLC_BATCH_SYS_NAME + "="):
batch_sys = self.get_inst(line.strip().split("=", 1)[1])
break
else:
return (1, "Cannot determine batch system from 'job.status' file")
st_file.seek(0, 0) # rewind
if getattr(batch_sys, "SHOULD_KILL_PROC_GROUP", False):
try:
st_file = open(st_file_path)
for line in st_file:
if line.startswith(TaskMessage.CYLC_JOB_PID + "="):
pid = line.strip().split("=", 1)[1]
if line.startswith(self.CYLC_BATCH_SYS_NAME + "="):
batch_sys = self.get_inst(line.strip().split("=", 1)[1])
break
else:
return (
1, "Cannot determine batch system from 'job.status' file")
st_file.seek(0, 0) # rewind
if getattr(batch_sys, "SHOULD_KILL_PROC_GROUP", False):
for line in st_file:
if line.startswith(TaskMessage.CYLC_JOB_PID + "="):
pid = line.strip().split("=", 1)[1]
try:
os.killpg(int(pid), SIGKILL)
except OSError as exc:
traceback.print_exc()
return (1, str(exc))
else:
return (0, "")
st_file.seek(0, 0) # rewind
if hasattr(batch_sys, "KILL_CMD_TMPL"):
for line in st_file:
if not line.startswith(self.CYLC_BATCH_SYS_JOB_ID + "="):
continue
job_id = line.strip().split("=", 1)[1]
command = shlex.split(
batch_sys.KILL_CMD_TMPL % {"job_id": job_id})
try:
os.killpg(int(pid), SIGKILL)
proc = Popen(command, stderr=PIPE)
except OSError as exc:
# subprocess.Popen has a bad habit of not setting the
# filename of the executable when it raises an OSError.
if not exc.filename:
exc.filename = command[0]
traceback.print_exc()
return (1, str(exc))
else:
return (0, "")
st_file.seek(0, 0) # rewind
if hasattr(batch_sys, "KILL_CMD_TMPL"):
for line in st_file:
if not line.startswith(self.CYLC_BATCH_SYS_JOB_ID + "="):
continue
job_id = line.strip().split("=", 1)[1]
command = shlex.split(
batch_sys.KILL_CMD_TMPL % {"job_id": job_id})
try:
proc = Popen(command, stderr=PIPE)
except OSError as exc:
# subprocess.Popen has a bad habit of not setting the
# filename of the executable when it raises an OSError.
if not exc.filename:
exc.filename = command[0]
traceback.print_exc()
return (1, str(exc))
else:
return (proc.wait(), proc.communicate()[1])
return (1, "Cannot determine batch job ID from 'job.status' file")
return (proc.wait(), proc.communicate()[1])
return (1, "Cannot determine batch job ID from 'job.status' file")
except IOError as exc:
return (1, str(exc))

def job_submit(self, job_file_path, remote_mode):
"""Submit a job file.
Expand Down
3 changes: 2 additions & 1 deletion lib/cylc/task_pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -908,7 +908,8 @@ def _manip_task_jobs_callback(
for job_log_dir in job_log_dirs:
point, name, submit_num = job_log_dir.split(os.sep, 2)
itask = tasks[(point, name, submit_num)]
out += "|".join([ctx.timestamp, job_log_dir, "1"]) + "\n"
out += (BATCH_SYS_MANAGER.OUT_PREFIX_SUMMARY +
"|".join([ctx.timestamp, job_log_dir, "1"]) + "\n")
for line in out.splitlines(True):
for prefix, callback in handlers:
if line.startswith(prefix):
Expand Down
36 changes: 36 additions & 0 deletions tests/shutdown/06-kill-fail.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/bin/bash
# THIS FILE IS PART OF THE CYLC SUITE ENGINE.
# Copyright (C) 2008-2016 NIWA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#-------------------------------------------------------------------------------
# Test kill command fail on shutdown --kill.
#
# Scenario: while task t1 is running, its "job.status" file is moved out of
# the way so that the job kill issued by "cylc shutdown --kill" cannot read
# it and therefore fails. The shutdown command itself is still expected to
# exit successfully.
#
# The helpers used below (set_test_number, install_suite, run_ok, poll,
# purge_suite) and the variables TEST_NAME_BASE / SUITE_NAME are not defined
# in this file; presumably they are provided by the sourced "test_header".
. "$(dirname "$0")/test_header"

# Three run_ok checks follow: validate, run, shutdown.
set_test_number 3

install_suite "${TEST_NAME_BASE}" "${TEST_NAME_BASE}"
run_ok "${TEST_NAME_BASE}-validate" cylc validate "${SUITE_NAME}"
run_ok "${TEST_NAME_BASE}-run" cylc run "${SUITE_NAME}"
# Job log directory layout is log/job/CYCLE/TASK/SUBMIT, so JLOGD is the
# directory for cycle point "1", task "t1", submit number "01".
LOGD="$(cylc get-global-config --print-run-dir)/${SUITE_NAME}/log/job"
JLOGD="${LOGD}/1/t1/01"
# NOTE(review): the intent appears to be "wait until job.status exists and
# contains CYLC_JOB_INIT_TIME", i.e. until the job has actually started.
# This relies on poll repeating while its command succeeds - confirm against
# the poll definition in test_header.
poll test '!' -f "${JLOGD}/job.status"
poll '!' grep -q 'CYLC_JOB_INIT_TIME' "${JLOGD}/job.status" 2>'/dev/null'
# Hide the status file so the kill attempted during shutdown cannot find it.
mv "${JLOGD}/job.status" "${JLOGD}/job.status.old"
# The shutdown must still succeed even though the job kill fails.
run_ok "${TEST_NAME_BASE}-shutdown" \
cylc shutdown --kill --max-polls=10 --interval=2 "${SUITE_NAME}"
# Restore the status file and kill the job for real so that it is not left
# running after the test; the outcome of this clean-up kill is not checked
# and its output is discarded.
mv "${JLOGD}/job.status.old" "${JLOGD}/job.status"
cylc jobs-kill "${LOGD}" '1/t1/01' 1>'/dev/null' 2>&1
purge_suite "${SUITE_NAME}"
exit
12 changes: 12 additions & 0 deletions tests/shutdown/06-kill-fail/suite.rc
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Minimal suite for the "shutdown --kill with failing job kill" test:
# a single long-running task that the test interferes with while it runs.
[cylc]
[[event hooks]]
# Abort the suite if the timeout event below fires, so that a broken test
# run does not hang indefinitely.
# NOTE(review): exact trigger condition of the suite timeout event should be
# confirmed against the cylc suite.rc reference.
abort on timeout = True
# ISO 8601 duration: 1 minute.
timeout = PT1M

[scheduling]
[[dependencies]]
# One task, no dependencies.
graph = t1

[runtime]
[[t1]]
# Keep the job alive long enough for the test to move its job.status file
# away and issue the shutdown.
script = sleep 60

0 comments on commit 8f36630

Please sign in to comment.