Skip to content

Commit

Permalink
Also cancel a node block body if the associated node is removed.
Browse files Browse the repository at this point in the history
  • Loading branch information
jglick committed Apr 30, 2019
1 parent eaebe34 commit 58b127f
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import hudson.Util;
import hudson.init.Terminator;
import hudson.model.Node;
import hudson.model.Result;
import hudson.model.TaskListener;
import hudson.remoting.Channel;
import hudson.remoting.ChannelClosedException;
Expand Down Expand Up @@ -69,12 +70,14 @@
import org.jenkinsci.plugins.workflow.actions.LabelAction;
import org.jenkinsci.plugins.workflow.graph.FlowNode;
import org.jenkinsci.plugins.workflow.steps.AbstractStepExecutionImpl;
import org.jenkinsci.plugins.workflow.steps.FlowInterruptedException;
import org.jenkinsci.plugins.workflow.steps.Step;
import org.jenkinsci.plugins.workflow.steps.StepContext;
import org.jenkinsci.plugins.workflow.steps.StepDescriptor;
import org.jenkinsci.plugins.workflow.steps.StepExecution;
import org.jenkinsci.plugins.workflow.support.concurrent.Timeout;
import org.jenkinsci.plugins.workflow.support.concurrent.WithThreadName;
import org.jenkinsci.plugins.workflow.support.steps.ExecutorStepExecution;
import org.kohsuke.accmod.Restricted;
import org.kohsuke.accmod.restrictions.DoNotUse;
import org.kohsuke.accmod.restrictions.NoExternalUse;
Expand Down Expand Up @@ -318,18 +321,18 @@ static final class Execution extends AbstractStepExecutionImpl implements Runnab
return false;
}

private @CheckForNull FilePath getWorkspace() throws AbortException {
private @CheckForNull FilePath getWorkspace() throws IOException, InterruptedException {
if (ws == null) {
ws = FilePathUtils.find(node, remote);
if (ws == null) {
// Part of JENKINS-49707: check whether an agent has been removed.
// (Note that a Computer may be missing because a Node is offline,
// and conversely after removing a Node its Computer may remain for a while.
// Therefore we only fail here if _both_ are absent.)
// ExecutorStepExecution.NodeListener will normally do this first, so this is a fallback.
Jenkins j = Jenkins.getInstanceOrNull();
if (!node.isEmpty() && j != null && j.getNode(node) == null) {
// TODO or FlowInterruptedException?
throw new AbortException("Agent " + node + " was removed");
throw new FlowInterruptedException(Result.ABORTED, new ExecutorStepExecution.RemovedNodeCause());
}
LOGGER.log(Level.FINE, "Jenkins is not running, no such node {0}, or it is offline", node);
return null;
Expand Down Expand Up @@ -478,9 +481,11 @@ private void check() {
final FilePath workspace;
try {
workspace = getWorkspace();
} catch (AbortException x) {
} catch (IOException | InterruptedException x) {
recurrencePeriod = 0;
getContext().onFailure(x);
if (causeOfStoppage == null) { // do not doubly terminate
getContext().onFailure(x);
}
return;
}
if (workspace == null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,10 @@
import java.util.logging.Logger;
import javax.annotation.CheckForNull;
import javax.annotation.Nullable;
import jenkins.model.CauseOfInterruption;
import jenkins.model.Jenkins;
import jenkins.model.Jenkins.MasterComputer;
import jenkins.model.NodeListener;
import jenkins.model.queue.AsynchronousExecution;
import jenkins.security.QueueItemAuthenticator;
import jenkins.security.QueueItemAuthenticatorProvider;
Expand All @@ -68,11 +70,14 @@
import org.jenkinsci.plugins.workflow.graph.FlowNode;
import org.jenkinsci.plugins.workflow.graphanalysis.FlowScanningUtils;
import org.jenkinsci.plugins.workflow.steps.AbstractStepExecutionImpl;
import org.jenkinsci.plugins.workflow.steps.BodyExecution;
import org.jenkinsci.plugins.workflow.steps.BodyExecutionCallback;
import org.jenkinsci.plugins.workflow.steps.StepContext;
import org.jenkinsci.plugins.workflow.steps.durable_task.Messages;
import org.jenkinsci.plugins.workflow.support.actions.WorkspaceActionImpl;
import org.jenkinsci.plugins.workflow.support.concurrent.Timeout;
import org.jenkinsci.plugins.workflow.support.pickles.ExecutorPickle;
import org.jenkinsci.plugins.workflow.support.pickles.WorkspaceListLeasePickle;
import org.kohsuke.accmod.Restricted;
import org.kohsuke.accmod.restrictions.DoNotUse;
import org.kohsuke.accmod.restrictions.NoExternalUse;
Expand Down Expand Up @@ -248,6 +253,40 @@ public void stop(Throwable cause) throws Exception {

}

@Extension public static final class RemovedNodeListener extends NodeListener {
@Override protected void onDeleted(Node node) {
Computer c = node.toComputer();
if (c == null || c.isOnline()) {
LOGGER.fine(() -> "computer for " + node.getNodeName() + " was missing or online, skipping");
return;
}
for (Executor e : c.getExecutors()) {
Queue.Executable exec = e.getCurrentExecutable();
if (exec instanceof PlaceholderTask.PlaceholderExecutable) {
PlaceholderTask task = ((PlaceholderTask.PlaceholderExecutable) exec).getParent();
TaskListener listener = TaskListener.NULL;
try {
listener = task.context.get(TaskListener.class);
} catch (Exception x) {
LOGGER.log(Level.WARNING, null, x);
}
if (task.body == null) {
listener.getLogger().println("Agent " + node.getNodeName() + " was deleted, but do not have a node body to cancel");
continue;
}
listener.getLogger().println("Agent " + node.getNodeName() + " was deleted; cancelling node body");
task.body.cancel(new RemovedNodeCause());
}
}
}
}

public static final class RemovedNodeCause extends CauseOfInterruption {
@Override public String getShortDescription() {
return "Agent was removed";
}
}

/** Transient handle of a running executor task. */
private static final class RunningTask {
/** null until placeholder executable runs */
Expand Down Expand Up @@ -278,6 +317,17 @@ public static final class PlaceholderTask implements ContinuedTask, Serializable
*/
private String cookie;

/**
* Needed for {@link BodyExecution#cancel}.
* {@code transient} because we cannot save a {@link BodyExecution} in {@link PlaceholderTask}:
* {@link ExecutorPickle} is written to the stream first, which holds a {@link PlaceholderTask},
* and the {@link BodyExecution} holds {@link PlaceholderTask.Callback} whose {@link WorkspaceList.Lease}
* is not processed by {@link WorkspaceListLeasePickle} since pickles are not recursive.
* So we make a best effort and only try to cancel a body within the current session.
* @see RemovedNodeListener
*/
private transient @CheckForNull BodyExecution body;

/** {@link Authentication#getName} of user of build, if known. */
private final @CheckForNull String auth;

Expand Down Expand Up @@ -781,7 +831,7 @@ private final class PlaceholderExecutable implements ContinuableExecutable, Acce
flowNode.addAction(new WorkspaceActionImpl(workspace, flowNode));
}
listener.getLogger().println("Running on " + ModelHyperlinkNote.encodeTo(node) + " in " + workspace);
context.newBodyInvoker()
body = context.newBodyInvoker()
.withContexts(exec, computer, env, workspace)
.withCallback(new Callback(cookie, lease))
.start();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@
import org.jenkinsci.plugins.workflow.steps.StepContext;
import org.jenkinsci.plugins.workflow.steps.StepDescriptor;
import org.jenkinsci.plugins.workflow.steps.StepExecution;
import org.jenkinsci.plugins.workflow.support.steps.ExecutorStepExecution;
import org.jenkinsci.plugins.workflow.support.visualization.table.FlowGraphTable;
import org.jenkinsci.plugins.workflow.support.visualization.table.FlowGraphTable.Row;
import static org.junit.Assert.*;
Expand Down Expand Up @@ -614,15 +615,15 @@ private static final class HelloNote extends ConsoleNote<Object> {

@Issue("JENKINS-49707")
@Test public void removingAgentIsFatal() throws Exception {
logging.record(DurableTaskStep.class, Level.FINE).record(FileMonitoringTask.class, Level.FINE);
logging.record(DurableTaskStep.class, Level.FINE).record(FileMonitoringTask.class, Level.FINE).record(ExecutorStepExecution.class, Level.FINE);
DumbSlave s = j.createSlave("remote", null, null);
WorkflowJob p = j.jenkins.createProject(WorkflowJob.class, "p");
p.setDefinition(new CpsFlowDefinition("node('remote') {isUnix() ? sh('sleep 1000000') : bat('ping -t 127.0.0.1 > nul')}", true));
WorkflowRun b = p.scheduleBuild2(0).waitForStart();
j.waitForMessage(Functions.isWindows() ? ">ping" : "+ sleep", b);
j.jenkins.removeNode(s);
j.assertBuildStatus(Result.FAILURE, j.waitForCompletion(b));
j.assertLogContains("Agent remote was removed", b);
j.assertBuildStatus(Result.ABORTED, j.waitForCompletion(b));
j.waitForMessage(new ExecutorStepExecution.RemovedNodeCause().getShortDescription(), b);
}

@Issue("JENKINS-44521")
Expand Down

0 comments on commit 58b127f

Please sign in to comment.