Skip to content

Commit

Permalink
[JENKINS-49707] If a KubernetesComputer disconnects, remove the… (#461)
Browse files Browse the repository at this point in the history
[JENKINS-49707] If a KubernetesComputer disconnects, remove the KubernetesSlave
  • Loading branch information
Vlatombe authored Jul 10, 2019
2 parents ec03063 + bb4e297 commit f991e78
Show file tree
Hide file tree
Showing 7 changed files with 212 additions and 5 deletions.
5 changes: 4 additions & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
buildPlugin(configurations: buildPlugin.recommendedConfigurations().findAll { it.platform == 'linux' })
buildPlugin(configurations: [
[platform: 'linux', jdk: '8', jenkins: null],
[platform: 'linux', jdk: '11', jenkins: null],
])
9 changes: 5 additions & 4 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,12 @@
<connectorHost />
<jenkins.host.address />
<java.level>8</java.level>
<jenkins.version>2.138.4</jenkins.version>
<jenkins.version>2.176.1</jenkins.version>
<no-test-jar>false</no-test-jar>
<useBeta>true</useBeta>
<surefire.rerunFailingTestsCount>0</surefire.rerunFailingTestsCount>
<pipeline-model-definition.version>1.3.7</pipeline-model-definition.version>
<workflow-support-plugin.version>3.3</workflow-support-plugin.version>
<workflow-step-api-plugin.version>2.20</workflow-step-api-plugin.version>
<slf4jVersion>1.7.26</slf4jVersion>
</properties>
Expand Down Expand Up @@ -145,19 +146,19 @@
<dependency>
<groupId>org.jenkins-ci.plugins.workflow</groupId>
<artifactId>workflow-support</artifactId>
<version>3.3</version>
<version>${workflow-support-plugin.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.jenkins-ci.plugins.workflow</groupId>
<artifactId>workflow-durable-task-step</artifactId>
<version>2.28</version>
<version>2.32</version>
<scope>test</scope>
</dependency>
<dependency> <!-- SemaphoreStep -->
<groupId>org.jenkins-ci.plugins.workflow</groupId>
<artifactId>workflow-support</artifactId>
<version>3.0</version>
<version>${workflow-support-plugin.version}</version>
<classifier>tests</classifier>
<scope>test</scope>
</dependency>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
/*
* Copyright 2019 CloudBees, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.csanchez.jenkins.plugins.kubernetes.pod.retention;

import hudson.Extension;
import hudson.model.Computer;
import hudson.model.Node;
import hudson.model.TaskListener;
import hudson.slaves.Cloud;
import hudson.slaves.ComputerListener;
import hudson.slaves.EphemeralNode;
import io.fabric8.kubernetes.api.model.Pod;
import io.fabric8.kubernetes.client.KubernetesClient;
import io.fabric8.kubernetes.client.KubernetesClientException;
import io.fabric8.kubernetes.client.Watcher;
import java.io.IOException;
import java.util.ArrayList;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.logging.Level;
import java.util.logging.Logger;
import jenkins.model.Jenkins;
import org.csanchez.jenkins.plugins.kubernetes.KubernetesCloud;
import org.csanchez.jenkins.plugins.kubernetes.KubernetesComputer;
import org.csanchez.jenkins.plugins.kubernetes.KubernetesSlave;

/**
* Checks for deleted pods corresponding to {@link KubernetesSlave} and ensures the node is removed from Jenkins too.
* <p>If the pod has been deleted, all of the associated state (running user processes, workspace, etc.) must also be gone;
* so there is no point in retaining this agent definition any further.
* ({@link KubernetesSlave} is not an {@link EphemeralNode}: it <em>does</em> support running across Jenkins restarts.)
* <p>Note that pod retention policies other than the default {@link Never} may disable this system,
* unless some external process or garbage collection policy results in pod deletion.
*/
@Extension
public class Reaper extends ComputerListener implements Watcher<Pod> {

private static final Logger LOGGER = Logger.getLogger(Reaper.class.getName());

/**
* Activate this feature only if and when some Kubernetes agent is actually used.
* Avoids touching the API server when this plugin is not even in use.
*/
private final AtomicBoolean activated = new AtomicBoolean();

@Override
public void onOnline(Computer c, TaskListener listener) throws IOException, InterruptedException {
if (c instanceof KubernetesComputer && activated.compareAndSet(false, true)) {
activate();
}
}

private void activate() {
LOGGER.fine("Activating reaper");
// First check all existing nodes to see if they still have active pods.
// (We may have missed deletion events while Jenkins was shut off,
// or pods may have been deleted before any Kubernetes agent was brought online.)
for (Node n : new ArrayList<>(Jenkins.get().getNodes())) {
if (!(n instanceof KubernetesSlave)) {
continue;
}
KubernetesSlave ks = (KubernetesSlave) n;
String ns = ks.getNamespace();
String name = ks.getPodName();
try {
// TODO more efficient to do a single (or paged) list request, but tricky since there may be multiple clouds,
// and even within a single cloud an agent pod is permitted to use a nondefault namespace,
// yet we do not want to do an unnamespaced pod list for RBAC reasons.
// Could use a hybrid approach: first list all pods in the configured namespace for all clouds;
// then go back and individually check any unmatched agents with their configured namespace.
if (ks.getKubernetesCloud().connect().pods().inNamespace(ns).withName(name).get() == null) {
LOGGER.info(() -> ns + "/" + name + " seems to have been deleted, so removing corresponding Jenkins agent");
Jenkins.get().removeNode(ks);
} else {
LOGGER.fine(() -> ns + "/" + name + " still seems to exist, OK");
}
} catch (Exception x) {
LOGGER.log(Level.WARNING, "failed to do initial reap check for " + ns + "/" + name, x);
}
}
// Now set up a watch for any subsequent pod deletions.
for (Cloud c : Jenkins.get().clouds) {
if (!(c instanceof KubernetesCloud)) {
continue;
}
KubernetesCloud kc = (KubernetesCloud) c;
try {
KubernetesClient client = kc.connect();
client.pods().inNamespace(client.getNamespace()).watch(this);
} catch (Exception x) {
LOGGER.log(Level.WARNING, "failed to set up watcher on " + kc.getDisplayName(), x);
}
}
}

@Override
public void eventReceived(Watcher.Action action, Pod pod) {
if (action == Watcher.Action.DELETED) {
String ns = pod.getMetadata().getNamespace();
String name = pod.getMetadata().getName();
for (Node n : new ArrayList<>(Jenkins.get().getNodes())) {
if (!(n instanceof KubernetesSlave)) {
continue;
}
KubernetesSlave ks = (KubernetesSlave) n;
if (ks.getNamespace().equals(ns) && ks.getPodName().equals(name)) {
LOGGER.info(() -> ns + "/" + name + " was just deleted, so removing corresponding Jenkins agent");
try {
Jenkins.get().removeNode(ks);
return;
} catch (Exception x) {
LOGGER.log(Level.WARNING, "failed to reap " + ns + "/" + name, x);
}
}
}
LOGGER.fine(() -> "received deletion notice for " + ns + "/" + name + " which does not seem to correspond to any Jenkins agent");
}
}

@Override
public void onClose(KubernetesClientException cause) {
// TODO ignore, or do we need to manually reattach the watcher?
// AllContainersRunningPodWatcher is not reattached, but this is expected to be short-lived,
// useful only until the containers of a single pod start running.
// (At least when using kubernetes-client/java, the connection gets closed after 2m on GKE
// and you need to rerun the watch. Does the fabric8io client wrap this?)
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
import org.csanchez.jenkins.plugins.kubernetes.PodAnnotation;
import org.csanchez.jenkins.plugins.kubernetes.PodTemplate;
import org.jenkinsci.plugins.workflow.job.WorkflowRun;
import org.jenkinsci.plugins.workflow.support.steps.ExecutorStepExecution;
import org.jenkinsci.plugins.workflow.test.steps.SemaphoreStep;
import org.junit.Before;
import org.junit.Rule;
Expand Down Expand Up @@ -352,6 +353,15 @@ public void runInPodWithRetention() throws Exception {
assertTrue(deletePods(cloud.connect(), getLabels(this, name), true));
}

@Issue("JENKINS-49707")
@Test
public void terminatedPod() throws Exception {
r.waitForMessage("+ sleep", b);
deletePods(cloud.connect(), getLabels(this, name), false);
r.assertBuildStatus(Result.ABORTED, r.waitForCompletion(b));
r.waitForMessage(new ExecutorStepExecution.RemovedNodeCause().getShortDescription(), b);
}

@Test
public void computerCantBeConfigured() throws Exception {
r.jenkins.setSecurityRealm(r.createDummySecurityRealm());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,18 +47,21 @@
import org.csanchez.jenkins.plugins.kubernetes.model.TemplateEnvVar;
import org.jenkinsci.plugins.workflow.job.WorkflowJob;
import org.jenkinsci.plugins.workflow.job.WorkflowRun;
import org.jenkinsci.plugins.workflow.support.steps.ExecutorStepExecution;
import org.junit.BeforeClass;
import org.junit.ClassRule;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import org.junit.rules.TestName;
import org.jvnet.hudson.test.BuildWatcher;
import org.jvnet.hudson.test.Issue;
import org.jvnet.hudson.test.JenkinsRule;
import org.jvnet.hudson.test.LoggerRule;
import org.jvnet.hudson.test.RestartableJenkinsNonLocalhostRule;

import hudson.model.Node;
import hudson.model.Result;
import hudson.slaves.DumbSlave;
import hudson.slaves.JNLPLauncher;
import hudson.slaves.NodeProperty;
Expand Down Expand Up @@ -188,6 +191,34 @@ public void runInPodWithRestartWithLongSleep() throws Exception {
});
}

@Issue("JENKINS-49707")
@Test
public void terminatedPodAfterRestart() throws Exception {
AtomicReference<String> projectName = new AtomicReference<>();
story.then(r -> {
configureCloud();
WorkflowRun b = getPipelineJobThenScheduleRun(r);
projectName.set(b.getParent().getFullName());
r.waitForMessage("+ sleep", b);
});
story.then(r -> {
WorkflowRun b = r.jenkins.getItemByFullName(projectName.get(), WorkflowJob.class).getBuildByNumber(1);
r.waitForMessage("Ready to run", b);
// Note that the test is cheating here slightly.
// The watch in Reaper is still running across the in-JVM restarts,
// whereas in production it would have been cancelled during the shutdown.
// But it does not matter since we are waiting for the agent to come back online after the restart,
// which is sufficient trigger to reactivate the reaper.
// Indeed we get two Reaper instances running, which independently remove the node.
deletePods(cloud.connect(), getLabels(this, name), false);
r.assertBuildStatus(Result.ABORTED, r.waitForCompletion(b));
r.waitForMessage(new ExecutorStepExecution.RemovedNodeCause().getShortDescription(), b);
// Currently the logic in ExecutorStepExecution cannot handle a Jenkins restart so it prints the following.
// It does not matter since DurableTaskStep redundantly implements the same check.
r.assertLogContains(" was deleted, but do not have a node body to cancel", b);
});
}

@Test
public void getContainerLogWithRestart() throws Exception {
AtomicReference<String> projectName = new AtomicReference<>();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
podTemplate(label: '$NAME', containers: [
containerTemplate(name: 'busybox', image: 'busybox', ttyEnabled: true, command: '/bin/cat'),
]) {
node ('$NAME') {
container('busybox') {
sh 'sleep 9999999'
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
package org.csanchez.jenkins.plugins.kubernetes.pipeline

podTemplate(label: '$NAME', containers: [
containerTemplate(name: 'busybox', image: 'busybox', ttyEnabled: true, command: '/bin/cat'),
]) {
node ('$NAME') {
container('busybox') {
sh 'sleep 9999999'
}
}
}

0 comments on commit f991e78

Please sign in to comment.