diff --git a/Jenkinsfile b/Jenkinsfile
index b3e75c465c..3063ec25ed 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -1 +1,4 @@
-buildPlugin(configurations: buildPlugin.recommendedConfigurations().findAll { it.platform == 'linux' })
+buildPlugin(configurations: [
+  [platform: 'linux', jdk: '8', jenkins: null],
+  [platform: 'linux', jdk: '11', jenkins: null],
+])
diff --git a/pom.xml b/pom.xml
index 63e658fc46..9d1541dd45 100644
--- a/pom.xml
+++ b/pom.xml
@@ -46,11 +46,12 @@ 8 - 2.138.4 + 2.176.1 false true 0 1.3.7 + 3.3 2.20 1.7.26
@@ -145,19 +146,19 @@ org.jenkins-ci.plugins.workflow workflow-support - 3.3 + ${workflow-support-plugin.version} test org.jenkins-ci.plugins.workflow workflow-durable-task-step - 2.28 + 2.32 test org.jenkins-ci.plugins.workflow workflow-support - 3.0 + ${workflow-support-plugin.version} tests test
diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java
new file mode 100644
index 0000000000..8621731706
--- /dev/null
+++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java
@@ -0,0 +1,167 @@
+/*
+ * Copyright 2019 CloudBees, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.csanchez.jenkins.plugins.kubernetes.pod.retention;
+
+import hudson.Extension;
+import hudson.model.Computer;
+import hudson.model.Node;
+import hudson.model.TaskListener;
+import hudson.slaves.Cloud;
+import hudson.slaves.ComputerListener;
+import hudson.slaves.EphemeralNode;
+import io.fabric8.kubernetes.api.model.Pod;
+import io.fabric8.kubernetes.client.KubernetesClient;
+import io.fabric8.kubernetes.client.KubernetesClientException;
+import io.fabric8.kubernetes.client.Watcher;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Objects;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import jenkins.model.Jenkins;
+import org.csanchez.jenkins.plugins.kubernetes.KubernetesCloud;
+import org.csanchez.jenkins.plugins.kubernetes.KubernetesComputer;
+import org.csanchez.jenkins.plugins.kubernetes.KubernetesSlave;
+
+/**
+ * Checks for deleted pods corresponding to {@link KubernetesSlave} and ensures the node is removed from Jenkins too.
+ * <p>If the pod has been deleted, all of the associated state (running user processes, workspace, etc.) must also be gone;
+ * so there is no point in retaining this agent definition any further.
+ * ({@link KubernetesSlave} is not an {@link EphemeralNode}: it does support running across Jenkins restarts.)
+ * <p>Note that pod retention policies other than the default {@link Never} may disable this system,
+ * unless some external process or garbage collection policy results in pod deletion.
+ */
+@Extension
+public class Reaper extends ComputerListener implements Watcher<Pod> {
+
+    private static final Logger LOGGER = Logger.getLogger(Reaper.class.getName());
+
+    /**
+     * Activate this feature only if and when some Kubernetes agent is actually used.
+     * Avoids touching the API server when this plugin is not even in use.
+     */
+    private final AtomicBoolean activated = new AtomicBoolean();
+
+    /**
+     * Activates the reaper the first time a {@link KubernetesComputer} comes online.
+     * The {@link #activated} flag makes sure {@link #activate} runs at most once per instance.
+     */
+    @Override
+    public void onOnline(Computer c, TaskListener listener) throws IOException, InterruptedException {
+        if (c instanceof KubernetesComputer && activated.compareAndSet(false, true)) {
+            activate();
+        }
+    }
+
+    /**
+     * Removes agents whose pods are already gone, then starts watching each Kubernetes cloud for later deletions.
+     * Failures are logged per agent or per cloud and do not stop the remaining checks.
+     */
+    private void activate() {
+        LOGGER.fine("Activating reaper");
+        // First check all existing nodes to see if they still have active pods.
+        // (We may have missed deletion events while Jenkins was shut off,
+        // or pods may have been deleted before any Kubernetes agent was brought online.)
+        for (Node n : new ArrayList<>(Jenkins.get().getNodes())) {
+            if (!(n instanceof KubernetesSlave)) {
+                continue;
+            }
+            KubernetesSlave ks = (KubernetesSlave) n;
+            String ns = ks.getNamespace();
+            String name = ks.getPodName();
+            try {
+                // TODO more efficient to do a single (or paged) list request, but tricky since there may be multiple clouds,
+                // and even within a single cloud an agent pod is permitted to use a nondefault namespace,
+                // yet we do not want to do an unnamespaced pod list for RBAC reasons.
+                // Could use a hybrid approach: first list all pods in the configured namespace for all clouds;
+                // then go back and individually check any unmatched agents with their configured namespace.
+                if (ks.getKubernetesCloud().connect().pods().inNamespace(ns).withName(name).get() == null) {
+                    LOGGER.info(() -> ns + "/" + name + " seems to have been deleted, so removing corresponding Jenkins agent");
+                    Jenkins.get().removeNode(ks);
+                } else {
+                    LOGGER.fine(() -> ns + "/" + name + " still seems to exist, OK");
+                }
+            } catch (Exception x) {
+                LOGGER.log(Level.WARNING, "failed to do initial reap check for " + ns + "/" + name, x);
+            }
+        }
+        // Now set up a watch for any subsequent pod deletions.
+        for (Cloud c : Jenkins.get().clouds) {
+            if (!(c instanceof KubernetesCloud)) {
+                continue;
+            }
+            KubernetesCloud kc = (KubernetesCloud) c;
+            try {
+                KubernetesClient client = kc.connect();
+                client.pods().inNamespace(client.getNamespace()).watch(this);
+            } catch (Exception x) {
+                LOGGER.log(Level.WARNING, "failed to set up watcher on " + kc.getDisplayName(), x);
+            }
+        }
+    }
+
+    /**
+     * Removes the Jenkins agent that corresponds to a pod reported as deleted.
+     * Events other than {@link Watcher.Action#DELETED} are ignored.
+     */
+    @Override
+    public void eventReceived(Watcher.Action action, Pod pod) {
+        if (action == Watcher.Action.DELETED) {
+            String ns = pod.getMetadata().getNamespace();
+            String name = pod.getMetadata().getName();
+            for (Node n : new ArrayList<>(Jenkins.get().getNodes())) {
+                if (!(n instanceof KubernetesSlave)) {
+                    continue;
+                }
+                KubernetesSlave ks = (KubernetesSlave) n;
+                if (Objects.equals(ks.getNamespace(), ns) && Objects.equals(ks.getPodName(), name)) {
+                    LOGGER.info(() -> ns + "/" + name + " was just deleted, so removing corresponding Jenkins agent");
+                    try {
+                        Jenkins.get().removeNode(ks);
+                    } catch (Exception x) {
+                        LOGGER.log(Level.WARNING, "failed to reap " + ns + "/" + name, x);
+                    }
+                    // The agent was found, whether or not it could be removed,
+                    // so do not fall through to the "no corresponding agent" message below.
+                    return;
+                }
+            }
+            LOGGER.fine(() -> "received deletion notice for " + ns + "/" + name + " which does not seem to correspond to any Jenkins agent");
+        }
+    }
+
+    /**
+     * Records that a pod watch has ended; the watch is not reattached here.
+     *
+     * @param cause the error that closed the watch; {@code null} is treated as an ordinary close
+     */
+    @Override
+    public void onClose(KubernetesClientException cause) {
+        // TODO do we need to manually reattach the watcher?
+        // AllContainersRunningPodWatcher is not reattached, but this is expected to be short-lived,
+        // useful only until the containers of a single pod start running.
+        // (At least when using kubernetes-client/java, the connection gets closed after 2m on GKE
+        // and you need to rerun the watch. Does the fabric8io client wrap this?)
+        if (cause != null) {
+            LOGGER.log(Level.WARNING, "pod watch closed with an error; Reaper does not reattach it", cause);
+        } else {
+            LOGGER.fine("pod watch closed");
+        }
+    }
+
+}
diff --git a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java
index 7ac3dced7f..bdbff5e519 100644
--- a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java
+++ b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java
@@ -46,6 +46,7 @@
 import org.csanchez.jenkins.plugins.kubernetes.PodAnnotation;
 import org.csanchez.jenkins.plugins.kubernetes.PodTemplate;
 import org.jenkinsci.plugins.workflow.job.WorkflowRun;
+import org.jenkinsci.plugins.workflow.support.steps.ExecutorStepExecution;
 import org.jenkinsci.plugins.workflow.test.steps.SemaphoreStep;
 import org.junit.Before;
 import org.junit.Rule;
@@ -352,6 +353,15 @@ public void runInPodWithRetention() throws Exception {
         assertTrue(deletePods(cloud.connect(), getLabels(this, name), true));
     }
 
+    @Issue("JENKINS-49707")
+    @Test
+    public void terminatedPod() throws Exception {
+        r.waitForMessage("+ sleep", b);
+        deletePods(cloud.connect(), getLabels(this, name), false);
+        r.assertBuildStatus(Result.ABORTED, r.waitForCompletion(b));
+        r.waitForMessage(new ExecutorStepExecution.RemovedNodeCause().getShortDescription(), b);
+    }
+
     @Test
     public void computerCantBeConfigured() throws Exception {
         r.jenkins.setSecurityRealm(r.createDummySecurityRealm());
diff --git a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java
index 301ebf88e4..ffb3129d94 100644
--- 
a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java
+++ b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java
@@ -47,6 +47,7 @@
 import org.csanchez.jenkins.plugins.kubernetes.model.TemplateEnvVar;
 import org.jenkinsci.plugins.workflow.job.WorkflowJob;
 import org.jenkinsci.plugins.workflow.job.WorkflowRun;
+import org.jenkinsci.plugins.workflow.support.steps.ExecutorStepExecution;
 import org.junit.BeforeClass;
 import org.junit.ClassRule;
 import org.junit.Rule;
@@ -54,11 +55,13 @@
 import org.junit.rules.TemporaryFolder;
 import org.junit.rules.TestName;
 import org.jvnet.hudson.test.BuildWatcher;
+import org.jvnet.hudson.test.Issue;
 import org.jvnet.hudson.test.JenkinsRule;
 import org.jvnet.hudson.test.LoggerRule;
 import org.jvnet.hudson.test.RestartableJenkinsNonLocalhostRule;
 
 import hudson.model.Node;
+import hudson.model.Result;
 import hudson.slaves.DumbSlave;
 import hudson.slaves.JNLPLauncher;
 import hudson.slaves.NodeProperty;
@@ -188,6 +191,37 @@ public void runInPodWithRestartWithLongSleep() throws Exception {
         });
     }
 
+    /**
+     * Deletes the agent pod after a Jenkins restart and expects the build to be aborted with {@link ExecutorStepExecution.RemovedNodeCause}.
+     */
+    @Issue("JENKINS-49707")
+    @Test
+    public void terminatedPodAfterRestart() throws Exception {
+        AtomicReference<String> projectName = new AtomicReference<>();
+        story.then(r -> {
+            configureCloud();
+            WorkflowRun b = getPipelineJobThenScheduleRun(r);
+            projectName.set(b.getParent().getFullName());
+            r.waitForMessage("+ sleep", b);
+        });
+        story.then(r -> {
+            WorkflowRun b = r.jenkins.getItemByFullName(projectName.get(), WorkflowJob.class).getBuildByNumber(1);
+            r.waitForMessage("Ready to run", b);
+            // Note that the test is cheating here slightly.
+            // The watch in Reaper is still running across the in-JVM restarts,
+            // whereas in production it would have been cancelled during the shutdown.
+            // But it does not matter since we are waiting for the agent to come back online after the restart,
+            // which is sufficient trigger to reactivate the reaper.
+            // Indeed we get two Reaper instances running, which independently remove the node.
+            deletePods(cloud.connect(), getLabels(this, name), false);
+            r.assertBuildStatus(Result.ABORTED, r.waitForCompletion(b));
+            r.waitForMessage(new ExecutorStepExecution.RemovedNodeCause().getShortDescription(), b);
+            // Currently the logic in ExecutorStepExecution cannot handle a Jenkins restart so it prints the following.
+            // It does not matter since DurableTaskStep redundantly implements the same check.
+            r.assertLogContains(" was deleted, but do not have a node body to cancel", b);
+        });
+    }
+
     @Test
     public void getContainerLogWithRestart() throws Exception {
         AtomicReference<String> projectName = new AtomicReference<>();
diff --git a/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPod.groovy b/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPod.groovy
new file mode 100644
index 0000000000..e0b1c9f982
--- /dev/null
+++ b/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPod.groovy
@@ -0,0 +1,9 @@
+podTemplate(label: '$NAME', containers: [
+        containerTemplate(name: 'busybox', image: 'busybox', ttyEnabled: true, command: '/bin/cat'),
+    ]) {
+    node ('$NAME') {
+        container('busybox') {
+            sh 'sleep 9999999'
+        }
+    }
+}
diff --git a/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPodAfterRestart.groovy b/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPodAfterRestart.groovy
new file mode 100644
index 0000000000..d1eca4791b
--- /dev/null
+++ b/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPodAfterRestart.groovy
@@ -0,0 +1,11 @@
+package org.csanchez.jenkins.plugins.kubernetes.pipeline
+
+podTemplate(label: '$NAME', containers: [
+    containerTemplate(name: 'busybox', image: 'busybox', ttyEnabled: true, command: '/bin/cat'),
+]) {
+    node ('$NAME') {
+        container('busybox') {
+            sh 'sleep 9999999'
+        }
+    }
+}