-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[JENKINS-73224] Do not try to create a pod which already exists #1561
Changes from all commits
367fbc1
08f553f
c5ce8c2
b65787a
3b78a0b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -47,7 +47,6 @@ | |||||||||||
import java.util.List; | ||||||||||||
import java.util.Map; | ||||||||||||
import java.util.concurrent.TimeUnit; | ||||||||||||
import java.util.concurrent.atomic.AtomicBoolean; | ||||||||||||
import java.util.logging.Level; | ||||||||||||
import java.util.logging.Logger; | ||||||||||||
import java.util.stream.Collectors; | ||||||||||||
|
@@ -68,7 +67,7 @@ public class KubernetesLauncher extends JNLPLauncher { | |||||||||||
|
||||||||||||
private static final Logger LOGGER = Logger.getLogger(KubernetesLauncher.class.getName()); | ||||||||||||
|
||||||||||||
private final AtomicBoolean launched = new AtomicBoolean(false); | ||||||||||||
private volatile boolean launched = false; | ||||||||||||
|
||||||||||||
/** | ||||||||||||
* Provisioning exception if any. | ||||||||||||
|
@@ -87,7 +86,7 @@ public KubernetesLauncher() { | |||||||||||
|
||||||||||||
@Override | ||||||||||||
public boolean isLaunchSupported() { | ||||||||||||
return !launched.get(); | ||||||||||||
return !launched; | ||||||||||||
} | ||||||||||||
|
||||||||||||
@Override | ||||||||||||
|
@@ -104,7 +103,7 @@ public synchronized void launch(SlaveComputer computer, TaskListener listener) { | |||||||||||
if (node == null) { | ||||||||||||
throw new IllegalStateException("Node has been removed, cannot launch " + computer.getName()); | ||||||||||||
} | ||||||||||||
if (launched.get()) { | ||||||||||||
if (launched) { | ||||||||||||
LOGGER.log(INFO, "Agent has already been launched, activating: {0}", node.getNodeName()); | ||||||||||||
computer.setAcceptingTasks(true); | ||||||||||||
return; | ||||||||||||
|
@@ -129,51 +128,71 @@ public synchronized void launch(SlaveComputer computer, TaskListener listener) { | |||||||||||
.orElse(null); | ||||||||||||
node.setNamespace(namespace); | ||||||||||||
|
||||||||||||
LOGGER.log(FINE, () -> "Creating Pod: " + cloudName + " " + namespace + "/" + podName); | ||||||||||||
try { | ||||||||||||
pod = client.pods().inNamespace(namespace).create(pod); | ||||||||||||
} catch (KubernetesClientException e) { | ||||||||||||
Metrics.metricRegistry().counter(MetricNames.CREATION_FAILED).inc(); | ||||||||||||
int httpCode = e.getCode(); | ||||||||||||
if (400 <= httpCode && httpCode < 500) { // 4xx | ||||||||||||
if (httpCode == 403 && e.getMessage().contains("is forbidden: exceeded quota")) { | ||||||||||||
node.getRunListener() | ||||||||||||
.getLogger() | ||||||||||||
.printf( | ||||||||||||
"WARNING: Unable to create pod: %s %s/%s because kubernetes resource quota exceeded. %n%s%nRetrying...%n%n", | ||||||||||||
cloudName, namespace, pod.getMetadata().getName(), e.getMessage()); | ||||||||||||
} else if (httpCode == 409 | ||||||||||||
&& e.getMessage().contains("Operation cannot be fulfilled on resourcequotas")) { | ||||||||||||
// See: https://github.com/kubernetes/kubernetes/issues/67761 ; A retry usually works. | ||||||||||||
node.getRunListener() | ||||||||||||
.getLogger() | ||||||||||||
.printf( | ||||||||||||
"WARNING: Unable to create pod: %s %s/%s because kubernetes resource quota update conflict. %n%s%nRetrying...%n%n", | ||||||||||||
cloudName, namespace, pod.getMetadata().getName(), e.getMessage()); | ||||||||||||
// if the controller was interrupted after creating the pod but before it connected back, then | ||||||||||||
// the pod might already exist and the creating logic must be skipped. | ||||||||||||
Pod existingPod = | ||||||||||||
client.pods().inNamespace(namespace).withName(podName).get(); | ||||||||||||
if (existingPod == null) { | ||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Everything inside this |
||||||||||||
LOGGER.log(FINE, () -> "Creating Pod: " + cloudName + " " + namespace + "/" + podName); | ||||||||||||
try { | ||||||||||||
pod = client.pods().inNamespace(namespace).create(pod); | ||||||||||||
} catch (KubernetesClientException e) { | ||||||||||||
Metrics.metricRegistry() | ||||||||||||
.counter(MetricNames.CREATION_FAILED) | ||||||||||||
.inc(); | ||||||||||||
int httpCode = e.getCode(); | ||||||||||||
if (400 <= httpCode && httpCode < 500) { // 4xx | ||||||||||||
if (httpCode == 403 && e.getMessage().contains("is forbidden: exceeded quota")) { | ||||||||||||
node.getRunListener() | ||||||||||||
.getLogger() | ||||||||||||
.printf( | ||||||||||||
"WARNING: Unable to create pod: %s %s/%s because kubernetes resource quota exceeded. %n%s%nRetrying...%n%n", | ||||||||||||
cloudName, | ||||||||||||
namespace, | ||||||||||||
pod.getMetadata().getName(), | ||||||||||||
e.getMessage()); | ||||||||||||
} else if (httpCode == 409 | ||||||||||||
&& e.getMessage().contains("Operation cannot be fulfilled on resourcequotas")) { | ||||||||||||
// See: https://github.com/kubernetes/kubernetes/issues/67761 ; A retry usually works. | ||||||||||||
node.getRunListener() | ||||||||||||
.getLogger() | ||||||||||||
.printf( | ||||||||||||
"WARNING: Unable to create pod: %s %s/%s because kubernetes resource quota update conflict. %n%s%nRetrying...%n%n", | ||||||||||||
cloudName, | ||||||||||||
namespace, | ||||||||||||
pod.getMetadata().getName(), | ||||||||||||
e.getMessage()); | ||||||||||||
} else { | ||||||||||||
node.getRunListener() | ||||||||||||
.getLogger() | ||||||||||||
.printf( | ||||||||||||
"ERROR: Unable to create pod %s %s/%s.%n%s%n", | ||||||||||||
cloudName, | ||||||||||||
namespace, | ||||||||||||
pod.getMetadata().getName(), | ||||||||||||
e.getMessage()); | ||||||||||||
PodUtils.cancelQueueItemFor(pod, e.getMessage()); | ||||||||||||
} | ||||||||||||
} else if (500 <= httpCode && httpCode < 600) { // 5xx | ||||||||||||
LOGGER.log(FINE, "Kubernetes returned HTTP code {0} {1}. Retrying...", new Object[] { | ||||||||||||
e.getCode(), e.getStatus() | ||||||||||||
}); | ||||||||||||
} else { | ||||||||||||
node.getRunListener() | ||||||||||||
.getLogger() | ||||||||||||
.printf( | ||||||||||||
"ERROR: Unable to create pod %s %s/%s.%n%s%n", | ||||||||||||
cloudName, namespace, pod.getMetadata().getName(), e.getMessage()); | ||||||||||||
PodUtils.cancelQueueItemFor(pod, e.getMessage()); | ||||||||||||
LOGGER.log(WARNING, "Kubernetes returned unhandled HTTP code {0} {1}", new Object[] { | ||||||||||||
e.getCode(), e.getStatus() | ||||||||||||
}); | ||||||||||||
} | ||||||||||||
} else if (500 <= httpCode && httpCode < 600) { // 5xx | ||||||||||||
LOGGER.log(FINE, "Kubernetes returned HTTP code {0} {1}. Retrying...", new Object[] { | ||||||||||||
e.getCode(), e.getStatus() | ||||||||||||
}); | ||||||||||||
} else { | ||||||||||||
LOGGER.log(WARNING, "Kubernetes returned unhandled HTTP code {0} {1}", new Object[] { | ||||||||||||
e.getCode(), e.getStatus() | ||||||||||||
}); | ||||||||||||
throw e; | ||||||||||||
} | ||||||||||||
throw e; | ||||||||||||
LOGGER.log(INFO, () -> "Created Pod: " + cloudName + " " + namespace + "/" + podName); | ||||||||||||
listener.getLogger().printf("Created Pod: %s %s/%s%n", cloudName, namespace, podName); | ||||||||||||
Metrics.metricRegistry().counter(MetricNames.PODS_CREATED).inc(); | ||||||||||||
|
||||||||||||
node.getRunListener().getLogger().printf("Created Pod: %s %s/%s%n", cloudName, namespace, podName); | ||||||||||||
} else { | ||||||||||||
LOGGER.log(INFO, () -> "Pod already exists: " + cloudName + " " + namespace + "/" + podName); | ||||||||||||
listener.getLogger().printf("Pod already exists: %s %s/%s%n", cloudName, namespace, podName); | ||||||||||||
Comment on lines
+193
to
+194
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So how does this actually differ from kubernetes-plugin/src/main/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesLauncher.java Lines 106 to 110 in 3b78a0b
launching field if we have this logic?
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That is another face of the same behaviour, just under a slightly different timing (a lot narrower than the one being fixed in this PR). This PR catches the case there the controller is restarted between the pod creating call and the end of the waiting for loop. While the existing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Not sure. https://github.com/amuniz/kubernetes-plugin/blob/f10083a42e708a6bff990c6ccb306ce618b8bccd/src/main/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesComputer.java#L201 is never used from this plugin sources, but it's public and it could be used somewhere else, so I would stay on the safe side and keep it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Well, if it is not used it can be deleted. (I usually just do a GH org-wide search.) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Then there is stuff like https://github.com/amuniz/kubernetes-plugin/blob/3b78a0ba311af4c091b94c070b9a8c8b6ab6768b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesLauncher.java#L199-L200 which it would not make sense to do twice, but there is no current defense against race conditions. It seems there are deeper bugs waiting here. |
||||||||||||
} | ||||||||||||
LOGGER.log(INFO, () -> "Created Pod: " + cloudName + " " + namespace + "/" + podName); | ||||||||||||
listener.getLogger().printf("Created Pod: %s %s/%s%n", cloudName, namespace, podName); | ||||||||||||
Metrics.metricRegistry().counter(MetricNames.PODS_CREATED).inc(); | ||||||||||||
|
||||||||||||
node.getRunListener().getLogger().printf("Created Pod: %s %s/%s%n", cloudName, namespace, podName); | ||||||||||||
kubernetesComputer.setLaunching(true); | ||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we fix this field please, so it is not persisting an There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does it need to be an There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, I see #1479 (comment) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed in 3b78a0b |
||||||||||||
|
||||||||||||
ObjectMeta podMetadata = pod.getMetadata(); | ||||||||||||
|
@@ -268,7 +287,7 @@ public synchronized void launch(SlaveComputer computer, TaskListener listener) { | |||||||||||
} | ||||||||||||
|
||||||||||||
computer.setAcceptingTasks(true); | ||||||||||||
launched.set(true); | ||||||||||||
launched = true; | ||||||||||||
try { | ||||||||||||
// We need to persist the "launched" setting... | ||||||||||||
node.save(); | ||||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,14 @@ | ||
package org.csanchez.jenkins.plugins.kubernetes.pipeline; | ||
|
||
import io.fabric8.kubernetes.api.model.NodeBuilder; | ||
import io.fabric8.kubernetes.client.KubernetesClient; | ||
import io.fabric8.kubernetes.client.KubernetesClientBuilder; | ||
import java.net.UnknownHostException; | ||
import org.csanchez.jenkins.plugins.kubernetes.KubernetesTestUtil; | ||
import org.csanchez.jenkins.plugins.kubernetes.pipeline.steps.AssertBuildLogMessage; | ||
import org.csanchez.jenkins.plugins.kubernetes.pipeline.steps.AssertBuildStatusSuccess; | ||
import org.csanchez.jenkins.plugins.kubernetes.pipeline.steps.CreateWorkflowJobThenScheduleTask; | ||
import org.csanchez.jenkins.plugins.kubernetes.pipeline.steps.RunId; | ||
import org.csanchez.jenkins.plugins.kubernetes.pipeline.steps.SetupCloud; | ||
import org.junit.Test; | ||
|
||
|
@@ -12,6 +19,44 @@ public KubernetesPipelineRJRTest() throws UnknownHostException { | |
|
||
@Test | ||
public void basicPipeline() throws Throwable { | ||
RunId runId = createWorkflowJobThenScheduleRun(); | ||
rjr.runRemotely(new AssertBuildStatusSuccess(runId)); | ||
} | ||
|
||
@Test | ||
public void restartDuringPodLaunch() throws Throwable { | ||
// try to run something on a pod which is not schedulable (disktype=special) | ||
RunId build = rjr.runRemotely(new CreateWorkflowJobThenScheduleTask( | ||
KubernetesTestUtil.loadPipelineScript(getClass(), name.getMethodName() + ".groovy"))); | ||
// the pod is created, but not connected yet | ||
rjr.runRemotely(new AssertBuildLogMessage("Created Pod", build)); | ||
// restart | ||
rjr.stopJenkins(); | ||
rjr.startJenkins(); | ||
// update k8s to make a node suitable to schedule (add disktype=special to the node) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A little scary to have the test even temporarily patch the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This was the easiest way I found, and with the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So long as the JVM terminates normally, yes. |
||
System.out.println("Adding label to node...."); | ||
try (KubernetesClient client = new KubernetesClientBuilder().build()) { | ||
String nodeName = | ||
client.nodes().list().getItems().get(0).getMetadata().getName(); | ||
client.nodes().withName(nodeName).edit(n -> new NodeBuilder(n) | ||
.editMetadata() | ||
.addToLabels("disktype", "special") | ||
.endMetadata() | ||
.build()); | ||
|
||
// pod connects back and the build finishes correctly | ||
rjr.runRemotely(new AssertBuildStatusSuccess(build)); | ||
} finally { | ||
// clean up | ||
try (KubernetesClient client = new KubernetesClientBuilder().build()) { | ||
String nodeName = | ||
client.nodes().list().getItems().get(0).getMetadata().getName(); | ||
client.nodes().withName(nodeName).edit(n -> new NodeBuilder(n) | ||
.editMetadata() | ||
.removeFromLabels("disktype") | ||
.endMetadata() | ||
.build()); | ||
} | ||
} | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Without the fix this test fails with the log in the linked ticket. |
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
package org.csanchez.jenkins.plugins.kubernetes.pipeline.steps; | ||
|
||
import org.jenkinsci.plugins.workflow.job.WorkflowJob; | ||
import org.jenkinsci.plugins.workflow.job.WorkflowRun; | ||
import org.jvnet.hudson.test.JenkinsRule; | ||
import org.jvnet.hudson.test.RealJenkinsRule; | ||
|
||
public class AssertBuildLogMessage implements RealJenkinsRule.Step { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. BTW you can also make these things There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, but less reusable then... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How is that less reusable? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When writing a new test it seems natural to go to the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I meant reusable static methods in a utility class in the |
||
|
||
private final String message; | ||
private final RunId runId; | ||
|
||
public AssertBuildLogMessage(String message, RunId runId) { | ||
this.message = message; | ||
this.runId = runId; | ||
} | ||
|
||
@Override | ||
public void run(JenkinsRule r) throws Throwable { | ||
WorkflowJob p = r.jenkins.getItemByFullName(runId.name, WorkflowJob.class); | ||
WorkflowRun b = p.getBuildByNumber(runId.number); | ||
r.waitForMessage(message, b); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
package org.csanchez.jenkins.plugins.kubernetes.pipeline.steps; | ||
|
||
import org.jenkinsci.plugins.workflow.cps.CpsFlowDefinition; | ||
import org.jenkinsci.plugins.workflow.job.WorkflowJob; | ||
import org.jenkinsci.plugins.workflow.job.WorkflowRun; | ||
import org.jvnet.hudson.test.JenkinsRule; | ||
import org.jvnet.hudson.test.RealJenkinsRule; | ||
|
||
/** | ||
* Creates a workflow job using the specified script, then schedules it and returns a reference to the run. | ||
*/ | ||
public class CreateWorkflowJobThenScheduleTask implements RealJenkinsRule.Step2<RunId> { | ||
private String script; | ||
|
||
public CreateWorkflowJobThenScheduleTask(String script) { | ||
this.script = script; | ||
} | ||
|
||
@Override | ||
public RunId run(JenkinsRule r) throws Throwable { | ||
WorkflowJob project = r.createProject(WorkflowJob.class); | ||
project.setDefinition(new CpsFlowDefinition(script, true)); | ||
project.save(); | ||
System.out.println("Scheduling build..."); | ||
WorkflowRun b = project.scheduleBuild2(0).getStartCondition().get(); | ||
System.out.println("Build scheduled..."); | ||
return new RunId(project.getFullName(), b.number); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
podTemplate(yaml: ''' | ||
apiVersion: v1 | ||
kind: Pod | ||
spec: | ||
nodeSelector: | ||
disktype: special | ||
''') { | ||
node(POD_LABEL) { | ||
sh 'true' | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
FTR: this is not directly related or needed for the goal of this PR, but it was requested by @jglick in a review.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(but consider #1561 (comment))