Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[JENKINS-73224] Do not try to create a pod which already exists #1561

Merged
merged 5 commits into from
May 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
Expand All @@ -68,7 +67,7 @@ public class KubernetesLauncher extends JNLPLauncher {

private static final Logger LOGGER = Logger.getLogger(KubernetesLauncher.class.getName());

private final AtomicBoolean launched = new AtomicBoolean(false);
private volatile boolean launched = false;
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FTR: this is not directly related or needed for the goal of this PR, but it was requested by @jglick in a review.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(but consider #1561 (comment))


/**
* Provisioning exception if any.
Expand All @@ -87,7 +86,7 @@ public KubernetesLauncher() {

@Override
public boolean isLaunchSupported() {
return !launched.get();
return !launched;
}

@Override
Expand All @@ -104,7 +103,7 @@ public synchronized void launch(SlaveComputer computer, TaskListener listener) {
if (node == null) {
throw new IllegalStateException("Node has been removed, cannot launch " + computer.getName());
}
if (launched.get()) {
if (launched) {
LOGGER.log(INFO, "Agent has already been launched, activating: {0}", node.getNodeName());
computer.setAcceptingTasks(true);
return;
Expand All @@ -129,51 +128,71 @@ public synchronized void launch(SlaveComputer computer, TaskListener listener) {
.orElse(null);
node.setNamespace(namespace);

LOGGER.log(FINE, () -> "Creating Pod: " + cloudName + " " + namespace + "/" + podName);
try {
pod = client.pods().inNamespace(namespace).create(pod);
} catch (KubernetesClientException e) {
Metrics.metricRegistry().counter(MetricNames.CREATION_FAILED).inc();
int httpCode = e.getCode();
if (400 <= httpCode && httpCode < 500) { // 4xx
if (httpCode == 403 && e.getMessage().contains("is forbidden: exceeded quota")) {
node.getRunListener()
.getLogger()
.printf(
"WARNING: Unable to create pod: %s %s/%s because kubernetes resource quota exceeded. %n%s%nRetrying...%n%n",
cloudName, namespace, pod.getMetadata().getName(), e.getMessage());
} else if (httpCode == 409
&& e.getMessage().contains("Operation cannot be fulfilled on resourcequotas")) {
// See: https://github.com/kubernetes/kubernetes/issues/67761 ; A retry usually works.
node.getRunListener()
.getLogger()
.printf(
"WARNING: Unable to create pod: %s %s/%s because kubernetes resource quota update conflict. %n%s%nRetrying...%n%n",
cloudName, namespace, pod.getMetadata().getName(), e.getMessage());
// if the controller was interrupted after creating the pod but before it connected back, then
// the pod might already exist and the creating logic must be skipped.
Pod existingPod =
client.pods().inNamespace(namespace).withName(podName).get();
if (existingPod == null) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Everything inside this if branch is untouched (except formatting, which I was forced to do by Spotless)

LOGGER.log(FINE, () -> "Creating Pod: " + cloudName + " " + namespace + "/" + podName);
try {
pod = client.pods().inNamespace(namespace).create(pod);
} catch (KubernetesClientException e) {
Metrics.metricRegistry()
.counter(MetricNames.CREATION_FAILED)
.inc();
int httpCode = e.getCode();
if (400 <= httpCode && httpCode < 500) { // 4xx
if (httpCode == 403 && e.getMessage().contains("is forbidden: exceeded quota")) {
node.getRunListener()
.getLogger()
.printf(
"WARNING: Unable to create pod: %s %s/%s because kubernetes resource quota exceeded. %n%s%nRetrying...%n%n",
cloudName,
namespace,
pod.getMetadata().getName(),
e.getMessage());
} else if (httpCode == 409
&& e.getMessage().contains("Operation cannot be fulfilled on resourcequotas")) {
// See: https://github.com/kubernetes/kubernetes/issues/67761 ; A retry usually works.
node.getRunListener()
.getLogger()
.printf(
"WARNING: Unable to create pod: %s %s/%s because kubernetes resource quota update conflict. %n%s%nRetrying...%n%n",
cloudName,
namespace,
pod.getMetadata().getName(),
e.getMessage());
} else {
node.getRunListener()
.getLogger()
.printf(
"ERROR: Unable to create pod %s %s/%s.%n%s%n",
cloudName,
namespace,
pod.getMetadata().getName(),
e.getMessage());
PodUtils.cancelQueueItemFor(pod, e.getMessage());
}
} else if (500 <= httpCode && httpCode < 600) { // 5xx
LOGGER.log(FINE, "Kubernetes returned HTTP code {0} {1}. Retrying...", new Object[] {
e.getCode(), e.getStatus()
});
} else {
node.getRunListener()
.getLogger()
.printf(
"ERROR: Unable to create pod %s %s/%s.%n%s%n",
cloudName, namespace, pod.getMetadata().getName(), e.getMessage());
PodUtils.cancelQueueItemFor(pod, e.getMessage());
LOGGER.log(WARNING, "Kubernetes returned unhandled HTTP code {0} {1}", new Object[] {
e.getCode(), e.getStatus()
});
}
} else if (500 <= httpCode && httpCode < 600) { // 5xx
LOGGER.log(FINE, "Kubernetes returned HTTP code {0} {1}. Retrying...", new Object[] {
e.getCode(), e.getStatus()
});
} else {
LOGGER.log(WARNING, "Kubernetes returned unhandled HTTP code {0} {1}", new Object[] {
e.getCode(), e.getStatus()
});
throw e;
}
throw e;
LOGGER.log(INFO, () -> "Created Pod: " + cloudName + " " + namespace + "/" + podName);
listener.getLogger().printf("Created Pod: %s %s/%s%n", cloudName, namespace, podName);
Metrics.metricRegistry().counter(MetricNames.PODS_CREATED).inc();

node.getRunListener().getLogger().printf("Created Pod: %s %s/%s%n", cloudName, namespace, podName);
} else {
LOGGER.log(INFO, () -> "Pod already exists: " + cloudName + " " + namespace + "/" + podName);
listener.getLogger().printf("Pod already exists: %s %s/%s%n", cloudName, namespace, podName);
Comment on lines +193 to +194
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So how does this actually differ from

if (launched) {
LOGGER.log(INFO, "Agent has already been launched, activating: {0}", node.getNodeName());
computer.setAcceptingTasks(true);
return;
}
and do we even still need the launching field if we have this logic?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is another face of the same behaviour, just under a slightly different timing (a lot narrower than the one being fixed in this PR).

This PR catches the case there the controller is restarted between the pod creating call and the end of the waiting for loop. While the existing launched check would only catch a restart right after the node save and the end of the method, which is quite unlikely to happen, but possible I guess.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we even still need the launching field if we have this logic?

Not sure. https://github.com/amuniz/kubernetes-plugin/blob/f10083a42e708a6bff990c6ccb306ce618b8bccd/src/main/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesComputer.java#L201 is never used from this plugin sources, but it's public and it could be used somewhere else, so I would stay on the safe side and keep it.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is never used from this plugin sources, but it's public and it could be used somewhere else

Well, if it is not used it can be deleted. (I usually just do a GH org-wide search.)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Then there is stuff like https://github.com/amuniz/kubernetes-plugin/blob/3b78a0ba311af4c091b94c070b9a8c8b6ab6768b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesLauncher.java#L199-L200 which it would not make sense to do twice, but there is no current defense against race conditions. It seems there are deeper bugs waiting here.

}
LOGGER.log(INFO, () -> "Created Pod: " + cloudName + " " + namespace + "/" + podName);
listener.getLogger().printf("Created Pod: %s %s/%s%n", cloudName, namespace, podName);
Metrics.metricRegistry().counter(MetricNames.PODS_CREATED).inc();

node.getRunListener().getLogger().printf("Created Pod: %s %s/%s%n", cloudName, namespace, podName);
kubernetesComputer.setLaunching(true);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we fix this field please, so it is not persisting an AtomicBoolean?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does it need to be an AtomicBoolean?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I see #1479 (comment)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 3b78a0b


ObjectMeta podMetadata = pod.getMetadata();
Expand Down Expand Up @@ -268,7 +287,7 @@ public synchronized void launch(SlaveComputer computer, TaskListener listener) {
}

computer.setAcceptingTasks(true);
launched.set(true);
launched = true;
try {
// We need to persist the "launched" setting...
node.save();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,6 @@ public abstract class AbstractKubernetesPipelineRJRTest {
}
}

protected RunId runId;

private SetupCloud setup;

public AbstractKubernetesPipelineRJRTest(SetupCloud setup) {
Expand All @@ -52,7 +50,10 @@ public static void isKubernetesConfigured() throws Exception {
public void setUp() throws Throwable {
rjr.startJenkins();
rjr.runRemotely(setup);
runId = rjr.runRemotely(new CreateWorkflowJobThenScheduleRun(
}

protected RunId createWorkflowJobThenScheduleRun() throws Throwable {
return rjr.runRemotely(new CreateWorkflowJobThenScheduleRun(
KubernetesTestUtil.loadPipelineScript(getClass(), name.getMethodName() + ".groovy")));
amuniz marked this conversation as resolved.
Show resolved Hide resolved
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import java.net.UnknownHostException;
import org.csanchez.jenkins.plugins.kubernetes.pipeline.steps.AssertBuildStatusSuccess;
import org.csanchez.jenkins.plugins.kubernetes.pipeline.steps.RunId;
import org.csanchez.jenkins.plugins.kubernetes.pipeline.steps.SetupCloud;
import org.junit.Test;

Expand All @@ -29,6 +30,7 @@ public KubernetesDeclarativeAgentRJRTest() throws UnknownHostException {

@Test
public void declarative() throws Throwable {
RunId runId = createWorkflowJobThenScheduleRun();
rjr.runRemotely(new AssertBuildStatusSuccess(runId));
}
}
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
package org.csanchez.jenkins.plugins.kubernetes.pipeline;

import io.fabric8.kubernetes.api.model.NodeBuilder;
import io.fabric8.kubernetes.client.KubernetesClient;
import io.fabric8.kubernetes.client.KubernetesClientBuilder;
import java.net.UnknownHostException;
import org.csanchez.jenkins.plugins.kubernetes.KubernetesTestUtil;
import org.csanchez.jenkins.plugins.kubernetes.pipeline.steps.AssertBuildLogMessage;
import org.csanchez.jenkins.plugins.kubernetes.pipeline.steps.AssertBuildStatusSuccess;
import org.csanchez.jenkins.plugins.kubernetes.pipeline.steps.CreateWorkflowJobThenScheduleTask;
import org.csanchez.jenkins.plugins.kubernetes.pipeline.steps.RunId;
import org.csanchez.jenkins.plugins.kubernetes.pipeline.steps.SetupCloud;
import org.junit.Test;

Expand All @@ -12,6 +19,44 @@ public KubernetesPipelineRJRTest() throws UnknownHostException {

@Test
public void basicPipeline() throws Throwable {
RunId runId = createWorkflowJobThenScheduleRun();
rjr.runRemotely(new AssertBuildStatusSuccess(runId));
}

@Test
public void restartDuringPodLaunch() throws Throwable {
// try to run something on a pod which is not schedulable (disktype=special)
RunId build = rjr.runRemotely(new CreateWorkflowJobThenScheduleTask(
KubernetesTestUtil.loadPipelineScript(getClass(), name.getMethodName() + ".groovy")));
// the pod is created, but not connected yet
rjr.runRemotely(new AssertBuildLogMessage("Created Pod", build));
// restart
rjr.stopJenkins();
rjr.startJenkins();
// update k8s to make a node suitable to schedule (add disktype=special to the node)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A little scary to have the test even temporarily patch the Node! Would be safer to do something to the namespace like a quota 🤔 Fine enough for Kind/CI, could just become a problem if we try to run tests on a permanent cluster.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was the easiest way I found, and with the finally it's granted to be back to the original state.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So long as the JVM terminates normally, yes.

System.out.println("Adding label to node....");
try (KubernetesClient client = new KubernetesClientBuilder().build()) {
String nodeName =
client.nodes().list().getItems().get(0).getMetadata().getName();
client.nodes().withName(nodeName).edit(n -> new NodeBuilder(n)
.editMetadata()
.addToLabels("disktype", "special")
.endMetadata()
.build());

// pod connects back and the build finishes correctly
rjr.runRemotely(new AssertBuildStatusSuccess(build));
} finally {
// clean up
try (KubernetesClient client = new KubernetesClientBuilder().build()) {
String nodeName =
client.nodes().list().getItems().get(0).getMetadata().getName();
client.nodes().withName(nodeName).edit(n -> new NodeBuilder(n)
.editMetadata()
.removeFromLabels("disktype")
.endMetadata()
.build());
}
}
}
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Without the fix this test fails with the log in the linked ticket.

}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import java.net.UnknownHostException;
import org.csanchez.jenkins.plugins.kubernetes.pipeline.steps.AssertBuildStatusSuccess;
import org.csanchez.jenkins.plugins.kubernetes.pipeline.steps.RunId;
import org.csanchez.jenkins.plugins.kubernetes.pipeline.steps.SetupCloud;
import org.junit.Test;

Expand All @@ -13,6 +14,7 @@ public KubernetesPipelineWebsocketRJRTest() throws UnknownHostException {

@Test
public void basicPipeline() throws Throwable {
RunId runId = createWorkflowJobThenScheduleRun();
rjr.runRemotely(new AssertBuildStatusSuccess(runId));
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package org.csanchez.jenkins.plugins.kubernetes.pipeline.steps;

import org.jenkinsci.plugins.workflow.job.WorkflowJob;
import org.jenkinsci.plugins.workflow.job.WorkflowRun;
import org.jvnet.hudson.test.JenkinsRule;
import org.jvnet.hudson.test.RealJenkinsRule;

public class AssertBuildLogMessage implements RealJenkinsRule.Step {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW you can also make these things static methods in some util class; a bit less boilerplate.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, but less reusable then...

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How is that less reusable?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When writing a new test it seems natural to go to the steps package and look for generic steps to use in the new test, however inspecting other test classes looking for static methods seems less discoverable to me at least. That will likely end up with re-writing the same (or very similar) static methods in multiple test classes.

Copy link
Member

@jglick jglick May 29, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

inspecting other test classes looking for static methods

I meant reusable static methods in a utility class in the steps package. Or just a single class in the main test package called Steps. Etc.


private final String message;
private final RunId runId;

public AssertBuildLogMessage(String message, RunId runId) {
this.message = message;
this.runId = runId;
}

@Override
public void run(JenkinsRule r) throws Throwable {
WorkflowJob p = r.jenkins.getItemByFullName(runId.name, WorkflowJob.class);
WorkflowRun b = p.getBuildByNumber(runId.number);
r.waitForMessage(message, b);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package org.csanchez.jenkins.plugins.kubernetes.pipeline.steps;

import org.jenkinsci.plugins.workflow.cps.CpsFlowDefinition;
import org.jenkinsci.plugins.workflow.job.WorkflowJob;
import org.jenkinsci.plugins.workflow.job.WorkflowRun;
import org.jvnet.hudson.test.JenkinsRule;
import org.jvnet.hudson.test.RealJenkinsRule;

/**
* Creates a workflow job using the specified script, then schedules it and returns a reference to the run.
*/
public class CreateWorkflowJobThenScheduleTask implements RealJenkinsRule.Step2<RunId> {
private String script;

public CreateWorkflowJobThenScheduleTask(String script) {
this.script = script;
}

@Override
public RunId run(JenkinsRule r) throws Throwable {
WorkflowJob project = r.createProject(WorkflowJob.class);
project.setDefinition(new CpsFlowDefinition(script, true));
project.save();
System.out.println("Scheduling build...");
WorkflowRun b = project.scheduleBuild2(0).getStartCondition().get();
System.out.println("Build scheduled...");
return new RunId(project.getFullName(), b.number);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
podTemplate(yaml: '''
apiVersion: v1
kind: Pod
spec:
nodeSelector:
disktype: special
''') {
node(POD_LABEL) {
sh 'true'
}
}
2 changes: 1 addition & 1 deletion test-in-k8s.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ rules:
verbs: ["watch"]
- apiGroups: [""]
resources: ["nodes"]
verbs: ["list"]
verbs: ["list","get","patch","update"] # KubernetesPipelineRJRTest.restartDuringPodLaunch
- apiGroups: [""]
resources: ["secrets"]
verbs: ["create","delete","get","list","patch","update","watch"]
Expand Down