From b0c67638ab0f5612caab5633977d09b2a740df27 Mon Sep 17 00:00:00 2001 From: Nikhil Thomas Date: Thu, 27 Jan 2022 07:15:31 +0530 Subject: [PATCH] Fix upgrade deadlock in Pipeline and Triggers Add a mechanism to disable webhook validation if the configMaps are not already there in the cluster. What this patch does: when webhook endpoint slice is [], the "rules" section of config.webhook.** validatingwebhookconfiguration is remove. So that the updates on configMaps eg: config-defaults are not deadlocked. The dead lock is caused due to a cluster level cyclic dependency in tektoncd/pipelines and in tektoncd/triggers. More details will be captured in issues in the both repository. Note: infact this deadlock can occur in any tektoncd project which follows tektoncd/pipeline file list structure in `config/`. Signed-off-by: Nikhil Thomas --- pkg/reconciler/common/deadlockbreaker.go | 92 +++++++++++++++++++ .../tektonpipeline/tektonpipeline.go | 8 +- .../kubernetes/tektontrigger/controller.go | 2 + .../kubernetes/tektontrigger/tektontrigger.go | 10 +- 4 files changed, 110 insertions(+), 2 deletions(-) create mode 100644 pkg/reconciler/common/deadlockbreaker.go diff --git a/pkg/reconciler/common/deadlockbreaker.go b/pkg/reconciler/common/deadlockbreaker.go new file mode 100644 index 0000000000..073ccd69b9 --- /dev/null +++ b/pkg/reconciler/common/deadlockbreaker.go @@ -0,0 +1,92 @@ +/* +Copyright 2022 The Tekton Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package common + +import ( + "context" + + "github.com/manifestival/manifestival" + "github.com/tektoncd/operator/pkg/apis/operator/v1alpha1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/client-go/kubernetes" +) + +var webhookNames = map[string]string{ + v1alpha1.PipelineResourceName: "config.webhook.pipeline.tekton.dev", + v1alpha1.TriggerResourceName: "config.webhook.triggers.tekton.dev", +} + +var webhookServiceNames = map[string]string{ + v1alpha1.PipelineResourceName: "tekton-pipelines-webhook", + v1alpha1.TriggerResourceName: "tekton-triggers-webhook", +} + +func PreemptDeadlock(ctx context.Context, m *manifestival.Manifest, kc kubernetes.Interface, component string) error { + + // check if there are pod endpoints populated for webhhook service + webhookServiceName := webhookServiceNames[component] + ok, err := isWebhookEndpointsActive(m, kc, webhookServiceName) + if err != nil { + return err + } + if ok { + return nil + } + // if endpoints are empty, set webhook definition rules + // to the initial state where the webhook pod can refill the rules when it comes up + webhookName := webhookNames[component] + err = removeValidatingWebhookRules(m, kc, webhookName) + if err != nil { + return err + } + return nil +} + +// isWebhookEndpointsActive checks if the there are valid Endpoint resources associated with a webhook service +func isWebhookEndpointsActive(m *manifestival.Manifest, kc kubernetes.Interface, svcName string) (bool, error) { + svcResource := m.Filter(manifestival.ByKind("Service"), manifestival.ByName(svcName)) + targetNamespace := svcResource.Resources()[0].GetNamespace() + endPoint, err := kc.CoreV1().Endpoints(targetNamespace).Get(context.TODO(), svcName, v1.GetOptions{}) + if err != nil { + if apierrors.IsNotFound(err) { + return false, nil + } + return false, err + } + if endPoint.Subsets == nil || len(endPoint.Subsets) == 0 { + return false, nil + } + return true, nil +} + +// removeValidatingWebhookRules remove "rules" from config.webhook.** webhook definiton(s) +func removeValidatingWebhookRules(m *manifestival.Manifest, kc kubernetes.Interface, webhookName string) error { + cmValidationWebHookManifest := m.Filter(manifestival.ByName(webhookName)) + cmValidationWebHookManifest, err := cmValidationWebHookManifest.Transform(removeWebhooks) + if err != nil { + return err + } + return cmValidationWebHookManifest.Apply() +} + +// removeWebhooks is a Transformer function which clears our webhooks[...].rules +func removeWebhooks(u *unstructured.Unstructured) error { + unstructured.RemoveNestedField(u.Object, "webhooks") + return nil +} diff --git a/pkg/reconciler/kubernetes/tektonpipeline/tektonpipeline.go b/pkg/reconciler/kubernetes/tektonpipeline/tektonpipeline.go index 2cb6fdc100..a2356c0538 100644 --- a/pkg/reconciler/kubernetes/tektonpipeline/tektonpipeline.go +++ b/pkg/reconciler/kubernetes/tektonpipeline/tektonpipeline.go @@ -263,8 +263,14 @@ func (r *Reconciler) ReconcileKind(ctx context.Context, tp *v1alpha1.TektonPipel return nil } else if ready.Status == corev1.ConditionFalse { tp.Status.MarkInstallerSetNotReady(ready.Message) + manifest := r.manifest + if err := r.transform(ctx, &manifest, tp); err != nil { + logger.Error("manifest transformation failed: ", err) + return err + } + err = common.PreemptDeadlock(ctx, &manifest, r.kubeClientSet, v1alpha1.PipelineResourceName) r.enqueueAfter(tp, 10*time.Second) - return nil + return err } // Mark InstallerSet Ready diff --git a/pkg/reconciler/kubernetes/tektontrigger/controller.go b/pkg/reconciler/kubernetes/tektontrigger/controller.go index 6da71eb1b1..71f3845dd5 100644 --- a/pkg/reconciler/kubernetes/tektontrigger/controller.go +++ b/pkg/reconciler/kubernetes/tektontrigger/controller.go @@ -20,6 +20,7 @@ import ( "context" "github.com/tektoncd/operator/pkg/reconciler/kubernetes/initcontroller" + kubeclient "knative.dev/pkg/client/injection/kube/client" tektonInstallerinformer "github.com/tektoncd/operator/pkg/client/injection/informers/operator/v1alpha1/tektoninstallerset" tektonPipelineinformer "github.com/tektoncd/operator/pkg/client/injection/informers/operator/v1alpha1/tektonpipeline" @@ -67,6 +68,7 @@ func NewExtendedController(generator common.ExtensionGenerator) injection.Contro } c := &Reconciler{ + kubeClientSet: kubeclient.Get(ctx), operatorClientSet: operatorclient.Get(ctx), pipelineInformer: tektonPipelineinformer.Get(ctx), extension: generator(ctx), diff --git a/pkg/reconciler/kubernetes/tektontrigger/tektontrigger.go b/pkg/reconciler/kubernetes/tektontrigger/tektontrigger.go index 50fb71a4bb..3eb3d38c9a 100644 --- a/pkg/reconciler/kubernetes/tektontrigger/tektontrigger.go +++ b/pkg/reconciler/kubernetes/tektontrigger/tektontrigger.go @@ -35,6 +35,7 @@ import ( apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" "knative.dev/pkg/apis" "knative.dev/pkg/logging" pkgreconciler "knative.dev/pkg/reconciler" @@ -66,6 +67,7 @@ type Reconciler struct { enqueueAfter func(obj interface{}, after time.Duration) triggersVersion string operatorVersion string + kubeClientSet kubernetes.Interface } // Check that our Reconciler implements controller.Reconciler @@ -277,8 +279,14 @@ func (r *Reconciler) ReconcileKind(ctx context.Context, tt *v1alpha1.TektonTrigg return nil } else if ready.Status == corev1.ConditionFalse { tt.Status.MarkInstallerSetNotReady(ready.Message) + manifest := r.manifest + if err := r.transform(ctx, &manifest, tt); err != nil { + logger.Error("manifest transformation failed: ", err) + return err + } + err = common.PreemptDeadlock(ctx, &manifest, r.kubeClientSet, v1alpha1.TriggerResourceName) r.enqueueAfter(tt, 10*time.Second) - return nil + return err } // Mark InstallerSet Ready