Skip to content

Commit

Permalink
kube-updater: Wire up main executable (#23565)
Browse files Browse the repository at this point in the history
* kube-updater: write main function, fix scheme, reduce grace period

* fixup! kube-updater: write main function, fix scheme, reduce grace period

* Address feedback

* Update integrations/kube-agent-updater/cmd/teleport-kube-agent-updater/main.go

Co-authored-by: Roman Tkachenko <roman@goteleport.com>

---------

Co-authored-by: Roman Tkachenko <roman@goteleport.com>
  • Loading branch information
hugoShaka and r0mant authored Mar 31, 2023
1 parent 0f3c14e commit 311b1be
Show file tree
Hide file tree
Showing 9 changed files with 207 additions and 20 deletions.
26 changes: 26 additions & 0 deletions integrations/kube-agent-updater/DEBUG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
## Debugging tips for the kube-agent-updater

### Running the updater locally against a remote Kubernetes cluster

Running the updater locally lets you attach a debugger while still working
against a real cluster. This can be used to reproduce most complex issues and
troubleshoot specific cases.

- Validate your current context works
```shell
kubectl cluster-info
```
- Open a proxy to the api-server, then leave the shell open and running
```shell
kubectl proxy
```
- Open a new terminal, create a new temporary file and write your new kubeconfig into it
```shell
export KUBECONFIG="$(mktemp)"
kubectl config set-credentials myself --username=foo
kubectl config set-cluster local-server --server=http://localhost:8001
kubectl config set-context default-context --cluster=local-server --user=myself
kubectl config use-context default-context
echo "$KUBECONFIG"
```
- Run the controller with the `KUBECONFIG` environment variable set
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*
Copyright 2023 Gravitational, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package main

// teleportProdOCIPubKey is the PEM-encoded public key used to verify the
// signature of Teleport distroless images.
// The matching private key lives in the Teleport production AWS KMS.
// In case of controlled rotation, we will want to add a second validator with
// the new key to support the transition period.
var teleportProdOCIPubKey = []byte(`-----BEGIN PUBLIC KEY-----
MIICIjANBgkqhkiG9w0BAQEFAAOCAg8AMIICCgKCAgEAx+9UZboMl9ibwu/IWqbX
+wEJeKJqVpaLEsy1ODRpzIgcgaMh2n3BWtFEIoEszR3ZNlGdfqoPmb0nNnWx/qSf
eEsoSXievXa63M/gAUBB+jecbGEJH+SNaJPMVuvjabPqKtoMT2Spw3cacqpINzq1
rkWU8IawY333gXbwzgsuK7izT7ymgOLPO9qPuX7Q3EBaGw3EvY7u6UKtqhvSGdyr
MirEErOERQ8EP8TrkCcJk0UfPAukzIcj91uHlXaqYBD/IyNYiC70EOlSLoN5/EeA
I4jQnGRfaKF6H6K+WieX9tP9k8/02S+1EVJW592pdQZhJZEq1B/dMc8UR3IjPMMC
qCT2xT6TsinaVzDaAbaRf0hvp311GxwrckNofGm/OSLn1+HqM6q4/A7qHubeRXGO
byabRr93CHSLegZ7OBMswHqqnu6/DuXjc6gOsQkH09dVTFeh34rQy4GKrvnpmOwj
Er1ccxzKcF/pw+lxi07hkpihR/uHUPxFboA/Wl7H2Jub21MFwIFQrDJv7z8yQgxJ
EuIXJJox2oAL7NzdSi9VIUYnEnx+2EtkU/spAFRR6i1BnT6aoIy3521B76wnmRr9
atCSKjt6MdRxgj4htCjBWWJAGM9Z/avF4CYFmK7qiVxgpdrSM8Esbt2Ta+Lu3QMJ
T8LjqFu3u3dxVOo9RuLk+BkCAwEAAQ==
-----END PUBLIC KEY-----`)
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,21 @@ package main

import (
"flag"
"net/url"
"os"
"strings"
"time"

"github.com/docker/distribution/reference"
"github.com/gravitational/trace"
appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/cache"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
runtimescheme "sigs.k8s.io/controller-runtime/pkg/scheme"

"github.com/gravitational/teleport/integrations/kube-agent-updater/pkg/controller"
"github.com/gravitational/teleport/integrations/kube-agent-updater/pkg/img"
Expand All @@ -39,18 +41,12 @@ import (
)

var (
SchemeBuilder = &runtimescheme.Builder{GroupVersion: appsv1.SchemeGroupVersion}
scheme = runtime.NewScheme()
scheme = runtime.NewScheme()
)

func init() {
SchemeBuilder.Register(
&appsv1.Deployment{},
&appsv1.DeploymentList{},
&appsv1.StatefulSet{},
&appsv1.StatefulSetList{},
)
utilruntime.Must(SchemeBuilder.AddToScheme(scheme))
utilruntime.Must(appsv1.AddToScheme(scheme))
utilruntime.Must(v1.AddToScheme(scheme))
}

func main() {
Expand All @@ -61,12 +57,22 @@ func main() {
var metricsAddr string
var probeAddr string
var syncPeriod time.Duration
var baseImageName string
var versionServer string
var versionChannel string
var insecureNoVerify bool
var disableLeaderElection bool

flag.StringVar(&agentName, "agent-name", "", "The name of the agent that should be updated. This is mandatory.")
flag.StringVar(&agentNamespace, "agent-namespace", "", "The namespace of the agent that should be updated. This is mandatory.")
flag.StringVar(&metricsAddr, "metrics-addr", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "healthz-addr", ":8081", "The address the probe endpoint binds to.")
flag.DurationVar(&syncPeriod, "sync-period", 10*time.Hour, "Operator sync period (format: https://pkg.go.dev/time#ParseDuration)")
flag.BoolVar(&insecureNoVerify, "insecure-no-verify-image", false, "Disable image signature verification.")
flag.BoolVar(&disableLeaderElection, "disable-leader-election", false, "Disable leader election, used when running the kube-agent-updater outside of Kubernetes.")
flag.StringVar(&versionServer, "version-server", "https://update.gravitational.io/v1/", "URL of the HTTP server advertising target version and critical maintenances. Trailing slash is optional.")
flag.StringVar(&versionChannel, "version-channel", "cloud/stable", "Version channel to get updates from.")
flag.StringVar(&baseImageName, "base-image", "public.ecr.aws/gravitational/teleport", "Image reference containing registry and repository.")

opts := zap.Options{
Development: true,
Expand All @@ -88,7 +94,7 @@ func main() {
MetricsBindAddress: metricsAddr,
Port: 9443,
HealthProbeBindAddress: probeAddr,
LeaderElection: true,
LeaderElection: !disableLeaderElection,
LeaderElectionID: agentName,
Namespace: agentNamespace,
SyncPeriod: &syncPeriod,
Expand All @@ -108,16 +114,36 @@ func main() {
os.Exit(1)
}

// TODO: replace those mocks by the real thing
versionGetter := version.NewGetterMock("12.0.3", nil)
imageValidators := []img.Validator{
img.NewImageValidatorMock("mock", true, img.NewImageRef("", "", "", "")),
// Join the version server base URL and the channel with exactly one slash,
// whether or not the base URL was given with trailing slashes.
versionServerURL, err := url.Parse(strings.TrimRight(versionServer, "/") + "/" + versionChannel)
if err != nil {
	ctrl.Log.Error(err, "failed to parse version server URL, exiting")
	os.Exit(1)
}
versionGetter := version.NewBasicHTTPVersionGetter(versionServerURL)
maintenanceTriggers := maintenance.Triggers{
maintenance.NewBasicHTTPMaintenanceTrigger("critical update", versionServerURL),
maintenance.NewUnhealthyWorkloadTrigger("unhealthy pods", mgr.GetClient()),
maintenance.NewWindowTrigger("maintenance window", mgr.GetClient()),
}

var imageValidators img.Validators
if insecureNoVerify {
ctrl.Log.Info("INSECURE: Image validation disabled")
imageValidators = append(imageValidators, img.NewInsecureValidator("insecure always verify"))
} else {
validator, err := img.NewCosignSingleKeyValidator(teleportProdOCIPubKey, "cosign signature validator")
if err != nil {
ctrl.Log.Error(err, "failed to build image validator, exiting")
os.Exit(1)
}
imageValidators = append(imageValidators, validator)
}
maintenanceTriggers := []maintenance.Trigger{
maintenance.NewMaintenanceTriggerMock("never", false),

baseImage, err := reference.ParseNamed(baseImageName)
if err != nil {
ctrl.Log.Error(err, "failed to parse base image reference, exiting")
os.Exit(1)
}
baseImage, _ := reference.ParseNamed("public.ecr.aws/trent-playground/gravitational/teleport")
// End of mocks

versionUpdater := controller.NewVersionUpdater(versionGetter, imageValidators, maintenanceTriggers, baseImage)

Expand Down
3 changes: 3 additions & 0 deletions integrations/kube-agent-updater/pkg/controller/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ const (
defaultRequeue = 30 * time.Minute
reconciliationTimeout = 2 * time.Minute
kubeClientTimeout = 1 * time.Minute
// skipReconciliationAnnotation is inspired by the tenant-operator one
// (from the Teleport Cloud) but namespaced under `teleport.dev`
skipReconciliationAnnotation = "teleport.dev/skipreconcile"
)

var (
Expand Down
4 changes: 4 additions & 0 deletions integrations/kube-agent-updater/pkg/controller/deployment.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@ func (r *DeploymentVersionUpdater) Reconcile(ctx context.Context, req ctrl.Reque
}
return ctrl.Result{}, trace.Wrap(err)
}
if skipReconciliation(&obj) {
log.Info("Reconciliation disabled by resource annotations. Skipping.")
return requeueLater, nil
}

// Get the current and past version
currentVersion, err := getWorkloadVersion(obj.Spec.Template.Spec)
Expand Down
4 changes: 4 additions & 0 deletions integrations/kube-agent-updater/pkg/controller/statefulset.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@ func (r *StatefulSetVersionUpdater) Reconcile(ctx context.Context, req ctrl.Requ
}
return ctrl.Result{}, trace.Wrap(err)
}
if skipReconciliation(&obj) {
log.Info("Reconciliation disabled by resource annotations. Skipping.")
return requeueLater, nil
}

// Get the current and past version
currentVersion, err := getWorkloadVersion(obj.Spec.Template.Spec)
Expand Down
19 changes: 19 additions & 0 deletions integrations/kube-agent-updater/pkg/controller/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,12 @@ limitations under the License.
package controller

import (
"strconv"

"github.com/docker/distribution/reference"
"github.com/gravitational/trace"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"github.com/gravitational/teleport/integrations/kube-agent-updater/pkg/version"
)
Expand Down Expand Up @@ -65,3 +68,19 @@ func setContainerImageFromPodSpec(spec *v1.PodSpec, container, image string) err
}
return trace.NotFound("container %q not found in podSpec", container)
}

// skipReconciliation reports whether the object carries the annotation asking
// the updater to leave it alone. Suspending reconciliation this way is handy
// when debugging, or whenever the user wants to pause the updater.
// The annotation value is parsed as a boolean; a missing annotation or a value
// that cannot be parsed means "do not skip".
func skipReconciliation(object metav1.Object) bool {
	value, found := object.GetAnnotations()[skipReconciliationAnnotation]
	if !found {
		return false
	}

	skip, err := strconv.ParseBool(value)
	return err == nil && skip
}
63 changes: 63 additions & 0 deletions integrations/kube-agent-updater/pkg/img/insecure.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/*
Copyright 2023 Gravitational, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package img

import (
"context"

"github.com/docker/distribution/reference"
"github.com/gravitational/trace"
"github.com/opencontainers/go-digest"
)

// insecureValidator is a validator that resolves image digests without
// verifying any signature. It is built by NewInsecureValidator.
type insecureValidator struct {
// name is the validator name returned by Name.
name string
}

// Name returns the validator name, as given to NewInsecureValidator.
func (v *insecureValidator) Name() string {
return v.name
}

// TODO: cache this to protect against registry quotas
// The image validation is only invoked when we are in a maintenance window and
// the target version is different from our current version. In regular usage we
// are called only once per update. However, the usual failure mode of
// Kubernetes controllers is an infinite retry loop. If something fails after
// the image validation, we might get called in a loop indefinitely. To mitigate
// the impact of such a failure, ValidateAndResolveDigest should cache its result.

// ValidateAndResolveDigest looks up the digest of the given image and always
// reports the image as valid, without checking any signature. Relying on this
// validator leaves you exposed if the image registry gets compromised.
// An error is returned only when the digest cannot be resolved.
func (v *insecureValidator) ValidateAndResolveDigest(ctx context.Context, image reference.NamedTagged) (NamedTaggedDigested, error) {
	resolved, err := NamedTaggedToDigest(image)
	if err != nil {
		return nil, trace.Wrap(err)
	}

	// Keep the tag of the requested image and attach the resolved digest to it.
	return NewImageRef(
		resolved.RegistryStr(),
		resolved.RepositoryStr(),
		image.Tag(),
		digest.Digest(resolved.DigestStr()),
	), nil
}

// NewInsecureValidator builds an img.Validator with the given name that
// resolves the image digest but never checks the image signature.
func NewInsecureValidator(name string) Validator {
	return &insecureValidator{name: name}
}
8 changes: 7 additions & 1 deletion integrations/kube-agent-updater/pkg/podutils/filter.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,13 @@ func Not(filterFunc FilterFunc) FilterFunc {
}
}

const podReadinessGracePeriod = 10 * time.Minute
// podReadinessGracePeriod represents how much time we wait before we consider
// the pod (and a fortiori the workload) unhealthy. We might want to empirically
// tune this value. A higher value can lead to workloads being stuck longer in
// case of error. A shorter value might cause false positives and trigger
// updates because of other cluster-related events like network issues, registry
// downtime or missing capacity.
const podReadinessGracePeriod = 5 * time.Minute

// IsUnhealthy checks if a pod has not been ready for at least podReadinessGracePeriod.
// This heuristic also detects infrastructure issues like not enough room to
Expand Down

0 comments on commit 311b1be

Please sign in to comment.