From 917b09617de12077f97d3e4c2771b36138c3cc30 Mon Sep 17 00:00:00 2001 From: Israel Blancas Date: Tue, 12 Nov 2024 21:35:17 +0100 Subject: [PATCH] Fix error when the operator metrics ServiceMonitor already exists (#3447) * Fix error when the operator metrics ServiceMonitor already exists Signed-off-by: Israel Blancas * Wrap the errors and log them Signed-off-by: Israel Blancas --------- Signed-off-by: Israel Blancas --- .chloggen/3446.yaml | 19 +++++ internal/operator-metrics/metrics.go | 82 +++++++++++++++++---- internal/operator-metrics/metrics_test.go | 86 +++++++++++++++++++++-- main.go | 2 +- 4 files changed, 167 insertions(+), 22 deletions(-) create mode 100755 .chloggen/3446.yaml diff --git a/.chloggen/3446.yaml b/.chloggen/3446.yaml new file mode 100755 index 0000000000..a8cb5de7ad --- /dev/null +++ b/.chloggen/3446.yaml @@ -0,0 +1,19 @@ +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: 'bug_fix' + +# The name of the component, or a single word describing the area of concern, (e.g. collector, target allocator, auto-instrumentation, opamp, github action) +component: operator + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: Operator pod crashed if the Service Monitor for the operator metrics was created before by another operator pod. + +# One or more tracking issues related to the change +issues: [3446] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: | + Operator fails when the pod is restarted and the Service Monitor for operator metrics was already created by another operator pod. + To fix this, the operator now sets the owner reference on the Service Monitor to itself and checks if the Service Monitor already exists. + diff --git a/internal/operator-metrics/metrics.go b/internal/operator-metrics/metrics.go index 43b3a607e3..dd95e16e7e 100644 --- a/internal/operator-metrics/metrics.go +++ b/internal/operator-metrics/metrics.go @@ -19,8 +19,11 @@ import ( "fmt" "os" + "github.com/go-logr/logr" monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/intstr" @@ -51,9 +54,10 @@ var _ manager.Runnable = &OperatorMetrics{} type OperatorMetrics struct { kubeClient client.Client + log logr.Logger } -func NewOperatorMetrics(config *rest.Config, scheme *runtime.Scheme) (OperatorMetrics, error) { +func NewOperatorMetrics(config *rest.Config, scheme *runtime.Scheme, log logr.Logger) (OperatorMetrics, error) { kubeClient, err := client.New(config, client.Options{Scheme: scheme}) if err != nil { return OperatorMetrics{}, err @@ -61,16 +65,74 @@ func NewOperatorMetrics(config *rest.Config, scheme *runtime.Scheme) (OperatorMe return OperatorMetrics{ kubeClient: kubeClient, + log: log, }, nil } func (om OperatorMetrics) Start(ctx context.Context) error { + err := om.createOperatorMetricsServiceMonitor(ctx) + if err != nil { + om.log.Error(err, "error creating Service Monitor for operator metrics") + } + + return nil +} + +func (om OperatorMetrics) NeedLeaderElection() bool { + return true +} + +func (om OperatorMetrics) caConfigMapExists() bool { + return om.kubeClient.Get(context.Background(), client.ObjectKey{ + Name: caBundleConfigMap, + Namespace: openshiftInClusterMonitoringNamespace, + }, &corev1.ConfigMap{}, + ) == nil +} + +func (om OperatorMetrics) getOwnerReferences(ctx context.Context, namespace string) (metav1.OwnerReference, error) { + var deploymentList appsv1.DeploymentList + + listOptions := []client.ListOption{ + client.InNamespace(namespace), + client.MatchingLabels(map[string]string{ + "app.kubernetes.io/name": "opentelemetry-operator", + "control-plane": "controller-manager", + }), + } + + err := om.kubeClient.List(ctx, &deploymentList, listOptions...) + if err != nil { + return metav1.OwnerReference{}, err + } + + if len(deploymentList.Items) == 0 { + return metav1.OwnerReference{}, fmt.Errorf("no deployments found with the specified label") + } + deployment := &deploymentList.Items[0] + + ownerRef := metav1.OwnerReference{ + APIVersion: "apps/v1", + Kind: "Deployment", + Name: deployment.Name, + UID: deployment.UID, + } + + return ownerRef, nil +} + +func (om OperatorMetrics) createOperatorMetricsServiceMonitor(ctx context.Context) error { rawNamespace, err := os.ReadFile(namespaceFile) if err != nil { return fmt.Errorf("error reading namespace file: %w", err) } namespace := string(rawNamespace) + ownerRef, err := om.getOwnerReferences(ctx, namespace) + if err != nil { + return fmt.Errorf("error getting owner references: %w", err) + } + var tlsConfig *monitoringv1.TLSConfig if om.caConfigMapExists() { @@ -101,6 +163,7 @@ func (om OperatorMetrics) Start(ctx context.Context) error { "app.kubernetes.io/part-of": "opentelemetry-operator", "control-plane": "controller-manager", }, + OwnerReferences: []metav1.OwnerReference{ownerRef}, }, Spec: monitoringv1.ServiceMonitorSpec{ Selector: metav1.LabelSelector{ @@ -123,23 +186,12 @@ func (om OperatorMetrics) Start(ctx context.Context) error { } err = om.kubeClient.Create(ctx, &sm) - if err != nil { - return fmt.Errorf("error creating service monitor: %w", err) + // The ServiceMonitor can be already there if this is a restart + if err != nil && !apierrors.IsAlreadyExists(err) { + return err } <-ctx.Done() return om.kubeClient.Delete(ctx, &sm) } - -func (om OperatorMetrics) NeedLeaderElection() bool { - return true -} - -func (om OperatorMetrics) caConfigMapExists() bool { - return om.kubeClient.Get(context.Background(), client.ObjectKey{ - Name: caBundleConfigMap, - Namespace: openshiftInClusterMonitoringNamespace, - }, &corev1.ConfigMap{}, - ) == nil -} diff --git a/internal/operator-metrics/metrics_test.go b/internal/operator-metrics/metrics_test.go index ae625bfff4..a0293fa2e5 100644 --- a/internal/operator-metrics/metrics_test.go +++ b/internal/operator-metrics/metrics_test.go @@ -17,12 +17,15 @@ package operatormetrics import ( "context" "os" + "reflect" "testing" "time" + "github.com/go-logr/logr" monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -30,13 +33,14 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/rest" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) func TestNewOperatorMetrics(t *testing.T) { config := &rest.Config{} scheme := runtime.NewScheme() - metrics, err := NewOperatorMetrics(config, scheme) + metrics, err := NewOperatorMetrics(config, scheme, logr.Discard()) assert.NoError(t, err) assert.NotNil(t, metrics.kubeClient) } @@ -53,12 +57,15 @@ func TestOperatorMetrics_Start(t *testing.T) { namespaceFile = tmpFile.Name() scheme := runtime.NewScheme() - err = corev1.AddToScheme(scheme) - require.NoError(t, err) - err = monitoringv1.AddToScheme(scheme) - require.NoError(t, err) + require.NoError(t, corev1.AddToScheme(scheme)) + require.NoError(t, appsv1.AddToScheme(scheme)) + require.NoError(t, monitoringv1.AddToScheme(scheme)) - client := fake.NewClientBuilder().WithScheme(scheme).Build() + client := fake.NewClientBuilder().WithScheme(scheme).WithRuntimeObjects( + &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Name: "opentelemetry-operator", Namespace: "test-namespace", Labels: map[string]string{"app.kubernetes.io/name": "opentelemetry-operator", "control-plane": "controller-manager"}}, + }, + ).Build() metrics := OperatorMetrics{kubeClient: client} @@ -125,3 +132,70 @@ func TestOperatorMetrics_caConfigMapExists(t *testing.T) { metricsWithoutConfigMap := OperatorMetrics{kubeClient: clientWithoutConfigMap} assert.False(t, metricsWithoutConfigMap.caConfigMapExists()) } + +func TestOperatorMetrics_getOwnerReferences(t *testing.T) { + tests := []struct { + name string + namespace string + objects []client.Object + want metav1.OwnerReference + wantErr bool + }{ + { + name: "successful owner reference retrieval", + namespace: "test-namespace", + objects: []client.Object{ + &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "opentelemetry-operator", + Namespace: "test-namespace", + UID: "test-uid", + Labels: map[string]string{ + "app.kubernetes.io/name": "opentelemetry-operator", + "control-plane": "controller-manager", + }, + }, + }, + }, + want: metav1.OwnerReference{ + APIVersion: "apps/v1", + Kind: "Deployment", + Name: "opentelemetry-operator", + UID: "test-uid", + }, + wantErr: false, + }, + { + name: "no deployments found", + namespace: "test-namespace", + objects: []client.Object{}, + want: metav1.OwnerReference{}, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + scheme := runtime.NewScheme() + _ = appsv1.AddToScheme(scheme) + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(tt.objects...). + Build() + + om := OperatorMetrics{ + kubeClient: fakeClient, + log: logr.Discard(), + } + + got, err := om.getOwnerReferences(context.Background(), tt.namespace) + if (err != nil) != tt.wantErr { + t.Errorf("getOwnerReferences() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !reflect.DeepEqual(got, tt.want) { + t.Errorf("getOwnerReferences() got = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/main.go b/main.go index b6330b2ef1..62ad926d87 100644 --- a/main.go +++ b/main.go @@ -424,7 +424,7 @@ func main() { } if cfg.PrometheusCRAvailability() == prometheus.Available { - operatorMetrics, opError := operatormetrics.NewOperatorMetrics(mgr.GetConfig(), scheme) + operatorMetrics, opError := operatormetrics.NewOperatorMetrics(mgr.GetConfig(), scheme, ctrl.Log.WithName("operator-metrics-sm")) if opError != nil { setupLog.Error(opError, "Failed to create the operator metrics SM") }