From 4103b32d0a0db4751daa265921e2ac696f3ccb11 Mon Sep 17 00:00:00 2001 From: Mohamed-Amine Bouqsimi Date: Tue, 4 Oct 2022 17:58:52 +0200 Subject: [PATCH] operator: Use cluster monitoring alertmanager by default on openshift clusters (#7272) --- operator/CHANGELOG.md | 1 + .../loki-operator.clusterserviceversion.yaml | 6 + operator/config/rbac/role.yaml | 6 + .../controllers/loki/lokistack_controller.go | 1 + .../internal/openshift/alertmanager.go | 30 +++ .../handlers/lokistack_create_or_update.go | 15 ++ operator/internal/manifests/config.go | 9 +- operator/internal/manifests/config_test.go | 204 +++++++++++++++++- .../internal/manifests/gateway_tenants.go | 17 ++ .../manifests/gateway_tenants_test.go | 2 + .../internal/config/loki-config.yaml | 4 +- .../internal/manifests/openshift/build.go | 20 +- .../manifests/openshift/build_test.go | 31 ++- .../internal/manifests/openshift/configure.go | 59 +++++ .../internal/manifests/openshift/options.go | 4 + operator/internal/manifests/openshift/rbac.go | 71 +++++- .../manifests/openshift/serviceaccount.go | 25 ++- .../openshift/serviceaccount_test.go | 8 +- operator/internal/manifests/openshift/var.go | 17 +- operator/internal/manifests/ruler.go | 55 ++++- operator/internal/manifests/var.go | 2 + 21 files changed, 550 insertions(+), 37 deletions(-) create mode 100644 operator/internal/handlers/internal/openshift/alertmanager.go diff --git a/operator/CHANGELOG.md b/operator/CHANGELOG.md index 254d800144b50..ca4d894820c91 100644 --- a/operator/CHANGELOG.md +++ b/operator/CHANGELOG.md @@ -1,5 +1,6 @@ ## Main +- [7272](https://github.com/grafana/loki/pull/7272) **aminesnow**: Use cluster monitoring alertmanager by default on openshift clusters - [7295](https://github.com/grafana/loki/pull/7295) **xperimental**: Add extended-validation for rules on OpenShift - [6951](https://github.com/grafana/loki/pull/6951) **Red-GV**: Adding operational Lokistack alerts - [7254](https://github.com/grafana/loki/pull/7254) **periklis**: 
Expose Loki Ruler API via the lokistack-gateway diff --git a/operator/bundle/manifests/loki-operator.clusterserviceversion.yaml b/operator/bundle/manifests/loki-operator.clusterserviceversion.yaml index 1b7fa2495683b..2315967ded340 100644 --- a/operator/bundle/manifests/loki-operator.clusterserviceversion.yaml +++ b/operator/bundle/manifests/loki-operator.clusterserviceversion.yaml @@ -1097,6 +1097,12 @@ spec: - get - patch - update + - apiGroups: + - monitoring.coreos.com + resources: + - alertmanagers + verbs: + - patch - apiGroups: - monitoring.coreos.com resources: diff --git a/operator/config/rbac/role.yaml b/operator/config/rbac/role.yaml index 5dfac08b20c24..38c74b296090b 100644 --- a/operator/config/rbac/role.yaml +++ b/operator/config/rbac/role.yaml @@ -172,6 +172,12 @@ rules: - get - patch - update +- apiGroups: + - monitoring.coreos.com + resources: + - alertmanagers + verbs: + - patch - apiGroups: - monitoring.coreos.com resources: diff --git a/operator/controllers/loki/lokistack_controller.go b/operator/controllers/loki/lokistack_controller.go index ea21d29dca857..9acd6c8b08250 100644 --- a/operator/controllers/loki/lokistack_controller.go +++ b/operator/controllers/loki/lokistack_controller.go @@ -81,6 +81,7 @@ type LokiStackReconciler struct { // +kubebuilder:rbac:groups=apps,resources=deployments;statefulsets,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterrolebindings;clusterroles;roles;rolebindings,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=monitoring.coreos.com,resources=servicemonitors;prometheusrules,verbs=get;list;watch;create;update;delete +// +kubebuilder:rbac:groups=monitoring.coreos.com,resources=alertmanagers,verbs=patch // +kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,verbs=get;create;update // +kubebuilder:rbac:groups=networking.k8s.io,resources=ingresses,verbs=get;list;watch;create;update // 
+kubebuilder:rbac:groups=config.openshift.io,resources=dnses;apiservers,verbs=get;list;watch diff --git a/operator/internal/handlers/internal/openshift/alertmanager.go b/operator/internal/handlers/internal/openshift/alertmanager.go new file mode 100644 index 0000000000000..7172beadeea2c --- /dev/null +++ b/operator/internal/handlers/internal/openshift/alertmanager.go @@ -0,0 +1,30 @@ +package openshift + +import ( + "context" + + "github.com/ViaQ/logerr/v2/kverrors" + lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" + "github.com/grafana/loki/operator/internal/external/k8s" + "github.com/grafana/loki/operator/internal/manifests/openshift" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// AlertManagerSVCExists returns true if the Openshift AlertManager is present in the cluster. +func AlertManagerSVCExists(ctx context.Context, stack lokiv1.LokiStackSpec, k k8s.Client) (bool, error) { + if stack.Tenants == nil || (stack.Tenants.Mode != lokiv1.OpenshiftLogging && stack.Tenants.Mode != lokiv1.OpenshiftNetwork) { + return false, nil + } + + var svc corev1.Service + key := client.ObjectKey{Name: openshift.MonitoringSVCOperated, Namespace: openshift.MonitoringNS} + + err := k.Get(ctx, key, &svc) + if err != nil && !apierrors.IsNotFound(err) { + return false, kverrors.Wrap(err, "failed to lookup alertmanager service", "name", key) + } + + return err == nil, nil +} diff --git a/operator/internal/handlers/lokistack_create_or_update.go b/operator/internal/handlers/lokistack_create_or_update.go index 7731779acfaef..fda5cf3e65228 100644 --- a/operator/internal/handlers/lokistack_create_or_update.go +++ b/operator/internal/handlers/lokistack_create_or_update.go @@ -12,10 +12,12 @@ import ( lokiv1beta1 "github.com/grafana/loki/operator/apis/loki/v1beta1" "github.com/grafana/loki/operator/internal/external/k8s" "github.com/grafana/loki/operator/internal/handlers/internal/gateway" + 
"github.com/grafana/loki/operator/internal/handlers/internal/openshift" "github.com/grafana/loki/operator/internal/handlers/internal/rules" "github.com/grafana/loki/operator/internal/handlers/internal/storage" "github.com/grafana/loki/operator/internal/handlers/internal/tlsprofile" "github.com/grafana/loki/operator/internal/manifests" + manifests_openshift "github.com/grafana/loki/operator/internal/manifests/openshift" storageoptions "github.com/grafana/loki/operator/internal/manifests/storage" "github.com/grafana/loki/operator/internal/metrics" "github.com/grafana/loki/operator/internal/status" @@ -168,6 +170,7 @@ func CreateOrUpdateLokiStack( recordingRules []lokiv1beta1.RecordingRule rulerConfig *lokiv1beta1.RulerConfigSpec rulerSecret *manifests.RulerSecret + ocpAmEnabled bool ) if stack.Spec.Rules != nil && stack.Spec.Rules.Enabled { alertingRules, recordingRules, err = rules.List(ctx, k, req.Namespace, stack.Spec.Rules) @@ -203,6 +206,13 @@ func CreateOrUpdateLokiStack( } } } + + ocpAmEnabled, err = openshift.AlertManagerSVCExists(ctx, stack.Spec, k) + if err != nil { + ll.Error(err, "failed to check OCP AlertManager") + return err + } + } // Here we will translate the lokiv1.LokiStack options into manifest options @@ -226,6 +236,11 @@ func CreateOrUpdateLokiStack( Configs: tenantConfigs, }, TLSProfileType: projectconfigv1.TLSProfileType(fg.TLSProfile), + OpenShiftOptions: manifests_openshift.Options{ + BuildOpts: manifests_openshift.BuildOptions{ + AlertManagerEnabled: ocpAmEnabled, + }, + }, } ll.Info("begin building manifests") diff --git a/operator/internal/manifests/config.go b/operator/internal/manifests/config.go index b86a1ee80d837..655caea5bd894 100644 --- a/operator/internal/manifests/config.go +++ b/operator/internal/manifests/config.go @@ -16,6 +16,13 @@ import ( // LokiConfigMap creates the single configmap containing the loki configuration for the whole cluster func LokiConfigMap(opt Options) (*corev1.ConfigMap, string, error) { cfg := 
ConfigOptions(opt) + + if opt.Stack.Tenants != nil { + if err := ConfigureOptionsForMode(&cfg, opt); err != nil { + return nil, "", err + } + } + c, rc, err := config.Build(cfg) if err != nil { return nil, "", err @@ -55,8 +62,6 @@ func ConfigOptions(opt Options) config.Options { ) if rulerEnabled { - rulerEnabled = true - // Map alertmanager config from CRD to config options if opt.Ruler.Spec != nil { evalInterval = string(opt.Ruler.Spec.EvalutionInterval) diff --git a/operator/internal/manifests/config_test.go b/operator/internal/manifests/config_test.go index 7c28a4042ca69..222d9fda58558 100644 --- a/operator/internal/manifests/config_test.go +++ b/operator/internal/manifests/config_test.go @@ -7,8 +7,10 @@ import ( "github.com/google/uuid" lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" + "github.com/grafana/loki/operator/apis/loki/v1beta1" "github.com/grafana/loki/operator/internal/manifests" "github.com/grafana/loki/operator/internal/manifests/internal/config" + "github.com/grafana/loki/operator/internal/manifests/openshift" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" @@ -28,7 +30,6 @@ func TestConfigOptions_UserOptionsTakePrecedence(t *testing.T) { // the user-defined values. 
This creates an all-inclusive manifests.Options and then checks // that every value is present in the result opts := randomConfigOptions() - res := manifests.ConfigOptions(opts) expected, err := json.Marshal(opts.Stack) @@ -287,3 +288,204 @@ func TestConfigOptions_RetentionConfig(t *testing.T) { }) } } + +func TestConfigOptions_RulerAlertManager(t *testing.T) { + tt := []struct { + desc string + opts manifests.Options + wantOptions *config.AlertManagerConfig + }{ + { + desc: "static mode", + opts: manifests.Options{ + Stack: lokiv1.LokiStackSpec{ + Tenants: &lokiv1.TenantsSpec{ + Mode: lokiv1.Static, + }, + }, + }, + wantOptions: nil, + }, + { + desc: "dynamic mode", + opts: manifests.Options{ + Stack: lokiv1.LokiStackSpec{ + Tenants: &lokiv1.TenantsSpec{ + Mode: lokiv1.Dynamic, + }, + }, + }, + wantOptions: nil, + }, + { + desc: "openshift-logging mode", + opts: manifests.Options{ + Stack: lokiv1.LokiStackSpec{ + Tenants: &lokiv1.TenantsSpec{ + Mode: lokiv1.OpenshiftLogging, + }, + }, + OpenShiftOptions: openshift.Options{ + BuildOpts: openshift.BuildOptions{ + AlertManagerEnabled: true, + }, + }, + }, + wantOptions: &config.AlertManagerConfig{ + EnableV2: true, + EnableDiscovery: true, + RefreshInterval: "1m", + Hosts: "https://_web._tcp.alertmanager-operated.openshift-monitoring.svc", + }, + }, + { + desc: "openshift-network mode", + opts: manifests.Options{ + Stack: lokiv1.LokiStackSpec{ + Tenants: &lokiv1.TenantsSpec{ + Mode: lokiv1.OpenshiftNetwork, + }, + }, + OpenShiftOptions: openshift.Options{ + BuildOpts: openshift.BuildOptions{ + AlertManagerEnabled: true, + }, + }, + }, + wantOptions: &config.AlertManagerConfig{ + EnableV2: true, + EnableDiscovery: true, + RefreshInterval: "1m", + Hosts: "https://_web._tcp.alertmanager-operated.openshift-monitoring.svc", + }, + }, + } + + for _, tc := range tt { + tc := tc + t.Run(tc.desc, func(t *testing.T) { + t.Parallel() + + cfg := manifests.ConfigOptions(tc.opts) + err := manifests.ConfigureOptionsForMode(&cfg, 
tc.opts) + + require.Nil(t, err) + require.Equal(t, tc.wantOptions, cfg.Ruler.AlertManager) + }) + } +} + +func TestConfigOptions_RulerAlertManager_UserOverride(t *testing.T) { + tt := []struct { + desc string + opts manifests.Options + wantOptions *config.AlertManagerConfig + }{ + { + desc: "static mode", + opts: manifests.Options{ + Stack: lokiv1.LokiStackSpec{ + Tenants: &lokiv1.TenantsSpec{ + Mode: lokiv1.Static, + }, + }, + }, + wantOptions: nil, + }, + { + desc: "dynamic mode", + opts: manifests.Options{ + Stack: lokiv1.LokiStackSpec{ + Tenants: &lokiv1.TenantsSpec{ + Mode: lokiv1.Dynamic, + }, + }, + }, + wantOptions: nil, + }, + { + desc: "openshift-logging mode", + opts: manifests.Options{ + Stack: lokiv1.LokiStackSpec{ + Tenants: &lokiv1.TenantsSpec{ + Mode: lokiv1.OpenshiftLogging, + }, + Rules: &lokiv1.RulesSpec{ + Enabled: true, + }, + }, + Ruler: manifests.Ruler{ + Spec: &v1beta1.RulerConfigSpec{ + AlertManagerSpec: &v1beta1.AlertManagerSpec{ + EnableV2: false, + DiscoverySpec: &v1beta1.AlertManagerDiscoverySpec{ + EnableSRV: false, + RefreshInterval: "2m", + }, + Endpoints: []string{"http://my-alertmanager"}, + }, + }, + }, + OpenShiftOptions: openshift.Options{ + BuildOpts: openshift.BuildOptions{ + AlertManagerEnabled: true, + }, + }, + }, + wantOptions: &config.AlertManagerConfig{ + EnableV2: false, + EnableDiscovery: false, + RefreshInterval: "2m", + Hosts: "http://my-alertmanager", + }, + }, + { + desc: "openshift-network mode", + opts: manifests.Options{ + Stack: lokiv1.LokiStackSpec{ + Tenants: &lokiv1.TenantsSpec{ + Mode: lokiv1.OpenshiftNetwork, + }, + Rules: &lokiv1.RulesSpec{ + Enabled: true, + }, + }, + Ruler: manifests.Ruler{ + Spec: &v1beta1.RulerConfigSpec{ + AlertManagerSpec: &v1beta1.AlertManagerSpec{ + EnableV2: false, + DiscoverySpec: &v1beta1.AlertManagerDiscoverySpec{ + EnableSRV: false, + RefreshInterval: "2m", + }, + Endpoints: []string{"http://my-alertmanager"}, + }, + }, + }, + OpenShiftOptions: openshift.Options{ + 
BuildOpts: openshift.BuildOptions{ + AlertManagerEnabled: true, + }, + }, + }, + wantOptions: &config.AlertManagerConfig{ + EnableV2: false, + EnableDiscovery: false, + RefreshInterval: "2m", + Hosts: "http://my-alertmanager", + }, + }, + } + + for _, tc := range tt { + tc := tc + t.Run(tc.desc, func(t *testing.T) { + t.Parallel() + + cfg := manifests.ConfigOptions(tc.opts) + err := manifests.ConfigureOptionsForMode(&cfg, tc.opts) + require.Nil(t, err) + require.Equal(t, tc.wantOptions, cfg.Ruler.AlertManager) + }) + } +} diff --git a/operator/internal/manifests/gateway_tenants.go b/operator/internal/manifests/gateway_tenants.go index c96860f46a971..6adcc11d5f670 100644 --- a/operator/internal/manifests/gateway_tenants.go +++ b/operator/internal/manifests/gateway_tenants.go @@ -5,6 +5,7 @@ import ( configv1 "github.com/grafana/loki/operator/apis/config/v1" lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" + "github.com/grafana/loki/operator/internal/manifests/internal/config" "github.com/grafana/loki/operator/internal/manifests/openshift" "github.com/imdario/mergo" monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" @@ -46,6 +47,7 @@ func ApplyGatewayDefaultOptions(opts *Options) error { gatewayHTTPPortName, ComponentLabels(LabelGatewayComponent, opts.Name), tenantData, + RulerName(opts.Name), ) if err := mergo.Merge(&opts.OpenShiftOptions, &defaults, mergo.WithOverride); err != nil { @@ -153,3 +155,18 @@ func configureGatewayServiceMonitorForMode(sm *monitoringv1.ServiceMonitor, mode return nil } + +// ConfigureOptionsForMode applies configuration depending on the mode type. 
+func ConfigureOptionsForMode(cfg *config.Options, opt Options) error { + switch opt.Stack.Tenants.Mode { + case lokiv1.Static, lokiv1.Dynamic: + return nil // nothing to configure + case lokiv1.OpenshiftLogging, lokiv1.OpenshiftNetwork: + if opt.OpenShiftOptions.BuildOpts.AlertManagerEnabled { + return openshift.ConfigureOptions(cfg) + } + return nil + } + + return nil +} diff --git a/operator/internal/manifests/gateway_tenants_test.go b/operator/internal/manifests/gateway_tenants_test.go index 19de9dd6a1af5..c50101f3bd6b9 100644 --- a/operator/internal/manifests/gateway_tenants_test.go +++ b/operator/internal/manifests/gateway_tenants_test.go @@ -125,6 +125,7 @@ func TestApplyGatewayDefaultsOptions(t *testing.T) { GatewayName: "lokistack-ocp-gateway", GatewaySvcName: "lokistack-ocp-gateway-http", GatewaySvcTargetPort: "public", + RulerName: "lokistack-ocp-ruler", Labels: ComponentLabels(LabelGatewayComponent, "lokistack-ocp"), }, Authentication: []openshift.AuthenticationSpec{ @@ -199,6 +200,7 @@ func TestApplyGatewayDefaultsOptions(t *testing.T) { GatewayName: "lokistack-ocp-gateway", GatewaySvcName: "lokistack-ocp-gateway-http", GatewaySvcTargetPort: "public", + RulerName: "lokistack-ocp-ruler", Labels: ComponentLabels(LabelGatewayComponent, "lokistack-ocp"), }, Authentication: []openshift.AuthenticationSpec{ diff --git a/operator/internal/manifests/internal/config/loki-config.yaml b/operator/internal/manifests/internal/config/loki-config.yaml index a288138d83280..03304054f160a 100644 --- a/operator/internal/manifests/internal/config/loki-config.yaml +++ b/operator/internal/manifests/internal/config/loki-config.yaml @@ -185,7 +185,9 @@ ruler: poll_interval: {{ . }} {{ end }} {{ with .Ruler.AlertManager }} - external_url: {{ .ExternalURL }} + {{ with .ExternalURL }} + external_url: {{ . }} + {{ end}} {{ with .ExternalLabels }} external_labels: {{ range $name, $value := . 
}} diff --git a/operator/internal/manifests/openshift/build.go b/operator/internal/manifests/openshift/build.go index e921a970f3a47..f203c621a1601 100644 --- a/operator/internal/manifests/openshift/build.go +++ b/operator/internal/manifests/openshift/build.go @@ -1,15 +1,17 @@ package openshift -import "sigs.k8s.io/controller-runtime/pkg/client" +import ( + "sigs.k8s.io/controller-runtime/pkg/client" +) // BuildGatewayObjects returns a list of auxiliary openshift/k8s objects // for lokistack gateway deployments on OpenShift. func BuildGatewayObjects(opts Options) []client.Object { return []client.Object{ BuildRoute(opts), - BuildServiceAccount(opts), - BuildClusterRole(opts), - BuildClusterRoleBinding(opts), + BuildGatewayServiceAccount(opts), + BuildGatewayClusterRole(opts), + BuildGatewayClusterRoleBinding(opts), BuildMonitoringRole(opts), BuildMonitoringRoleBinding(opts), } @@ -22,3 +24,13 @@ func BuildLokiStackObjects(opts Options) []client.Object { BuildServiceCAConfigMap(opts), } } + +// BuildRulerObjects returns a list of auxiliary openshift/k8s objects +// for lokistack ruler deployments on OpenShift. 
+func BuildRulerObjects(opts Options) []client.Object { + return []client.Object{ + BuildRulerServiceAccount(opts), + BuildRulerClusterRole(opts), + BuildRulerClusterRoleBinding(opts), + } +} diff --git a/operator/internal/manifests/openshift/build_test.go b/operator/internal/manifests/openshift/build_test.go index 04a95272da470..1180383d91078 100644 --- a/operator/internal/manifests/openshift/build_test.go +++ b/operator/internal/manifests/openshift/build_test.go @@ -12,8 +12,8 @@ import ( rbacv1 "k8s.io/api/rbac/v1" ) -func TestBuild_ServiceAccountRefMatches(t *testing.T) { - opts := NewOptions(lokiv1.OpenshiftLogging, "abc", "ns", "abc", "example.com", "abc", "abc", map[string]string{}, map[string]TenantData{}) +func TestBuildGatewayObjects_ServiceAccountRefMatches(t *testing.T) { + opts := NewOptions(lokiv1.OpenshiftLogging, "abc", "ns", "abc", "example.com", "abc", "abc", map[string]string{}, map[string]TenantData{}, "abc") objs := BuildGatewayObjects(opts) sa := objs[1].(*corev1.ServiceAccount) @@ -24,8 +24,8 @@ func TestBuild_ServiceAccountRefMatches(t *testing.T) { require.Equal(t, sa.Namespace, rb.Subjects[0].Namespace) } -func TestBuild_ClusterRoleRefMatches(t *testing.T) { - opts := NewOptions(lokiv1.OpenshiftLogging, "abc", "ns", "abc", "example.com", "abc", "abc", map[string]string{}, map[string]TenantData{}) +func TestBuildGatewayObjects_ClusterRoleRefMatches(t *testing.T) { + opts := NewOptions(lokiv1.OpenshiftLogging, "abc", "ns", "abc", "example.com", "abc", "abc", map[string]string{}, map[string]TenantData{}, "abc") objs := BuildGatewayObjects(opts) cr := objs[2].(*rbacv1.ClusterRole) @@ -35,8 +35,8 @@ func TestBuild_ClusterRoleRefMatches(t *testing.T) { require.Equal(t, cr.Name, rb.RoleRef.Name) } -func TestBuild_MonitoringClusterRoleRefMatches(t *testing.T) { - opts := NewOptions(lokiv1.OpenshiftLogging, "abc", "ns", "abc", "example.com", "abc", "abc", map[string]string{}, map[string]TenantData{}) +func 
TestBuildGatewayObjects_MonitoringClusterRoleRefMatches(t *testing.T) { + opts := NewOptions(lokiv1.OpenshiftLogging, "abc", "ns", "abc", "example.com", "abc", "abc", map[string]string{}, map[string]TenantData{}, "abc") objs := BuildGatewayObjects(opts) cr := objs[4].(*rbacv1.Role) @@ -46,8 +46,8 @@ func TestBuild_MonitoringClusterRoleRefMatches(t *testing.T) { require.Equal(t, cr.Name, rb.RoleRef.Name) } -func TestBuild_ServiceAccountAnnotationsRouteRefMatches(t *testing.T) { - opts := NewOptions(lokiv1.OpenshiftLogging, "abc", "ns", "abc", "example.com", "abc", "abc", map[string]string{}, map[string]TenantData{}) +func TestBuildGatewayObjects_ServiceAccountAnnotationsRouteRefMatches(t *testing.T) { + opts := NewOptions(lokiv1.OpenshiftLogging, "abc", "ns", "abc", "example.com", "abc", "abc", map[string]string{}, map[string]TenantData{}, "abc") objs := BuildGatewayObjects(opts) rt := objs[0].(*routev1.Route) @@ -71,3 +71,18 @@ func TestBuild_ServiceAccountAnnotationsRouteRefMatches(t *testing.T) { require.Equal(t, rt.Kind, oauthRef.Ref.Kind) } } + +func TestBuildRulerObjects(t *testing.T) { + opts := NewOptions(lokiv1.OpenshiftLogging, "abc", "ns", "abc", "example.com", "abc", "abc", map[string]string{}, map[string]TenantData{}, "abc") + + objs := BuildRulerObjects(opts) + sa := objs[0].(*corev1.ServiceAccount) + cr := objs[1].(*rbacv1.ClusterRole) + rb := objs[2].(*rbacv1.ClusterRoleBinding) + + require.Equal(t, sa.Kind, rb.Subjects[0].Kind) + require.Equal(t, sa.Name, rb.Subjects[0].Name) + require.Equal(t, sa.Namespace, rb.Subjects[0].Namespace) + require.Equal(t, cr.Kind, rb.RoleRef.Kind) + require.Equal(t, cr.Name, rb.RoleRef.Name) +} diff --git a/operator/internal/manifests/openshift/configure.go b/operator/internal/manifests/openshift/configure.go index d364db1a0e839..0b03674c8eab1 100644 --- a/operator/internal/manifests/openshift/configure.go +++ b/operator/internal/manifests/openshift/configure.go @@ -10,6 +10,7 @@ import ( "github.com/imdario/mergo" 
lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" + "github.com/grafana/loki/operator/internal/manifests/internal/config" monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" @@ -279,3 +280,61 @@ func ConfigureQueryFrontendDeployment( return nil } + +// ConfigureRulerStatefulSet configures the ruler to use the cluster monitoring alertmanager. +func ConfigureRulerStatefulSet( + ss *appsv1.StatefulSet, + token, caBundleVolumeName, caDir, caFile string, + monitorServerName, rulerContainerName string, +) error { + var rulerIndex int + for i, c := range ss.Spec.Template.Spec.Containers { + if c.Name == rulerContainerName { + rulerIndex = i + break + } + } + + rulerContainer := ss.Spec.Template.Spec.Containers[rulerIndex].DeepCopy() + + rulerContainer.Args = append(rulerContainer.Args, + fmt.Sprintf("-ruler.alertmanager-client.tls-ca-path=%s/%s", caDir, caFile), + fmt.Sprintf("-ruler.alertmanager-client.tls-server-name=%s", monitorServerName), + fmt.Sprintf("-ruler.alertmanager-client.credentials-file=%s", token), + ) + + p := corev1.PodSpec{ + ServiceAccountName: ss.GetName(), + Containers: []corev1.Container{ + *rulerContainer, + }, + } + + if err := mergo.Merge(&ss.Spec.Template.Spec, p, mergo.WithOverride); err != nil { + return kverrors.Wrap(err, "failed to merge ruler container spec ") + } + + return nil +} + +// ConfigureOptions applies default configuration for the use of the cluster monitoring alertmanager. 
+func ConfigureOptions(configOpt *config.Options) error { + if configOpt.Ruler.AlertManager == nil { + configOpt.Ruler.AlertManager = &config.AlertManagerConfig{} + } + + if len(configOpt.Ruler.AlertManager.Hosts) == 0 { + amc := &config.AlertManagerConfig{ + Hosts: "https://_web._tcp.alertmanager-operated.openshift-monitoring.svc", + EnableV2: true, + EnableDiscovery: true, + RefreshInterval: "1m", + } + + if err := mergo.Merge(configOpt.Ruler.AlertManager, amc); err != nil { + return kverrors.Wrap(err, "failed merging AlertManager config") + } + } + + return nil +} diff --git a/operator/internal/manifests/openshift/options.go b/operator/internal/manifests/openshift/options.go index 9f9d895e98e01..8f6ca2abb1d9e 100644 --- a/operator/internal/manifests/openshift/options.go +++ b/operator/internal/manifests/openshift/options.go @@ -41,7 +41,9 @@ type BuildOptions struct { GatewayName string GatewaySvcName string GatewaySvcTargetPort string + RulerName string Labels map[string]string + AlertManagerEnabled bool } // TenantData defines the existing cookieSecret for lokistack reconcile. 
@@ -56,6 +58,7 @@ func NewOptions( gwName, gwBaseDomain, gwSvcName, gwPortName string, gwLabels map[string]string, tenantConfigMap map[string]TenantData, + rulerName string, ) Options { host := ingressHost(stackName, stackNamespace, gwBaseDomain) @@ -85,6 +88,7 @@ func NewOptions( GatewaySvcName: gwSvcName, GatewaySvcTargetPort: gwPortName, Labels: gwLabels, + RulerName: rulerName, }, Authentication: authn, Authorization: AuthorizationSpec{ diff --git a/operator/internal/manifests/openshift/rbac.go b/operator/internal/manifests/openshift/rbac.go index 813f2e87399b5..43dc3f6e72fd0 100644 --- a/operator/internal/manifests/openshift/rbac.go +++ b/operator/internal/manifests/openshift/rbac.go @@ -5,19 +5,19 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -// BuildClusterRole returns a k8s ClusterRole object for the +// BuildGatewayClusterRole returns a k8s ClusterRole object for the // lokistack gateway serviceaccount to allow creating: // - TokenReviews to authenticate the user by bearer token. // - SubjectAccessReview to authorize the user by bearer token. // if having access to read/create logs. 
-func BuildClusterRole(opts Options) *rbacv1.ClusterRole { +func BuildGatewayClusterRole(opts Options) *rbacv1.ClusterRole { return &rbacv1.ClusterRole{ TypeMeta: metav1.TypeMeta{ Kind: "ClusterRole", APIVersion: rbacv1.SchemeGroupVersion.String(), }, ObjectMeta: metav1.ObjectMeta{ - Name: authorizerRbacName(opts), + Name: authorizerRbacName(opts.BuildOpts.GatewayName), Labels: opts.BuildOpts.Labels, }, Rules: []rbacv1.PolicyRule{ @@ -47,29 +47,84 @@ func BuildClusterRole(opts Options) *rbacv1.ClusterRole { } } -// BuildClusterRoleBinding returns a k8s ClusterRoleBinding object for +// BuildGatewayClusterRoleBinding returns a k8s ClusterRoleBinding object for // the lokistack gateway serviceaccount to grant access to: // - rbac.authentication.k8s.io/TokenReviews // - rbac.authorization.k8s.io/SubjectAccessReviews -func BuildClusterRoleBinding(opts Options) *rbacv1.ClusterRoleBinding { +func BuildGatewayClusterRoleBinding(opts Options) *rbacv1.ClusterRoleBinding { return &rbacv1.ClusterRoleBinding{ TypeMeta: metav1.TypeMeta{ Kind: "ClusterRoleBinding", APIVersion: rbacv1.SchemeGroupVersion.String(), }, ObjectMeta: metav1.ObjectMeta{ - Name: authorizerRbacName(opts), + Name: authorizerRbacName(opts.BuildOpts.GatewayName), Labels: opts.BuildOpts.Labels, }, RoleRef: rbacv1.RoleRef{ APIGroup: "rbac.authorization.k8s.io", Kind: "ClusterRole", - Name: authorizerRbacName(opts), + Name: authorizerRbacName(opts.BuildOpts.GatewayName), }, Subjects: []rbacv1.Subject{ { Kind: rbacv1.ServiceAccountKind, - Name: serviceAccountName(opts), + Name: gatewayServiceAccountName(opts), + Namespace: opts.BuildOpts.LokiStackNamespace, + }, + }, + } +} + +// BuildRulerClusterRole returns a k8s ClusterRole object for the +// lokistack ruler serviceaccount to allow patching sending alerts to alertmanagers. 
+func BuildRulerClusterRole(opts Options) *rbacv1.ClusterRole { + return &rbacv1.ClusterRole{ + TypeMeta: metav1.TypeMeta{ + Kind: "ClusterRole", + APIVersion: rbacv1.SchemeGroupVersion.String(), + }, + ObjectMeta: metav1.ObjectMeta{ + Name: authorizerRbacName(opts.BuildOpts.RulerName), + Labels: opts.BuildOpts.Labels, + }, + Rules: []rbacv1.PolicyRule{ + { + APIGroups: []string{ + "monitoring.coreos.com", + }, + Resources: []string{ + "alertmanagers", + }, + Verbs: []string{ + "patch", + }, + }, + }, + } +} + +// BuildRulerClusterRoleBinding returns a k8s ClusterRoleBinding object for +// the lokistack ruler serviceaccount to grant access to alertmanagers. +func BuildRulerClusterRoleBinding(opts Options) *rbacv1.ClusterRoleBinding { + return &rbacv1.ClusterRoleBinding{ + TypeMeta: metav1.TypeMeta{ + Kind: "ClusterRoleBinding", + APIVersion: rbacv1.SchemeGroupVersion.String(), + }, + ObjectMeta: metav1.ObjectMeta{ + Name: authorizerRbacName(opts.BuildOpts.RulerName), + Labels: opts.BuildOpts.Labels, + }, + RoleRef: rbacv1.RoleRef{ + APIGroup: "rbac.authorization.k8s.io", + Kind: "ClusterRole", + Name: authorizerRbacName(opts.BuildOpts.RulerName), + }, + Subjects: []rbacv1.Subject{ + { + Kind: rbacv1.ServiceAccountKind, + Name: rulerServiceAccountName(opts), Namespace: opts.BuildOpts.LokiStackNamespace, }, }, diff --git a/operator/internal/manifests/openshift/serviceaccount.go b/operator/internal/manifests/openshift/serviceaccount.go index a7d4f849e43d0..82b3805a0ab2f 100644 --- a/operator/internal/manifests/openshift/serviceaccount.go +++ b/operator/internal/manifests/openshift/serviceaccount.go @@ -7,10 +7,10 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) -// BuildServiceAccount returns a k8s object for the LokiStack Gateway +// BuildGatewayServiceAccount returns a k8s object for the LokiStack Gateway // serviceaccount. This ServiceAccount is used in parallel as an // OpenShift OAuth Client. 
-func BuildServiceAccount(opts Options) client.Object { +func BuildGatewayServiceAccount(opts Options) client.Object { return &corev1.ServiceAccount{ TypeMeta: metav1.TypeMeta{ Kind: "ServiceAccount", @@ -19,7 +19,26 @@ func BuildServiceAccount(opts Options) client.Object { ObjectMeta: metav1.ObjectMeta{ Annotations: serviceAccountAnnotations(opts), Labels: opts.BuildOpts.Labels, - Name: serviceAccountName(opts), + Name: gatewayServiceAccountName(opts), + Namespace: opts.BuildOpts.LokiStackNamespace, + }, + AutomountServiceAccountToken: pointer.Bool(true), + } +} + +// BuildRulerServiceAccount returns a k8s object for the LokiStack Ruler +// serviceaccount. +// This ServiceAccount is used to autheticate and access the alertmanager host. +func BuildRulerServiceAccount(opts Options) client.Object { + return &corev1.ServiceAccount{ + TypeMeta: metav1.TypeMeta{ + Kind: "ServiceAccount", + APIVersion: corev1.SchemeGroupVersion.String(), + }, + ObjectMeta: metav1.ObjectMeta{ + Annotations: serviceAccountAnnotations(opts), + Labels: opts.BuildOpts.Labels, + Name: rulerServiceAccountName(opts), Namespace: opts.BuildOpts.LokiStackNamespace, }, AutomountServiceAccountToken: pointer.Bool(true), diff --git a/operator/internal/manifests/openshift/serviceaccount_test.go b/operator/internal/manifests/openshift/serviceaccount_test.go index b2df933f467ff..bb857b4b7798b 100644 --- a/operator/internal/manifests/openshift/serviceaccount_test.go +++ b/operator/internal/manifests/openshift/serviceaccount_test.go @@ -10,9 +10,9 @@ import ( ) func TestBuildServiceAccount_AnnotationsMatchLoggingTenants(t *testing.T) { - opts := NewOptions(lokiv1.OpenshiftLogging, "abc", "ns", "abc", "example.com", "abc", "abc", map[string]string{}, map[string]TenantData{}) + opts := NewOptions(lokiv1.OpenshiftLogging, "abc", "ns", "abc", "example.com", "abc", "abc", map[string]string{}, map[string]TenantData{}, "abc") - sa := BuildServiceAccount(opts) + sa := BuildGatewayServiceAccount(opts) require.Len(t, 
sa.GetAnnotations(), len(loggingTenants)) var keys []string @@ -27,9 +27,9 @@ func TestBuildServiceAccount_AnnotationsMatchLoggingTenants(t *testing.T) { } func TestBuildServiceAccount_AnnotationsMatchNetworkTenants(t *testing.T) { - opts := NewOptions(lokiv1.OpenshiftNetwork, "def", "ns2", "def", "example2.com", "def", "def", map[string]string{}, map[string]TenantData{}) + opts := NewOptions(lokiv1.OpenshiftNetwork, "def", "ns2", "def", "example2.com", "def", "def", map[string]string{}, map[string]TenantData{}, "abc") - sa := BuildServiceAccount(opts) + sa := BuildGatewayServiceAccount(opts) require.Len(t, sa.GetAnnotations(), len(networkTenants)) var keys []string diff --git a/operator/internal/manifests/openshift/var.go b/operator/internal/manifests/openshift/var.go index 6b2d5b01b1dc5..7d872e564ae90 100644 --- a/operator/internal/manifests/openshift/var.go +++ b/operator/internal/manifests/openshift/var.go @@ -31,10 +31,17 @@ var ( // cert-signing service to inject the service CA into the annotated // configmap. InjectCABundleKey = "service.beta.openshift.io/inject-cabundle" + + // MonitoringNS is the namespace containing cluster monitoring objects such as alertmanager. + MonitoringNS = "openshift-monitoring" + // MonitoringSVCMain is the name of the alertmanager main service used for alerts. + MonitoringSVCMain = "alertmanager-main" + // MonitoringSVCOperated is the name of the alertmanager operated service used for alerts.
+ MonitoringSVCOperated = "alertmanager-operated" ) -func authorizerRbacName(opts Options) string { - return fmt.Sprintf("%s-authorizer", opts.BuildOpts.GatewayName) +func authorizerRbacName(componentName string) string { + return fmt.Sprintf("%s-authorizer", componentName) } func monitoringRbacName(stackName string) string { @@ -49,10 +56,14 @@ func routeName(opts Options) string { return opts.BuildOpts.LokiStackName } -func serviceAccountName(opts Options) string { +func gatewayServiceAccountName(opts Options) string { return opts.BuildOpts.GatewayName } +func rulerServiceAccountName(opts Options) string { + return opts.BuildOpts.RulerName +} + func serviceCABundleName(opts Options) string { return fmt.Sprintf("%s-ca-bundle", opts.BuildOpts.LokiStackName) } diff --git a/operator/internal/manifests/ruler.go b/operator/internal/manifests/ruler.go index d118110305fae..bdf7be06df140 100644 --- a/operator/internal/manifests/ruler.go +++ b/operator/internal/manifests/ruler.go @@ -5,7 +5,9 @@ import ( "path" "github.com/ViaQ/logerr/v2/kverrors" + lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" "github.com/grafana/loki/operator/internal/manifests/internal/config" + "github.com/grafana/loki/operator/internal/manifests/openshift" "github.com/imdario/mergo" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" @@ -32,11 +34,21 @@ func BuildRuler(opts Options) ([]client.Object, error) { } } - return []client.Object{ + objs := []client.Object{} + + if opts.Stack.Tenants != nil { + if err := configureRulerStatefulSetForMode(statefulSet, opts.Stack.Tenants.Mode, opts.Name); err != nil { + return nil, err + } + + objs = configureRulerObjsForMode(opts) + } + + return append(objs, statefulSet, NewRulerGRPCService(opts), NewRulerHTTPService(opts), - }, nil + ), nil } // NewRulerStatefulSet creates a statefulset object for a ruler @@ -71,7 +83,7 @@ func NewRulerStatefulSet(opts Options) *appsv1.StatefulSet { Containers: []corev1.Container{ { Image: opts.Image, - Name: 
"loki-ruler", + Name: rulerContainerName, Resources: corev1.ResourceRequirements{ Limits: opts.ResourceRequirements.Ruler.Limits, Requests: opts.ResourceRequirements.Ruler.Requests, @@ -210,6 +222,43 @@ func NewRulerStatefulSet(opts Options) *appsv1.StatefulSet { } } +func configureRulerStatefulSetForMode( + ss *appsv1.StatefulSet, mode lokiv1.ModeType, + stackName string, +) error { + switch mode { + case lokiv1.Static, lokiv1.Dynamic: + return nil // nothing to configure + case lokiv1.OpenshiftLogging, lokiv1.OpenshiftNetwork: + caBundleName := signingCABundleName(stackName) + monitorServerName := fqdn(openshift.MonitoringSVCMain, openshift.MonitoringNS) + return openshift.ConfigureRulerStatefulSet( + ss, + BearerTokenFile, + caBundleName, + caBundleDir, + caFile, + monitorServerName, + rulerContainerName, + ) + } + + return nil +} + +func configureRulerObjsForMode(opts Options) []client.Object { + openShiftObjs := []client.Object{} + + switch opts.Stack.Tenants.Mode { + case lokiv1.Static, lokiv1.Dynamic: + // nothing to configure + case lokiv1.OpenshiftLogging, lokiv1.OpenshiftNetwork: + openShiftObjs = openshift.BuildRulerObjects(opts.OpenShiftOptions) + } + + return openShiftObjs +} + // NewRulerGRPCService creates a k8s service for the ruler GRPC endpoint func NewRulerGRPCService(opts Options) *corev1.Service { serviceName := serviceNameRulerGRPC(opts.Name) diff --git a/operator/internal/manifests/var.go b/operator/internal/manifests/var.go index ed14da9af4ea9..d211f98ebfc6f 100644 --- a/operator/internal/manifests/var.go +++ b/operator/internal/manifests/var.go @@ -42,6 +42,8 @@ const ( dataDirectory = "/tmp/loki" rulesStorageDirectory = "/tmp/rules" + rulerContainerName = "loki-ruler" + // EnvRelatedImageLoki is the environment variable to fetch the Loki image pullspec. EnvRelatedImageLoki = "RELATED_IMAGE_LOKI" // EnvRelatedImageGateway is the environment variable to fetch the Gateway image pullspec.