diff --git a/Dockerfiles/Dockerfile b/Dockerfiles/Dockerfile index a4bf0cc4018..ac3797f304a 100644 --- a/Dockerfiles/Dockerfile +++ b/Dockerfiles/Dockerfile @@ -14,6 +14,11 @@ RUN if [ "${USE_LOCAL}" != "true" ]; then \ ./get_all_manifests.sh ${OVERWRITE_MANIFESTS}; \ fi +# Copy monitoring config +COPY config/monitoring/ /opt/manifests/monitoring +# Copy ods-configs +COPY config/osd-configs/ /opt/manifests/osd-configs + ################################################################################ FROM registry.access.redhat.com/ubi8/go-toolset:$GOLANG_VERSION as builder ARG CGO_ENABLED=1 diff --git a/config/osd-configs/dedicated-admins-mgmt-role.yaml b/config/osd-configs/dedicated-admins-mgmt-role.yaml new file mode 100644 index 00000000000..61c44a3ab26 --- /dev/null +++ b/config/osd-configs/dedicated-admins-mgmt-role.yaml @@ -0,0 +1,46 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: dedicated-admins-mgmt-role +rules: + - apiGroups: + - '' + verbs: + - create + - edit + - delete + - get + - list + - patch + - update + - watch + resources: + - configmaps + - secrets + - apiGroups: + - image.openshift.io + verbs: + - create + - edit + - delete + - get + - list + - patch + - update + - watch + resources: + - imagestreams + - apiGroups: + - build.openshift.io + verbs: + - create + - edit + - delete + - get + - list + - patch + - update + - watch + resources: + - builds + - buildconfigs \ No newline at end of file diff --git a/config/osd-configs/dedicated-admins-mgmt-rolebinding.yaml b/config/osd-configs/dedicated-admins-mgmt-rolebinding.yaml new file mode 100644 index 00000000000..bf6c16db462 --- /dev/null +++ b/config/osd-configs/dedicated-admins-mgmt-rolebinding.yaml @@ -0,0 +1,11 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: dedicated-admins-mgmt-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: dedicated-admins-mgmt-role +subjects: +- kind: Group + name: dedicated-admins \ No newline at end of file diff --git a/config/osd-configs/kustomization.yaml b/config/osd-configs/kustomization.yaml new file mode 100644 index 00000000000..13a6d7c927c --- /dev/null +++ b/config/osd-configs/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: +- dedicated-admins-mgmt-role.yaml +- dedicated-admins-mgmt-rolebinding.yaml \ No newline at end of file diff --git a/config/partners/anaconda/base/anaconda-ce-validator-cron.yaml b/config/partners/anaconda/base/anaconda-ce-validator-cron.yaml new file mode 100644 index 00000000000..3847ae3ce36 --- /dev/null +++ b/config/partners/anaconda/base/anaconda-ce-validator-cron.yaml @@ -0,0 +1,116 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + name: anaconda-ce-periodic-validator + namespace: redhat-ods-applications + labels: + opendatahub.io/modified: "false" +spec: + schedule: "0 0 * * *" + concurrencyPolicy: "Replace" + startingDeadlineSeconds: 200 + suspend: true + successfulJobsHistoryLimit: 3 + failedJobsHistoryLimit: 1 + jobTemplate: + spec: + template: + metadata: + labels: + parent: "anaconda-ce-periodic-validator" + spec: + serviceAccount: "rhods-dashboard" + imagePullSecrets: + - name: addon-managed-odh-pullsecret + containers: + - name: anaconda-ce-validator + image: registry.redhat.io/openshift4/ose-cli@sha256:75bf9b911b6481dcf29f7942240d1555adaa607eec7fc61bedb7f624f87c36d4 + command: + - /bin/sh + - -c + - > + #!/bin/sh + + IMAGESTREAM_NAME='s2i-minimal-notebook-anaconda' + 
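+          # Summary of the validation flow below: the key mounted from the
+          # anaconda-ce-access secret is used to probe the Anaconda repository;
+          # HTTP 200 applies the ImageStream and records validation_result=true
+          # (plus last_valid_time) in the result ConfigMap, HTTP 403 records
+          # validation_result=false, and any other code is treated as a possible
+          # upstream error and leaves the previous state untouched.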
CONFIGMAP_NAME='anaconda-ce-validation-result' + BUILDCONFIG_NAME='s2i-minimal-notebook-anaconda' + ANACONDA_VERSION='v0.2.2-anaconda' + + function generate_imagestream() { + echo '{"apiVersion":"image.openshift.io/v1","kind":"ImageStream","metadata":{"annotations":{"opendatahub.io/notebook-image-order":"10","opendatahub.io/notebook-image-desc":"Notebook with Anaconda CE tools instead of pip.","opendatahub.io/notebook-image-name":"Anaconda Commercial Edition","opendatahub.io/notebook-image-url":"https://github.com/red-hat-data-services/notebooks"},"labels":{"component.opendatahub.io/name":"jupyterhub","opendatahub.io/modified":"false","opendatahub.io/notebook-image":"true"},"name":"s2i-minimal-notebook-anaconda"},"spec":{"lookupPolicy":{"local":true},"tags":[{"name":"2023.1","annotations":{"opendatahub.io/default-image":"true","opendatahub.io/notebook-python-dependencies":"[{\"name\":\"JupyterLab\",\"version\": \"3.5\"}, {\"name\": \"Notebook\",\"version\": \"6.5\"}]","opendatahub.io/notebook-software":"[{\"name\":\"Python\",\"version\":\"v3.8\"}]","opendatahub.io/workbench-image-recommended":"true","openshift.io/imported-from":"quay.io/modh/odh-anaconda-notebook"},"from":{"kind":"DockerImage","name":"quay.io/modh/odh-anaconda-notebook@sha256:380c07bf79f5ec7d22441cde276c50b5eb2a459485cde05087837639a566ae3d"},"generation":2,"importPolicy":{"importMode":"Legacy"},"referencePolicy":{"type":"Local"}}]}}' + } + + function create_imagestream() { + generate_imagestream | oc apply -f- + } + + function delete_imagestream() { + generate_imagestream | oc delete -f- + } + + function get_variable() { + cat "/etc/secret-volume/${1}" + } + + function verify_configmap_exists() { + if ! oc get configmap "${CONFIGMAP_NAME}" &>/dev/null; then + echo "Result ConfigMap doesn't exist, creating" + oc create configmap "${CONFIGMAP_NAME}" --from-literal validation_result="false" + fi + } + + function write_configmap_value() { + oc patch configmap "${CONFIGMAP_NAME}" -p '"data": { "validation_result": "'${1}'" }' + } + + function write_last_valid_time() { + oc patch configmap "${CONFIGMAP_NAME}" -p '"data": { "last_valid_time": "'$(date -Is)'" }' + } + + function success() { + echo "Validation succeeded, enabling image" + create_imagestream + verify_configmap_exists + write_configmap_value true + write_last_valid_time + } + + function failure() { + echo "Validation failed, disabling image" + verify_configmap_exists + write_configmap_value false + } + + CURL_RESULT=$(curl -w 'RESP_CODE:%{response_code}' -IHEAD "https://repo.anaconda.cloud/repo/t/$(get_variable Anaconda_ce_key)/main/noarch/repodata.json" 2>/dev/null) + CURL_CODE=$(echo "${CURL_RESULT}" | grep -o 'RESP_CODE:[1-5][0-9][0-9]'| cut -d':' -f2) + + echo "Validation result: ${CURL_CODE}" + + if [ "${CURL_CODE}" == 200 ]; then + success + elif [ "${CURL_CODE}" == 403 ]; then + failure + else + echo "Return code ${CURL_CODE} from validation check, possibly upstream error. Exiting." 
+ echo "Result from curl:" + echo "${CURL_RESULT}" + fi + + exit 0 + + volumeMounts: + - name: secret-volume + mountPath: /etc/secret-volume + readOnly: true + resources: + limits: + cpu: 100m + memory: 256Mi + requests: + cpu: 100m + memory: 256Mi + volumes: + - name: secret-volume + secret: + secretName: anaconda-ce-access + restartPolicy: Never \ No newline at end of file diff --git a/config/partners/anaconda/base/kustomization.yaml b/config/partners/anaconda/base/kustomization.yaml new file mode 100644 index 00000000000..9cc666432ee --- /dev/null +++ b/config/partners/anaconda/base/kustomization.yaml @@ -0,0 +1,8 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: +- anaconda-ce-validator-cron.yaml + +commonLabels: + opendatahub.io/component: "true" + component.opendatahub.io/name: anaconda-ce \ No newline at end of file diff --git a/controllers/dscinitialization/dscinitialization_controller.go b/controllers/dscinitialization/dscinitialization_controller.go index dc99f32081b..12eca9cb9ae 100644 --- a/controllers/dscinitialization/dscinitialization_controller.go +++ b/controllers/dscinitialization/dscinitialization_controller.go @@ -29,6 +29,7 @@ import ( networkingv1 "k8s.io/api/networking/v1" rbacv1 "k8s.io/api/rbac/v1" k8serr "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" @@ -252,6 +253,9 @@ func (r *DSCInitializationReconciler) Reconcile(ctx context.Context, req ctrl.Re } if instance.Spec.Monitoring.ManagementState == operatorv1.Managed { log.Info("Monitoring enabled in initialization stage", "cluster", "Managed Service Mode") + if err := r.configureMonitoring(ctx, instance); err != nil { + return ctrl.Result{}, err + } err := r.configureManagedMonitoring(ctx, instance, "init") if err != nil { return reconcile.Result{}, err @@ -346,6 +350,8 @@ func (r *DSCInitializationReconciler) SetupWithManager(ctx context.Context, mgr Owns( &routev1.Route{}, builder.WithPredicates(predicate.Or(predicate.GenerationChangedPredicate{}, predicate.LabelChangedPredicate{}))). + Owns(&corev1.PersistentVolumeClaim{}, + builder.WithPredicates(predicate.Or(predicate.GenerationChangedPredicate{}, predicate.LabelChangedPredicate{}))). 
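+		// Owning PersistentVolumeClaims lets this reconciler also react to changes on
+		// the claims it creates (assumed here to be the managed monitoring/Prometheus
+		// storage), using the same generation/label-change predicates as the other
+		// owned resources.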
Watches( &dscv1.DataScienceCluster{}, handler.EnqueueRequestsFromMapFunc(func(ctx context.Context, a client.Object) []reconcile.Request { @@ -463,3 +469,42 @@ func (r *DSCInitializationReconciler) watchAuthResource(ctx context.Context, a c return nil } + +func (r *DSCInitializationReconciler) configureMonitoring(ctx context.Context, dsci *dsciv1.DSCInitialization) error { + // Create Monitoring CR singleton + defaultMonitoring := client.Object(&serviceApi.Monitoring{ + TypeMeta: metav1.TypeMeta{ + Kind: serviceApi.MonitoringKind, + APIVersion: serviceApi.GroupVersion.String(), + }, + ObjectMeta: metav1.ObjectMeta{ + Name: serviceApi.MonitoringInstanceName, + OwnerReferences: []metav1.OwnerReference{{ + APIVersion: dsciv1.GroupVersion.String(), + Kind: dsci.Kind, + Name: dsci.Name, + UID: dsci.UID, + }, + }, + }, + Spec: serviceApi.MonitoringSpec{ + MonitoringCommonSpec: serviceApi.MonitoringCommonSpec{ + Namespace: dsci.Spec.Monitoring.Namespace, + }, + }, + }, + ) + + if dsci.Spec.Monitoring.ManagementState == operatorv1.Managed { + err := r.Create(ctx, defaultMonitoring) + if err != nil && !k8serr.IsAlreadyExists(err) { + return err + } + } else { + err := r.Delete(ctx, defaultMonitoring) + if err != nil && !k8serr.IsNotFound(err) { + return err + } + } + return nil +} diff --git a/controllers/dscinitialization/monitoring.go b/controllers/dscinitialization/monitoring.go index 2d34ed0df31..f3f0df71636 100644 --- a/controllers/dscinitialization/monitoring.go +++ b/controllers/dscinitialization/monitoring.go @@ -57,6 +57,8 @@ func (r *DSCInitializationReconciler) configureManagedMonitoring(ctx context.Con "(.*)-(.*)odh-model-controller(.*).rules": "", "(.*)-(.*)ray(.*).rules": "", "(.*)-(.*)trustyai(.*).rules": "", + "(.*)-(.*)kueue(.*).rules": "", + "(.*)-(.*)trainingoperator(.*).rules": "", }) if err != nil { log.Error(err, "error to remove previous enabled component rules") diff --git a/controllers/services/monitoring/monitoring.go b/controllers/services/monitoring/monitoring.go new file mode 100644 index 00000000000..43901b44ffd --- /dev/null +++ b/controllers/services/monitoring/monitoring.go @@ -0,0 +1,125 @@ +package monitoring + +import ( + "context" + "os" + "path/filepath" + "strings" + + "gopkg.in/yaml.v2" + logf "sigs.k8s.io/controller-runtime/pkg/log" + + serviceApi "github.com/opendatahub-io/opendatahub-operator/v2/apis/services/v1alpha1" + odhdeploy "github.com/opendatahub-io/opendatahub-operator/v2/pkg/deploy" +) + +var ( + ComponentName = serviceApi.MonitoringServiceName + prometheusConfigPath = filepath.Join(odhdeploy.DefaultManifestPath, ComponentName, "prometheus", "apps", "prometheus-configs.yaml") +) + +// UpdatePrometheusConfig update prometheus-configs.yaml to include/exclude .rules +// parameter enable when set to true to add new rules, when set to false to remove existing rules. 
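+//
+// For illustration only (the surrounding entries are assumed, not read from the
+// manifest): enabling kueue turns
+//
+//	rule_files:
+//	- rhods-dashboard*.rules
+//
+// into
+//
+//	rule_files:
+//	- rhods-dashboard*.rules
+//	- kueue*.rules
+//
+// while disabling kueue removes that entry again; prometheus.yml is then marshalled
+// back into the ConfigMap on disk.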
+func UpdatePrometheusConfig(ctx context.Context, enable bool, component string) error { + l := logf.FromContext(ctx) + + // create a struct to mock poremtheus.yml + type ConfigMap struct { + APIVersion string `yaml:"apiVersion"` + Kind string `yaml:"kind"` + Metadata struct { + Name string `yaml:"name"` + Namespace string `yaml:"namespace"` + } `yaml:"metadata"` + Data struct { + PrometheusYML string `yaml:"prometheus.yml"` + OperatorRules string `yaml:"operator-recording.rules"` + DeadManSnitchRules string `yaml:"deadmanssnitch-alerting.rules"` + CFRRules string `yaml:"codeflare-recording.rules"` + CRARules string `yaml:"codeflare-alerting.rules"` + DashboardRRules string `yaml:"rhods-dashboard-recording.rules"` + DashboardARules string `yaml:"rhods-dashboard-alerting.rules"` + DSPRRules string `yaml:"data-science-pipelines-operator-recording.rules"` + DSPARules string `yaml:"data-science-pipelines-operator-alerting.rules"` + MMRRules string `yaml:"model-mesh-recording.rules"` + MMARules string `yaml:"model-mesh-alerting.rules"` + OdhModelRRules string `yaml:"odh-model-controller-recording.rules"` + OdhModelARules string `yaml:"odh-model-controller-alerting.rules"` + RayARules string `yaml:"ray-alerting.rules"` + WorkbenchesRRules string `yaml:"workbenches-recording.rules"` + WorkbenchesARules string `yaml:"workbenches-alerting.rules"` + KserveRRules string `yaml:"kserve-recording.rules"` + KserveARules string `yaml:"kserve-alerting.rules"` + TrustyAIRRules string `yaml:"trustyai-recording.rules"` + TrustyAIARules string `yaml:"trustyai-alerting.rules"` + KueueRRules string `yaml:"kueue-recording.rules"` + KueueARules string `yaml:"kueue-alerting.rules"` + TrainingOperatorRRules string `yaml:"trainingoperator-recording.rules"` + TrainingOperatorARules string `yaml:"trainingoperator-alerting.rules"` + ModelRegistryRRules string `yaml:"model-registry-operator-recording.rules"` + ModelRegistryARules string `yaml:"model-registry-operator-alerting.rules"` + } `yaml:"data"` + } + + var configMap ConfigMap + // prometheusContent will represent content of prometheus.yml due to its dynamic struct + var prometheusContent map[interface{}]interface{} + + // read prometheus.yml from local disk /opt/mainfests/monitoring/prometheus/apps/ + yamlData, err := os.ReadFile(prometheusConfigPath) + if err != nil { + return err + } + if err := yaml.Unmarshal(yamlData, &configMap); err != nil { + return err + } + + // get prometheus.yml part from configmap + if err := yaml.Unmarshal([]byte(configMap.Data.PrometheusYML), &prometheusContent); err != nil { + return err + } + + // to add component rules when it is not there yet + if enable { + // Check if the rule not yet exists in rule_files + if !strings.Contains(configMap.Data.PrometheusYML, component+"*.rules") { + // check if have rule_files + if ruleFiles, ok := prometheusContent["rule_files"]; ok { + if ruleList, isList := ruleFiles.([]interface{}); isList { + // add new component rules back to rule_files + ruleList = append(ruleList, component+"*.rules") + prometheusContent["rule_files"] = ruleList + } + } + } + } else { // to remove component rules if it is there + l.Info("Removing prometheus rule: " + component + "*.rules") + if ruleList, ok := prometheusContent["rule_files"].([]interface{}); ok { + for i, item := range ruleList { + if rule, isStr := item.(string); isStr && rule == component+"*.rules" { + ruleList = append(ruleList[:i], ruleList[i+1:]...) 
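+						// drop the matched entry by re-slicing around index i; mutating
+						// ruleList inside the loop is safe only because we break right after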
+ + break + } + } + prometheusContent["rule_files"] = ruleList + } + } + + // Marshal back + newDataYAML, err := yaml.Marshal(&prometheusContent) + if err != nil { + return err + } + configMap.Data.PrometheusYML = string(newDataYAML) + + newyamlData, err := yaml.Marshal(&configMap) + if err != nil { + return err + } + + // Write the modified content back to the file + err = os.WriteFile(prometheusConfigPath, newyamlData, 0) + + return err +} diff --git a/controllers/services/monitoring/monitoring_controller.go b/controllers/services/monitoring/monitoring_controller.go index 4188e6bdc83..ce6b4530f25 100644 --- a/controllers/services/monitoring/monitoring_controller.go +++ b/controllers/services/monitoring/monitoring_controller.go @@ -20,46 +20,19 @@ import ( "context" "fmt" - routev1 "github.com/openshift/api/route/v1" - monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" - appsv1 "k8s.io/api/apps/v1" - corev1 "k8s.io/api/core/v1" - rbacv1 "k8s.io/api/rbac/v1" - extv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" ctrl "sigs.k8s.io/controller-runtime" + dscv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/datasciencecluster/v1" serviceApi "github.com/opendatahub-io/opendatahub-operator/v2/apis/services/v1alpha1" "github.com/opendatahub-io/opendatahub-operator/v2/pkg/controller/actions/deploy" - "github.com/opendatahub-io/opendatahub-operator/v2/pkg/controller/actions/render/kustomize" - "github.com/opendatahub-io/opendatahub-operator/v2/pkg/controller/actions/updatestatus" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/controller/handlers" "github.com/opendatahub-io/opendatahub-operator/v2/pkg/controller/predicates/resources" "github.com/opendatahub-io/opendatahub-operator/v2/pkg/controller/reconciler" - "github.com/opendatahub-io/opendatahub-operator/v2/pkg/metadata/labels" ) -const serviceName = "monitoring" - // NewServiceReconciler creates a ServiceReconciler for the Monitoring API. func NewServiceReconciler(ctx context.Context, mgr ctrl.Manager) error { _, err := reconciler.ReconcilerFor(mgr, &serviceApi.Monitoring{}). - // operands - owned - Owns(&corev1.ConfigMap{}). - Owns(&corev1.Secret{}). - Owns(&rbacv1.ClusterRoleBinding{}). - Owns(&rbacv1.ClusterRole{}). - Owns(&rbacv1.Role{}). - Owns(&rbacv1.RoleBinding{}). - Owns(&corev1.ServiceAccount{}). - Owns(&corev1.Service{}). - Owns(&corev1.PersistentVolumeClaim{}). - Owns(&monitoringv1.ServiceMonitor{}). - Owns(&monitoringv1.PrometheusRule{}). - // By default, a predicated for changed generation is added by the Owns() - // method, however for deployments, we also need to retrieve status info - // hence we need a dedicated predicate to react to replicas status change - Owns(&appsv1.Deployment{}, reconciler.WithPredicates(resources.NewDeploymentPredicate())). - // operands - openshift - Owns(&routev1.Route{}). // operands - watched // // By default the Watches functions adds: @@ -69,29 +42,13 @@ func NewServiceReconciler(ctx context.Context, mgr ctrl.Manager) error { // for to objects that have the label components.platform.opendatahub.io/part-of // or services.platform.opendatahub.io/part-of set to the current owner // - Watches(&extv1.CustomResourceDefinition{}). + Watches(&dscv1.DataScienceCluster{}, reconciler.WithEventHandler(handlers.ToNamed(serviceApi.MonitoringInstanceName)), + reconciler.WithPredicates(resources.DSCSpecUpdatePredicate)). // actions WithAction(initialize). 
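+		// The managed-monitoring action pipeline is now: initialize (point the manifest
+		// path at the Prometheus configs), updatePrometheusConfigMap (sync rule_files
+		// with each component's Ready condition), deploy (apply the manifests) and
+		// updateStatus (mark the Monitoring CR Ready).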
- WithAction(kustomize.NewAction( - kustomize.WithCache(), - // Those are the default labels added by the legacy deploy method - // and should be preserved as the original plugin were affecting - // deployment selectors that are immutable once created, so it won't - // be possible to actually amend the labels in a non-disruptive - // manner. - // - // Additional labels/annotations MUST be added by the deploy action - // so they would affect only objects metadata without side effects - // kustomize.WithLabel(labels.ODH.Component(componentName), "true"), - kustomize.WithLabel(labels.K8SCommon.PartOf, serviceName), - )). + WithAction(updatePrometheusConfigMap). WithAction(deploy.NewAction( deploy.WithCache(), - deploy.WithFieldOwner(serviceApi.MonitoringInstanceName), - deploy.WithLabel(labels.PlatformPartOf, serviceApi.MonitoringServiceName), - )). - WithAction(updatestatus.NewAction( - updatestatus.WithSelectorLabel(labels.PlatformPartOf, serviceApi.MonitoringServiceName), )). WithAction(updateStatus). Build(ctx) diff --git a/controllers/services/monitoring/monitoring_controller_actions.go b/controllers/services/monitoring/monitoring_controller_actions.go index b6e6d58502d..25f87a18574 100644 --- a/controllers/services/monitoring/monitoring_controller_actions.go +++ b/controllers/services/monitoring/monitoring_controller_actions.go @@ -5,44 +5,109 @@ import ( "errors" "fmt" - routev1 "github.com/openshift/api/route/v1" + k8serr "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" "sigs.k8s.io/controller-runtime/pkg/client" + logf "sigs.k8s.io/controller-runtime/pkg/log" + componentApi "github.com/opendatahub-io/opendatahub-operator/v2/apis/components/v1alpha1" + dscv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/datasciencecluster/v1" serviceApi "github.com/opendatahub-io/opendatahub-operator/v2/apis/services/v1alpha1" + "github.com/opendatahub-io/opendatahub-operator/v2/controllers/status" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster" + cr "github.com/opendatahub-io/opendatahub-operator/v2/pkg/componentsregistry" odhtypes "github.com/opendatahub-io/opendatahub-operator/v2/pkg/controller/types" - "github.com/opendatahub-io/opendatahub-operator/v2/pkg/metadata/labels" - "github.com/opendatahub-io/opendatahub-operator/v2/pkg/resources" + odhdeploy "github.com/opendatahub-io/opendatahub-operator/v2/pkg/deploy" ) +// initialize handles all pre-deployment configurations. 
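+// On Managed RHOAI it points the manifest path at monitoring/prometheus/apps under
+// the default manifest root; on any other platform no manifests are registered, so
+// the subsequent deploy action has nothing to apply.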
func initialize(ctx context.Context, rr *odhtypes.ReconciliationRequest) error { + log := logf.FromContext(ctx) + // Only handle manifests setup and initial configurations + platform := rr.Release.Name + switch platform { + case cluster.ManagedRhoai: + // Only set prometheus configmap path + rr.Manifests = []odhtypes.ManifestInfo{ + { + Path: odhdeploy.DefaultManifestPath, + ContextDir: "monitoring/prometheus/apps", + }, + } + + default: + log.Info("Monitoring enabled, won't apply changes in this mode", "cluster", platform) + } + return nil } -func updateStatus(ctx context.Context, rr *odhtypes.ReconciliationRequest) error { - d, ok := rr.Instance.(*serviceApi.Monitoring) - if !ok { - return errors.New("instance is not of type *services.Monitoring") +func updatePrometheusConfigMap(ctx context.Context, rr *odhtypes.ReconciliationRequest) error { + // Map component names to their rule prefixes + dscList := &dscv1.DataScienceClusterList{} + if err := rr.Client.List(ctx, dscList); err != nil { + return fmt.Errorf("failed to list DSC: %w", err) + } + if len(dscList.Items) == 0 { + return nil + } + dsc := &dscList.Items[0] + componentRules := map[string]string{ + componentApi.DashboardComponentName: "rhods-dashboard", + componentApi.WorkbenchesComponentName: "workbenches", + componentApi.KueueComponentName: "kueue", + componentApi.CodeFlareComponentName: "codeflare", + componentApi.DataSciencePipelinesComponentName: "data-science-pipelines-operator", + componentApi.ModelMeshServingComponentName: "model-mesh", + componentApi.RayComponentName: "ray", + componentApi.TrustyAIComponentName: "trustyai", + componentApi.KserveComponentName: "kserve", + componentApi.TrainingOperatorComponentName: "trainingoperator", + componentApi.ModelRegistryComponentName: "model-registry-operator", + componentApi.ModelControllerComponentName: "odh-model-controller", } - // url - rl := routev1.RouteList{} - err := rr.Client.List( - ctx, - &rl, - client.InNamespace(rr.DSCI.Spec.Monitoring.Namespace), - client.MatchingLabels(map[string]string{ - labels.PlatformPartOf: serviceApi.MonitoringServiceName, - }), - ) + err := cr.ForEach(func(ch cr.ComponentHandler) error { + var enabled bool + ci := ch.NewCRObject(dsc) + // read the component instance to get tha actual status + err := rr.Client.Get(ctx, client.ObjectKeyFromObject(ci), ci) + switch { + case k8serr.IsNotFound(err): + enabled = false + case err != nil: + enabled = false + return fmt.Errorf("error getting component state component: %v enabled: %v %w ", ch.GetName(), enabled, err) + default: + enabled = meta.IsStatusConditionTrue(ci.GetStatus().Conditions, status.ConditionTypeReady) + } + // Check for shared components + if ch.GetName() == componentApi.KserveComponentName || ch.GetName() == componentApi.ModelMeshServingComponentName { + if err := UpdatePrometheusConfig(ctx, enabled, componentRules[componentApi.ModelControllerComponentName]); err != nil { + return err + } + } + + if err := UpdatePrometheusConfig(ctx, enabled, componentRules[ch.GetName()]); err != nil { + return err + } + return nil + }) if err != nil { - return fmt.Errorf("failed to list routes: %w", err) + return err } + return nil +} - d.Status.URL = "" - if len(rl.Items) == 1 { - d.Status.URL = resources.IngressHost(rl.Items[0]) +func updateStatus(ctx context.Context, rr *odhtypes.ReconciliationRequest) error { + m, ok := rr.Instance.(*serviceApi.Monitoring) + if !ok { + return errors.New("instance is not of type *services.Monitoring") } + m.Status.Phase = "Ready" + m.Status.ObservedGeneration = 
m.GetObjectMeta().GetGeneration() + return nil } diff --git a/main.go b/main.go index e9bd6cb6557..fa09f221964 100644 --- a/main.go +++ b/main.go @@ -70,6 +70,7 @@ import ( dscictrl "github.com/opendatahub-io/opendatahub-operator/v2/controllers/dscinitialization" "github.com/opendatahub-io/opendatahub-operator/v2/controllers/secretgenerator" "github.com/opendatahub-io/opendatahub-operator/v2/controllers/services/auth" + "github.com/opendatahub-io/opendatahub-operator/v2/controllers/services/monitoring" "github.com/opendatahub-io/opendatahub-operator/v2/controllers/setupcontroller" "github.com/opendatahub-io/opendatahub-operator/v2/controllers/webhook" "github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster" @@ -139,7 +140,7 @@ func initComponents(_ context.Context, p cluster.Platform) error { }) } -func main() { //nolint:funlen,maintidx +func main() { //nolint:funlen,maintidx,gocyclo var metricsAddr string var enableLeaderElection bool var probeAddr string @@ -353,6 +354,12 @@ func main() { //nolint:funlen,maintidx os.Exit(1) } + if platform == cluster.ManagedRhoai { + if err := monitoring.NewServiceReconciler(ctx, mgr); err != nil { + os.Exit(1) + } + } + // get old release version before we create default DSCI CR oldReleaseVersion, _ := upgrade.GetDeployedRelease(ctx, setupClient) diff --git a/pkg/cluster/gvk/gvk.go b/pkg/cluster/gvk/gvk.go index 0c3fd589682..240e4408e38 100644 --- a/pkg/cluster/gvk/gvk.go +++ b/pkg/cluster/gvk/gvk.go @@ -8,6 +8,7 @@ import ( "k8s.io/apimachinery/pkg/runtime/schema" componentApi "github.com/opendatahub-io/opendatahub-operator/v2/apis/components/v1alpha1" + serviceApi "github.com/opendatahub-io/opendatahub-operator/v2/apis/services/v1alpha1" ) var ( @@ -178,6 +179,12 @@ var ( Kind: componentApi.TrainingOperatorKind, } + Monitoring = schema.GroupVersionKind{ + Group: serviceApi.GroupVersion.Group, + Version: serviceApi.GroupVersion.Version, + Kind: serviceApi.MonitoringKind, + } + CustomResourceDefinition = schema.GroupVersionKind{ Group: "apiextensions.k8s.io", Version: "v1", diff --git a/pkg/controller/handlers/handlers.go b/pkg/controller/handlers/handlers.go index 5dad19f4c2d..95edff8c614 100644 --- a/pkg/controller/handlers/handlers.go +++ b/pkg/controller/handlers/handlers.go @@ -7,6 +7,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/reconcile" + + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster" ) func LabelToName(key string) handler.EventHandler { @@ -61,3 +63,22 @@ func ToNamed(name string) handler.EventHandler { }} }) } + +func ToAddonParamReq() handler.EventHandler { + return handler.EnqueueRequestsFromMapFunc(func(ctx context.Context, obj client.Object) []reconcile.Request { + operatorNs, err := cluster.GetOperatorNamespace() + if err != nil { + return nil + } + + if obj.GetName() == "addon-managed-odh-parameters" && obj.GetNamespace() == operatorNs { + return []reconcile.Request{{ + NamespacedName: types.NamespacedName{ + Name: "addon-managed-odh-parameters", + Namespace: operatorNs, + }, + }} + } + return nil + }) +} diff --git a/pkg/controller/predicates/component/component.go b/pkg/controller/predicates/component/component.go index d26badb5d38..7208c747b4a 100644 --- a/pkg/controller/predicates/component/component.go +++ b/pkg/controller/predicates/component/component.go @@ -34,3 +34,17 @@ func ForAnnotation(name string, value string) predicate.Funcs { }, } } + +func ForLabelAllEvents(name string, value string) predicate.Funcs { 
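+	// Unlike ForAnnotation above, this predicate also fires on create and delete
+	// events; on update it matches when either the old or the new object carries the
+	// label, so removing the label still triggers a reconcile.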
+ return predicate.Funcs{ + CreateFunc: func(e event.CreateEvent) bool { + return resources.HasLabel(e.Object, name, value) + }, + DeleteFunc: func(e event.DeleteEvent) bool { + return resources.HasLabel(e.Object, name, value) + }, + UpdateFunc: func(e event.UpdateEvent) bool { + return resources.HasLabel(e.ObjectNew, name, value) || resources.HasLabel(e.ObjectOld, name, value) + }, + } +} diff --git a/pkg/controller/predicates/resources/resources.go b/pkg/controller/predicates/resources/resources.go index 827eab1b7f4..8d725af1491 100644 --- a/pkg/controller/predicates/resources/resources.go +++ b/pkg/controller/predicates/resources/resources.go @@ -1,9 +1,14 @@ package resources import ( + "reflect" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/predicate" + + dscv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/datasciencecluster/v1" ) var _ predicate.Predicate = DeploymentPredicate{} @@ -53,3 +58,41 @@ func Deleted() predicate.Funcs { }, } } + +// Content predicates moved from original controller. +var CMContentChangedPredicate = predicate.Funcs{ + UpdateFunc: func(e event.UpdateEvent) bool { + oldCM, _ := e.ObjectOld.(*corev1.ConfigMap) + newCM, _ := e.ObjectNew.(*corev1.ConfigMap) + return !reflect.DeepEqual(oldCM.Data, newCM.Data) + }, +} + +var SecretContentChangedPredicate = predicate.Funcs{ + UpdateFunc: func(e event.UpdateEvent) bool { + oldSecret, _ := e.ObjectOld.(*corev1.Secret) + newSecret, _ := e.ObjectNew.(*corev1.Secret) + return !reflect.DeepEqual(oldSecret.Data, newSecret.Data) + }, +} + +var DSCDeletionPredicate = predicate.Funcs{ + DeleteFunc: func(e event.DeleteEvent) bool { + return true + }, +} + +var DSCSpecUpdatePredicate = predicate.Funcs{ + UpdateFunc: func(e event.UpdateEvent) bool { + oldDSC, ok := e.ObjectOld.(*dscv1.DataScienceCluster) + if !ok { + return false + } + newDSC, ok := e.ObjectNew.(*dscv1.DataScienceCluster) + if !ok { + return false + } + // Compare components state + return !reflect.DeepEqual(oldDSC.Spec.Components, newDSC.Spec.Components) + }, +} diff --git a/pkg/services/monitoring/prometheus.go b/pkg/services/monitoring/prometheus.go index d09d093b675..16eacb70094 100644 --- a/pkg/services/monitoring/prometheus.go +++ b/pkg/services/monitoring/prometheus.go @@ -7,7 +7,6 @@ import ( "strings" "gopkg.in/yaml.v2" - "sigs.k8s.io/controller-runtime/pkg/client" logf "sigs.k8s.io/controller-runtime/pkg/log" "github.com/opendatahub-io/opendatahub-operator/v2/pkg/deploy" @@ -19,7 +18,7 @@ var ( // UpdatePrometheusConfig update prometheus-configs.yaml to include/exclude .rules // parameter enable when set to true to add new rules, when set to false to remove existing rules. 
-func UpdatePrometheusConfig(ctx context.Context, _ client.Client, enable bool, component string) error { +func UpdatePrometheusConfig(ctx context.Context, enable bool, component string) error { l := logf.FromContext(ctx) // create a struct to mock poremtheus.yml diff --git a/tests/e2e/controller_test.go b/tests/e2e/controller_test.go index 41f58bfd80f..887847b995e 100644 --- a/tests/e2e/controller_test.go +++ b/tests/e2e/controller_test.go @@ -56,6 +56,11 @@ var ( componentApi.ModelMeshServingComponentName: modelMeshServingTestSuite, componentApi.ModelControllerComponentName: modelControllerTestSuite, } + + servicesTestSuites = map[string]TestFn{ + serviceApi.MonitoringServiceName: monitoringTestSuite, + serviceApi.AuthServiceName: authControllerTestSuite, + } ) type arrayFlags []string @@ -76,7 +81,7 @@ type testContextConfig struct { operatorControllerTest bool webhookTest bool components arrayFlags - authControllerTest bool + services arrayFlags } // Holds information specific to individual tests. @@ -186,9 +191,16 @@ func TestOdhOperator(t *testing.T) { } }) - if testOpts.authControllerTest { - t.Run("test auth controller", authControllerTestSuite) - } + t.Run("services", func(t *testing.T) { + for k, v := range servicesTestSuites { + if len(testOpts.services) != 0 && !slices.Contains(testOpts.services, k) { + t.Logf("Skipping tests for services %s", k) + continue + } + + t.Run(k, v) + } + }) // Run deletion if skipDeletion is not set if !testOpts.skipDeletion { @@ -212,16 +224,23 @@ func TestMain(m *testing.M) { componentNames := strings.Join(maps.Keys(componentsTestSuites), ", ") flag.Var(&testOpts.components, "test-component", "run tests for the specified component. valid components names are: "+componentNames) - flag.BoolVar(&testOpts.authControllerTest, "test-auth-controller", true, "run auth controller tests") + for _, n := range testOpts.components { + if _, ok := componentsTestSuites[n]; !ok { + fmt.Printf("test-component: unknown component %s, valid values are: %s", n, componentNames) + os.Exit(1) + } + } - flag.Parse() + serviceNames := strings.Join(maps.Keys(servicesTestSuites), ", ") + flag.Var(&testOpts.services, "test-service", "run tests for the specified service. 
valid service names are: "+serviceNames) for _, n := range testOpts.components { if _, ok := componentsTestSuites[n]; !ok { - fmt.Printf("test-component: unknown component %s, valid values are: %s", n, componentNames) + fmt.Printf("test-service: unknown service %s, valid values are: %s", n, serviceNames) os.Exit(1) } } + flag.Parse() os.Exit(m.Run()) } diff --git a/tests/e2e/monitoring_test.go b/tests/e2e/monitoring_test.go new file mode 100644 index 00000000000..7a76835e28d --- /dev/null +++ b/tests/e2e/monitoring_test.go @@ -0,0 +1,70 @@ +package e2e_test + +import ( + "fmt" + "testing" + + operatorv1 "github.com/openshift/api/operator/v1" + "github.com/stretchr/testify/require" + + serviceApi "github.com/opendatahub-io/opendatahub-operator/v2/apis/services/v1alpha1" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster" +) + +type MonitoringTestCtx struct { + *testContext + testMonitoringInstance serviceApi.Monitoring +} + +func monitoringTestSuite(t *testing.T) { + t.Helper() + + tc, err := NewTestContext() + require.NoError(t, err) + + monitoringServiceCtx := MonitoringTestCtx{ + testContext: tc, + } + + t.Run(tc.testDsc.Name, func(t *testing.T) { + t.Run("Auto creation of Monitoring CR", func(t *testing.T) { + err = monitoringServiceCtx.validateMonitoringCRCreation() + require.NoError(t, err, "error getting Auth CR") + }) + t.Run("Test Monitoring CR content", func(t *testing.T) { + err = monitoringServiceCtx.validateMonitoringCRDefaultContent() + require.NoError(t, err, "unexpected content in Auth CR") + }) + // TODO: Add Managed monitoring test suite + }) +} + +func (tc *MonitoringTestCtx) validateMonitoringCRCreation() error { + if tc.testDSCI.Spec.Monitoring.ManagementState == operatorv1.Removed { + return nil + } + monitoringList := &serviceApi.MonitoringList{} + if err := tc.testContext.customClient.List(tc.ctx, monitoringList); err != nil { + return fmt.Errorf("unable to find Monitoring CR instance: %w", err) + } + + switch { + case len(monitoringList.Items) == 1: + tc.testMonitoringInstance = monitoringList.Items[0] + return nil + case len(monitoringList.Items) > 1: + return fmt.Errorf("only one Monitoring CR expected, found %v", len(monitoringList.Items)) + default: + return nil + } +} + +func (tc *MonitoringTestCtx) validateMonitoringCRDefaultContent() error { + if tc.platform == cluster.ManagedRhoai { + if tc.testMonitoringInstance.Spec.MonitoringCommonSpec.Namespace != tc.testDSCI.Spec.Monitoring.Namespace { + return fmt.Errorf("unexpected monitoring namespace reference. 
Expected %v, got %v", tc.testDSCI.Spec.Monitoring.Namespace, + tc.testMonitoringInstance.Spec.MonitoringCommonSpec.Namespace) + } + } + return nil +} diff --git a/tests/e2e/odh_manager_test.go b/tests/e2e/odh_manager_test.go index 7a2d85a8cce..083c51cd713 100644 --- a/tests/e2e/odh_manager_test.go +++ b/tests/e2e/odh_manager_test.go @@ -104,4 +104,10 @@ func (tc *testContext) validateOwnedCRDs(t *testing.T) { require.NoErrorf(t, tc.validateCRD("modelcontrollers.components.platform.opendatahub.io"), "error in validating CRD : modelcontrollers.components.platform.opendatahub.io") }) + + t.Run("Validate Monitoring CRD", func(t *testing.T) { + t.Parallel() + require.NoErrorf(t, tc.validateCRD("monitorings.services.platform.opendatahub.io"), + "error in validating CRD : monitorings.services.platform.opendatahub.io") + }) } diff --git a/tests/prometheus_unit_tests/codeflare-alerting.rules.yaml b/tests/prometheus_unit_tests/codeflare-alerting.rules.yaml new file mode 100644 index 00000000000..50655d5db6b --- /dev/null +++ b/tests/prometheus_unit_tests/codeflare-alerting.rules.yaml @@ -0,0 +1,65 @@ +groups: + - name: SLOs-probe_success_codeflare + rules: + - alert: CodeFlare Operator Probe Success 5m and 1h Burn Rate high + annotations: + message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' + triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-availability.md' + summary: CodeFlare Operator Probe Success 5m and 1h Burn Rate high + expr: | + sum(probe_success:burnrate5m{instance=~"codeflare-operator"}) by (instance) > (14.40 * (1-0.99950)) + and + sum(probe_success:burnrate1h{instance=~"codeflare-operator"}) by (instance) > (14.40 * (1-0.99950)) + for: 2m + labels: + severity: info + namespace: redhat-ods-applications + - alert: CodeFlare Operator Probe Success 30m and 6h Burn Rate high + annotations: + message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' + triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-probe-success-burn-rate.md' + summary: CodeFlare Operator Probe Success 30m and 6h Burn Rate high + expr: | + sum(probe_success:burnrate30m{instance=~"codeflare-operator"}) by (instance) > (6.00 * (1-0.99950)) + and + sum(probe_success:burnrate6h{instance=~"codeflare-operator"}) by (instance) > (6.00 * (1-0.99950)) + for: 15m + labels: + severity: info + namespace: redhat-ods-applications + - alert: CodeFlare Operator Probe Success 2h and 1d Burn Rate high + annotations: + message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' 
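+        # Note: the 14.40 / 6.00 / 3.00 factors paired with the 5m-1h, 30m-6h and
+        # 2h-1d window pairs follow the standard multiwindow, multi-burn-rate alerting
+        # scheme, and (1-0.99950) is the error budget for a 99.95% availability target.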
+ triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-probe-success-burn-rate.md' + summary: CodeFlare Operator Probe Success 2h and 1d Burn Rate high + expr: | + sum(probe_success:burnrate2h{instance=~"codeflare-operator"}) by (instance) > (3.00 * (1-0.99950)) + and + sum(probe_success:burnrate1d{instance=~"codeflare-operator"}) by (instance) > (3.00 * (1-0.99950)) + for: 1h + labels: + severity: info + namespace: redhat-ods-applications + + - name: Distributed Workloads CodeFlare + interval: 1m + rules: + - alert: CodeFlare Operator is not running + expr: absent(up{job=~'CodeFlare Operator'}) or up{job=~'CodeFlare Operator'} != 1 + labels: + severity: info + namespace: redhat-ods-applications + annotations: + description: This alert fires when the CodeFlare Operator is not running. + triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-availability.md' + summary: Alerting for CodeFlare Operator + - alert: CodeFlare Operator taking too long to be up + expr: absent_over_time(up{job="CodeFlare Operator"}[2m]) == 1 + labels: + severity: info + namespace: redhat-ods-applications + annotations: + description: This alert fires when the CodeFlare Operator takes over 2 min. to come back online. Either CodeFlare Operator is not running and failing to become ready, is misconfigured, or the metrics endpoint is not responding. + triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-absent-over-time.md' + summary: Alerting for CodeFlare Operator + diff --git a/tests/prometheus_unit_tests/data-science-pipelines-operator-alerting.rules.yaml b/tests/prometheus_unit_tests/data-science-pipelines-operator-alerting.rules.yaml new file mode 100644 index 00000000000..ba54f8b362a --- /dev/null +++ b/tests/prometheus_unit_tests/data-science-pipelines-operator-alerting.rules.yaml @@ -0,0 +1,139 @@ +groups: + - name: SLOs-haproxy_backend_http_responses_dsp + rules: + - alert: Data Science Pipelines Application Route Error 5m and 1h Burn Rate high + annotations: + message: 'High error budget burn for {{ $labels.route }} (current value: {{ $value }}).' + triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-error-burn-rate.md' + summary: Data Science Pipelines Application Route Error 5m and 1h Burn Rate high + expr: | + sum(haproxy_backend_http_responses_total:burnrate5m{component="dsp"}) by (exported_namespace) > (14.40 * (1-0.99950)) + and + sum(haproxy_backend_http_responses_total:burnrate1h{component="dsp"}) by (exported_namespace) > (14.40 * (1-0.99950)) + for: 2m + labels: + severity: info + namespace: redhat-ods-applications + - alert: Data Science Pipelines Application Route Error 30m and 6h Burn Rate high + annotations: + message: 'High error budget burn for {{ $labels.route }} (current value: {{ $value }}).' 
+ triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-error-burn-rate.md' + summary: Data Science Pipelines Application Route Error 30m and 6h Burn Rate high + expr: | + sum(haproxy_backend_http_responses_total:burnrate30m{component="dsp"}) by (exported_namespace) > (6.00 * (1-0.99950)) + and + sum(haproxy_backend_http_responses_total:burnrate6h{component="dsp"}) by (exported_namespace) > (6.00 * (1-0.99950)) + for: 15m + labels: + severity: info + namespace: redhat-ods-applications + - alert: Data Science Pipelines Application Route Error 2h and 1d Burn Rate high + annotations: + message: 'High error budget burn for {{ $labels.route }} (current value: {{ $value }}).' + triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-error-burn-rate.md' + summary: Data Science Pipelines Application Route Error 2h and 1d Burn Rate high + expr: | + sum(haproxy_backend_http_responses_total:burnrate2h{component="dsp"}) by (exported_namespace) > (3.00 * (1-0.99950)) + and + sum(haproxy_backend_http_responses_total:burnrate1d{component="dsp"}) by (exported_namespace) > (3.00 * (1-0.99950)) + for: 1h + labels: + severity: info + namespace: redhat-ods-applications + - alert: Data Science Pipelines Application Route Error 6h and 3d Burn Rate high + annotations: + message: 'High error budget burn for {{ $labels.route }} (current value: {{ $value }}).' + triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-error-burn-rate.md' + summary: Data Science Pipelines Application Route Error 6h and 3d Burn Rate high + expr: | + sum(haproxy_backend_http_responses_total:burnrate6h{component="dsp"}) by (exported_namespace) > (1.00 * (1-0.99950)) + and + sum(haproxy_backend_http_responses_total:burnrate3d{component="dsp"}) by (exported_namespace) > (1.00 * (1-0.99950)) + for: 3h + labels: + severity: info + namespace: redhat-ods-applications + - name: SLOs-probe_success_dsp + rules: + - alert: Data Science Pipelines Operator Probe Success 5m and 1h Burn Rate high + annotations: + message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' + triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-operator-probe-success-burn-rate.md" + summary: Data Science Pipelines Operator Probe Success 5m and 1h Burn Rate high + expr: | + sum(probe_success:burnrate5m{instance=~"data-science-pipelines-operator"}) by (instance) > (14.40 * (1-0.98000)) + and + sum(probe_success:burnrate1h{instance=~"data-science-pipelines-operator"}) by (instance) > (14.40 * (1-0.98000)) + for: 2m + labels: + severity: critical + namespace: redhat-ods-applications + - alert: Data Science Pipelines Operator Probe Success 30m and 6h Burn Rate high + annotations: + message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' 
+ triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-operator-probe-success-burn-rate.md" + summary: Data Science Pipelines Operator Probe Success 30m and 6h Burn Rate high + expr: | + sum(probe_success:burnrate30m{instance=~"data-science-pipelines-operator"}) by (instance) > (6.00 * (1-0.98000)) + and + sum(probe_success:burnrate6h{instance=~"data-science-pipelines-operator"}) by (instance) > (6.00 * (1-0.98000)) + for: 15m + labels: + severity: critical + namespace: redhat-ods-applications + - alert: Data Science Pipelines Operator Probe Success 2h and 1d Burn Rate high + annotations: + message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' + triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-operator-probe-success-burn-rate.md" + summary: Data Science Pipelines Operator Probe Success 2h and 1d Burn Rate high + expr: | + sum(probe_success:burnrate2h{instance=~"data-science-pipelines-operator"}) by (instance) > (3.00 * (1-0.98000)) + and + sum(probe_success:burnrate1d{instance=~"data-science-pipelines-operator"}) by (instance) > (3.00 * (1-0.98000)) + for: 1h + labels: + severity: warning + namespace: redhat-ods-applications + - name: RHODS Data Science Pipelines + rules: + - alert: Data Science Pipeline Application Unavailable + annotations: + message: 'Data Science Pipelines Application is down!' + triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-component-readiness-status.md' + summary: The Data Science Pipelines Application CustomResource "{{ $labels.dspa_name }}" in namespace "{{ $labels.dspa_namespace }}" has been NotReady for more than 5 minutes + expr: min(max_over_time(data_science_pipelines_application_ready[3m])) by (dspa_name, dspa_namespace) == 0 + for: 2m + labels: + severity: info + namespace: redhat-ods-applications + - alert: Data Science Pipeline APIServer Unavailable + annotations: + message: 'Data Science Pipelines APIServer component is down!' + triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-component-readiness-status.md' + summary: A Data Science Pipelines APIServer pod owned by DSPA "{{ $labels.dspa_name }}" in namespace "{{ $labels.dspa_namespace }}" has been NotReady for more than 5 minutes + expr: min(max_over_time(data_science_pipelines_application_apiserver_ready[3m])) by (dspa_name, dspa_namespace) == 0 + for: 2m + labels: + severity: info + namespace: redhat-ods-applications + - alert: Data Science Pipeline PersistenceAgent Unavailable + annotations: + message: 'Data Science Pipelines PersistenceAgent component is down!' 
+ triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-component-readiness-status.md' + summary: A Data Science Pipelines PersistenceAgent pod owned by DSPA "{{ $labels.dspa_name }}" in namespace "{{ $labels.dspa_namespace }}" has been NotReady for more than 5 minutes + expr: min(max_over_time(data_science_pipelines_application_persistenceagent_ready[3m])) by (dspa_name, dspa_namespace) == 0 + for: 2m + labels: + severity: info + namespace: redhat-ods-applications + - alert: Data Science Pipeline ScheduledWorkflows Unavailable + annotations: + message: 'Data Science Pipelines ScheduledWorkflows component is down!' + triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-component-readiness-status.md' + summary: A Data Science Pipelines ScheduledWorkflow controller pod owned by DSPA "{{ $labels.dspa_name }}" in namespace "{{ $labels.dspa_namespace }}" has been NotReady for more than 5 minutes + expr: min(max_over_time(data_science_pipelines_application_scheduledworkflow_ready[3m])) by (dspa_name, dspa_namespace) == 0 + for: 2m + labels: + severity: info + namespace: redhat-ods-applications + diff --git a/tests/prometheus_unit_tests/kserve-alerting.rules.yaml b/tests/prometheus_unit_tests/kserve-alerting.rules.yaml new file mode 100644 index 00000000000..59df8d61c0f --- /dev/null +++ b/tests/prometheus_unit_tests/kserve-alerting.rules.yaml @@ -0,0 +1,40 @@ +groups: + - name: SLOs-probe_success_kserve + rules: + - alert: Kserve Controller Probe Success 5m and 1h Burn Rate high + annotations: + message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' + triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-kserve-controller-probe-success-burn-rate.md" + summary: Kserve Controller Probe Success 5m and 1h Burn Rate high + expr: | + sum(probe_success:burnrate5m{instance=~"kserve-controller-manager"}) by (instance) > (14.40 * (1-0.98000)) + and + sum(probe_success:burnrate1h{instance=~"kserve-controller-manager"}) by (instance) > (14.40 * (1-0.98000)) + for: 2m + labels: + severity: critical + - alert: Kserve Controller Probe Success 30m and 6h Burn Rate high + annotations: + message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' + triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-kserve-controller-probe-success-burn-rate.md" + summary: Kserve Controller Probe Success 30m and 6h Burn Rate high + expr: | + sum(probe_success:burnrate30m{instance=~"kserve-controller-manager"}) by (instance) > (6.00 * (1-0.98000)) + and + sum(probe_success:burnrate6h{instance=~"kserve-controller-manager"}) by (instance) > (6.00 * (1-0.98000)) + for: 15m + labels: + severity: critical + - alert: Kserve Controller Probe Success 2h and 1d Burn Rate high + annotations: + message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' 
+ triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-kserve-controller-probe-success-burn-rate.md" + summary: Kserve Controller Probe Success 2h and 1d Burn Rate high + expr: | + sum(probe_success:burnrate2h{instance=~"kserve-controller-manager"}) by (instance) > (3.00 * (1-0.98000)) + and + sum(probe_success:burnrate1d{instance=~"kserve-controller-manager"}) by (instance) > (3.00 * (1-0.98000)) + for: 1h + labels: + severity: warning + diff --git a/tests/prometheus_unit_tests/kueue-alerting.rules.yaml b/tests/prometheus_unit_tests/kueue-alerting.rules.yaml new file mode 100644 index 00000000000..db383ee8532 --- /dev/null +++ b/tests/prometheus_unit_tests/kueue-alerting.rules.yaml @@ -0,0 +1,13 @@ +groups: +- name: Distributed Workloads Kueue + interval: 1m + rules: + - alert: Kueue Operator is not running + expr: absent(up{job=~'Kueue Operator'}) or up{job=~'Kueue Operator'} != 1 + labels: + severity: warning + annotations: + description: This alert fires when the Kueue Operator is not running. + triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/kueue-operator-availability.md' + summary: Alerting for Kueue Operator + diff --git a/tests/prometheus_unit_tests/model-mesh-alerting.rules.yaml b/tests/prometheus_unit_tests/model-mesh-alerting.rules.yaml new file mode 100644 index 00000000000..a905e14e579 --- /dev/null +++ b/tests/prometheus_unit_tests/model-mesh-alerting.rules.yaml @@ -0,0 +1,43 @@ +groups: + - name: SLOs-probe_success_modelmesh + rules: + - alert: Modelmesh Controller Probe Success 5m and 1h Burn Rate high + annotations: + message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' + triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-modelmesh-controller-probe-success-burn-rate.md" + summary: Modelmesh Controller Probe Success 5m and 1h Burn Rate high + expr: | + sum(probe_success:burnrate5m{instance=~"modelmesh-controller"}) by (instance) > (14.40 * (1-0.98000)) + and + sum(probe_success:burnrate1h{instance=~"modelmesh-controller"}) by (instance) > (14.40 * (1-0.98000)) + for: 2m + labels: + severity: critical + namespace: redhat-ods-applications + - alert: Modelmesh Controller Probe Success 30m and 6h Burn Rate high + annotations: + message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' + triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-modelmesh-controller-probe-success-burn-rate.md" + summary: Modelmesh Controller Probe Success 30m and 6h Burn Rate high + expr: | + sum(probe_success:burnrate30m{instance=~"modelmesh-controller"}) by (instance) > (6.00 * (1-0.98000)) + and + sum(probe_success:burnrate6h{instance=~"modelmesh-controller"}) by (instance) > (6.00 * (1-0.98000)) + for: 15m + labels: + severity: critical + namespace: redhat-ods-applications + - alert: Modelmesh Controller Probe Success 2h and 1d Burn Rate high + annotations: + message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' 
+ triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-modelmesh-controller-probe-success-burn-rate.md" + summary: Modelmesh Controller Probe Success 2h and 1d Burn Rate high + expr: | + sum(probe_success:burnrate2h{instance=~"modelmesh-controller"}) by (instance) > (3.00 * (1-0.98000)) + and + sum(probe_success:burnrate1d{instance=~"modelmesh-controller"}) by (instance) > (3.00 * (1-0.98000)) + for: 1h + labels: + severity: warning + namespace: redhat-ods-applications + diff --git a/tests/prometheus_unit_tests/odh-model-controller-alerting.rules.yaml b/tests/prometheus_unit_tests/odh-model-controller-alerting.rules.yaml new file mode 100644 index 00000000000..5529229d5cd --- /dev/null +++ b/tests/prometheus_unit_tests/odh-model-controller-alerting.rules.yaml @@ -0,0 +1,43 @@ +groups: + - name: SLOs-probe_success_model_controller + rules: + - alert: ODH Model Controller Probe Success 5m and 1h Burn Rate high + annotations: + message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' + triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-odh-controller-probe-success-burn-rate.md" + summary: ODH Model Controller Probe Success 5m and 1h Burn Rate high + expr: | + sum(probe_success:burnrate5m{instance=~"odh-model-controller"}) by (instance) > (14.40 * (1-0.98000)) + and + sum(probe_success:burnrate1h{instance=~"odh-model-controller"}) by (instance) > (14.40 * (1-0.98000)) + for: 2m + labels: + severity: critical + namespace: redhat-ods-applications + - alert: ODH Model Controller Probe Success 30m and 6h Burn Rate high + annotations: + message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' + triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-odh-controller-probe-success-burn-rate.md" + summary: ODH Model Controller Probe Success 30m and 6h Burn Rate high + expr: | + sum(probe_success:burnrate30m{instance=~"odh-model-controller"}) by (instance) > (6.00 * (1-0.98000)) + and + sum(probe_success:burnrate6h{instance=~"odh-model-controller"}) by (instance) > (6.00 * (1-0.98000)) + for: 15m + labels: + severity: critical + namespace: redhat-ods-applications + - alert: ODH Model Controller Probe Success 2h and 1d Burn Rate high + annotations: + message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' 
+ triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-odh-controller-probe-success-burn-rate.md" + summary: ODH Model Controller Probe Success 2h and 1d Burn Rate high + expr: | + sum(probe_success:burnrate2h{instance=~"odh-model-controller"}) by (instance) > (3.00 * (1-0.98000)) + and + sum(probe_success:burnrate1d{instance=~"odh-model-controller"}) by (instance) > (3.00 * (1-0.98000)) + for: 1h + labels: + severity: warning + namespace: redhat-ods-applications + diff --git a/tests/prometheus_unit_tests/ray-alerting.rules.yaml b/tests/prometheus_unit_tests/ray-alerting.rules.yaml new file mode 100644 index 00000000000..c0a2a5ab186 --- /dev/null +++ b/tests/prometheus_unit_tests/ray-alerting.rules.yaml @@ -0,0 +1,13 @@ +groups: +- name: Distributed Workloads Kuberay + interval: 1m + rules: + - alert: KubeRay Operator is not running + expr: absent(up{job=~'KubeRay Operator'}) or up{job=~'KubeRay Operator'} != 1 + labels: + severity: warning + annotations: + description: This alert fires when the KubeRay Operator is not running. + triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/kuberay-operator-availability.md' + summary: Alerting for KubeRay Operator + diff --git a/tests/prometheus_unit_tests/rhods-dashboard-alerting.rules.yaml b/tests/prometheus_unit_tests/rhods-dashboard-alerting.rules.yaml new file mode 100644 index 00000000000..eaf2c791d17 --- /dev/null +++ b/tests/prometheus_unit_tests/rhods-dashboard-alerting.rules.yaml @@ -0,0 +1,110 @@ +groups: + - name: SLOs-haproxy_backend_http_responses_dashboard + rules: + - alert: RHODS Dashboard Route Error 5m and 1h Burn Rate high + annotations: + message: 'High error budget burn for {{ $labels.route }} (current value: {{ $value }}).' + triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-error-burn-rate.md' + summary: RHODS Dashboard Route Error 5m and 1h Burn Rate high + expr: | + sum(haproxy_backend_http_responses_total:burnrate5m{route=~"rhods-dashboard"}) by (route) > (14.40 * (1-0.99950)) + and + sum(haproxy_backend_http_responses_total:burnrate1h{route=~"rhods-dashboard"}) by (route) > (14.40 * (1-0.99950)) + for: 2m + labels: + severity: critical + namespace: redhat-ods-applications + - alert: RHODS Dashboard Route Error 30m and 6h Burn Rate high + annotations: + message: 'High error budget burn for {{ $labels.route }} (current value: {{ $value }}).' + triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-error-burn-rate.md' + summary: RHODS Dashboard Route Error 30m and 6h Burn Rate high + expr: | + sum(haproxy_backend_http_responses_total:burnrate30m{route=~"rhods-dashboard"}) by (route) > (6.00 * (1-0.99950)) + and + sum(haproxy_backend_http_responses_total:burnrate6h{route=~"rhods-dashboard"}) by (route) > (6.00 * (1-0.99950)) + for: 15m + labels: + severity: critical + namespace: redhat-ods-applications + - alert: RHODS Dashboard Route Error 2h and 1d Burn Rate high + annotations: + message: 'High error budget burn for {{ $labels.route }} (current value: {{ $value }}).' 
+          triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-error-burn-rate.md'
+          summary: RHODS Dashboard Route Error 2h and 1d Burn Rate high
+        expr: |
+          sum(haproxy_backend_http_responses_total:burnrate2h{route=~"rhods-dashboard"}) by (route) > (3.00 * (1-0.99950))
+          and
+          sum(haproxy_backend_http_responses_total:burnrate1d{route=~"rhods-dashboard"}) by (route) > (3.00 * (1-0.99950))
+        for: 1h
+        labels:
+          severity: warning
+          namespace: redhat-ods-applications
+      - alert: RHODS Dashboard Route Error 6h and 3d Burn Rate high
+        annotations:
+          message: 'High error budget burn for {{ $labels.route }} (current value: {{ $value }}).'
+          triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-error-burn-rate.md'
+          summary: RHODS Dashboard Route Error 6h and 3d Burn Rate high
+        expr: |
+          sum(haproxy_backend_http_responses_total:burnrate6h{route=~"rhods-dashboard"}) by (route) > (1.00 * (1-0.99950))
+          and
+          sum(haproxy_backend_http_responses_total:burnrate3d{route=~"rhods-dashboard"}) by (route) > (1.00 * (1-0.99950))
+        for: 3h
+        labels:
+          severity: warning
+          namespace: redhat-ods-applications
+  - name: SLOs-probe_success_dashboard
+    rules:
+      - alert: RHODS Dashboard Probe Success 5m and 1h Burn Rate high
+        annotations:
+          message: 'High error budget burn for {{ $labels.name }} (current value: {{ $value }}).'
+          triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-dashboard-probe-success-burn-rate.md"
+          summary: RHODS Dashboard Probe Success 5m and 1h Burn Rate high
+        expr: |
+          sum(probe_success:burnrate5m{name=~"rhods-dashboard"}) by (name) > (14.40 * (1-0.98))
+          and
+          sum(probe_success:burnrate1h{name=~"rhods-dashboard"}) by (name) > (14.40 * (1-0.98))
+        for: 2m
+        labels:
+          severity: critical
+          namespace: redhat-ods-applications
+      - alert: RHODS Dashboard Probe Success 30m and 6h Burn Rate high
+        annotations:
+          message: 'High error budget burn for {{ $labels.name }} (current value: {{ $value }}).'
+          triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-dashboard-probe-success-burn-rate.md"
+          summary: RHODS Dashboard Probe Success 30m and 6h Burn Rate high
+        expr: |
+          sum(probe_success:burnrate30m{name=~"rhods-dashboard"}) by (name) > (6.00 * (1-0.98))
+          and
+          sum(probe_success:burnrate6h{name=~"rhods-dashboard"}) by (name) > (6.00 * (1-0.98))
+        for: 15m
+        labels:
+          severity: critical
+          namespace: redhat-ods-applications
+      - alert: RHODS Dashboard Probe Success 2h and 1d Burn Rate high
+        annotations:
+          message: 'High error budget burn for {{ $labels.name }} (current value: {{ $value }}).'
+          triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-dashboard-probe-success-burn-rate.md"
+          summary: RHODS Dashboard Probe Success 2h and 1d Burn Rate high
+        expr: |
+          sum(probe_success:burnrate2h{name=~"rhods-dashboard"}) by (name) > (3.00 * (1-0.98))
+          and
+          sum(probe_success:burnrate1d{name=~"rhods-dashboard"}) by (name) > (3.00 * (1-0.98))
+        for: 1h
+        labels:
+          severity: warning
+          namespace: redhat-ods-applications
+      - alert: RHODS Dashboard Probe Success 6h and 3d Burn Rate high
+        annotations:
+          message: 'High error budget burn for {{ $labels.name }} (current value: {{ $value }}).'
+          triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-dashboard-probe-success-burn-rate.md"
+          summary: RHODS Dashboard Probe Success 6h and 3d Burn Rate high
+        expr: |
+          sum(probe_success:burnrate6h{name=~"rhods-dashboard"}) by (name) > (1.00 * (1-0.98))
+          and
+          sum(probe_success:burnrate3d{name=~"rhods-dashboard"}) by (name) > (1.00 * (1-0.98))
+        for: 3h
+        labels:
+          severity: warning
+          namespace: redhat-ods-applications
+
diff --git a/tests/prometheus_unit_tests/trainingoperator-alerting.rules.yaml b/tests/prometheus_unit_tests/trainingoperator-alerting.rules.yaml
new file mode 100644
index 00000000000..d13a30b3c45
--- /dev/null
+++ b/tests/prometheus_unit_tests/trainingoperator-alerting.rules.yaml
@@ -0,0 +1,13 @@
+groups:
+- name: KubeFlow Training Operator
+  interval: 1m
+  rules:
+  - alert: KubeFlow Training Operator is not running
+    expr: absent(up{job=~'KubeFlow Training Operator'}) or up{job=~'KubeFlow Training Operator'} != 1
+    labels:
+      severity: warning
+    annotations:
+      description: This alert fires when the KubeFlow Training Operator is not running.
+      triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/training-operator-availability.md'
+      summary: Alerting for KubeFlow Training Operator
+
diff --git a/tests/prometheus_unit_tests/trustyai-alerting.rules.yaml b/tests/prometheus_unit_tests/trustyai-alerting.rules.yaml
new file mode 100644
index 00000000000..adc3f419510
--- /dev/null
+++ b/tests/prometheus_unit_tests/trustyai-alerting.rules.yaml
@@ -0,0 +1,43 @@
+groups:
+  - name: SLOs-probe_success_trustyai
+    rules:
+      - alert: TrustyAI Controller Probe Success 5m and 1h Burn Rate high
+        annotations:
+          message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).'
+          triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhoai-trustyai-controller-probe-success-burn-rate.md"
+          summary: TrustyAI Controller Probe Success 5m and 1h Burn Rate high
+        expr: |
+          sum(probe_success:burnrate5m{instance=~"trustyai-service-operator-controller-manager"}) by (instance) > (14.40 * (1-0.98000))
+          and
+          sum(probe_success:burnrate1h{instance=~"trustyai-service-operator-controller-manager"}) by (instance) > (14.40 * (1-0.98000))
+        for: 2m
+        labels:
+          severity: critical
+          instance: trustyai-service-operator-controller-manager
+      - alert: TrustyAI Controller Probe Success 30m and 6h Burn Rate high
+        annotations:
+          message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).'
+          triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhoai-trustyai-controller-probe-success-burn-rate.md"
+          summary: TrustyAI Controller Probe Success 30m and 6h Burn Rate high
+        expr: |
+          sum(probe_success:burnrate30m{instance=~"trustyai-service-operator-controller-manager"}) by (instance) > (6.00 * (1-0.98000))
+          and
+          sum(probe_success:burnrate6h{instance=~"trustyai-service-operator-controller-manager"}) by (instance) > (6.00 * (1-0.98000))
+        for: 15m
+        labels:
+          severity: critical
+          instance: trustyai-service-operator-controller-manager
+      - alert: TrustyAI Controller Probe Success 2h and 1d Burn Rate high
+        annotations:
+          message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).'
+          triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhoai-trustyai-controller-probe-success-burn-rate.md"
+          summary: TrustyAI Controller Probe Success 2h and 1d Burn Rate high
+        expr: |
+          sum(probe_success:burnrate2h{instance=~"trustyai-service-operator-controller-manager"}) by (instance) > (3.00 * (1-0.98000))
+          and
+          sum(probe_success:burnrate1d{instance=~"trustyai-service-operator-controller-manager"}) by (instance) > (3.00 * (1-0.98000))
+        for: 1h
+        labels:
+          severity: warning
+          instance: trustyai-service-operator-controller-manager
+
diff --git a/tests/prometheus_unit_tests/workbenches-alerting.rules.yaml b/tests/prometheus_unit_tests/workbenches-alerting.rules.yaml
new file mode 100644
index 00000000000..a46af8fd2ed
--- /dev/null
+++ b/tests/prometheus_unit_tests/workbenches-alerting.rules.yaml
@@ -0,0 +1,102 @@
+groups:
+  - name: RHODS-PVC-Usage
+    rules:
+      - alert: User notebook pvc usage above 90%
+        annotations:
+          message: 'The user notebook {{ $labels.persistentvolumeclaim }} is using 90% of its Volume. You might want to decrease the amount of data stored on the server or you can reach out to your cluster admin to increase the storage capacity to prevent disruptions and loss of data. Please back up your data before increasing the storage limit.'
+          triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/tree/main/RHODS"
+          summary: User notebook pvc usage above 90%
+        expr: kubelet_volume_stats_used_bytes{persistentvolumeclaim=~".*jupyterhub-nb-.*"} / kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~".*jupyterhub-nb-.*"} > 0.9 and kubelet_volume_stats_used_bytes{persistentvolumeclaim=~".*jupyterhub-nb-.*"} / kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~".*jupyterhub-nb-.*"} < 0.99
+        for: 2m
+        labels:
+          severity: warning
+          route: user-notifications
+      - alert: User notebook pvc usage at 100%
+        annotations:
+          message: 'The user notebook {{ $labels.persistentvolumeclaim }} is using 100% of its Volume. You might want to decrease the amount of data stored on the server or you can reach out to your cluster admin to increase the storage capacity to prevent disruptions and loss of data. Please back up your data before increasing the storage limit.'
+          triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/tree/main/RHODS"
+          summary: User notebook pvc usage at 100%
+        expr: kubelet_volume_stats_used_bytes{persistentvolumeclaim=~".*jupyterhub-nb-.*"}/kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~".*jupyterhub-nb-.*"} > 0.99
+        for: 2m
+        labels:
+          severity: warning
+          route: user-notifications
+
+  - name: RHODS Notebook controllers
+    rules:
+      - alert: Kubeflow notebook controller pod is not running
+        annotations:
+          message: 'Kubeflow Notebook controller is down!'
+          triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Jupyter/rhods-kfnbc-notebook-controller-alert.md"
+          summary: Kubeflow notebook controller pod is not running
+        expr: absent(up{job=~'Kubeflow Notebook Controller Service Metrics'})
+        for: 5m
+        labels:
+          severity: warning
+          namespace: redhat-ods-applications
+      - alert: ODH notebook controller pod is not running
+        annotations:
+          message: 'ODH notebook controller is down!'
+          triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Jupyter/rhods-odh-notebook-controller-alert.md"
+          summary: ODH notebook controller pod is not running
+        expr: absent(up{job=~'ODH Notebook Controller Service Metrics'})
+        for: 5m
+        labels:
+          severity: warning
+          namespace: redhat-ods-applications
+
+  - name: SLOs-probe_success_workbench
+    rules:
+      - alert: RHODS Jupyter Probe Success 5m and 1h Burn Rate high
+        annotations:
+          message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).'
+          triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Jupyter/rhods-jupyter-probe-success-burn-rate.md"
+          summary: RHODS Jupyter Probe Success 5m and 1h Burn Rate high
+        expr: |
+          sum(probe_success:burnrate5m{instance=~"notebook-spawner"}) by (instance) > (14.40 * (1-0.98000))
+          and
+          sum(probe_success:burnrate1h{instance=~"notebook-spawner"}) by (instance) > (14.40 * (1-0.98000))
+        for: 2m
+        labels:
+          severity: critical
+          instance: notebook-spawner
+      - alert: RHODS Jupyter Probe Success 30m and 6h Burn Rate high
+        annotations:
+          message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).'
+          triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Jupyter/rhods-jupyter-probe-success-burn-rate.md"
+          summary: RHODS Jupyter Probe Success 30m and 6h Burn Rate high
+        expr: |
+          sum(probe_success:burnrate30m{instance=~"notebook-spawner"}) by (instance) > (6.00 * (1-0.98000))
+          and
+          sum(probe_success:burnrate6h{instance=~"notebook-spawner"}) by (instance) > (6.00 * (1-0.98000))
+        for: 15m
+        labels:
+          severity: critical
+          instance: notebook-spawner
+      - alert: RHODS Jupyter Probe Success 2h and 1d Burn Rate high
+        annotations:
+          message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).'
+          triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Jupyter/rhods-jupyter-probe-success-burn-rate.md"
+          summary: RHODS Jupyter Probe Success 2h and 1d Burn Rate high
+        expr: |
+          sum(probe_success:burnrate2h{instance=~"notebook-spawner"}) by (instance) > (3.00 * (1-0.98000))
+          and
+          sum(probe_success:burnrate1d{instance=~"notebook-spawner"}) by (instance) > (3.00 * (1-0.98000))
+        for: 1h
+        labels:
+          severity: warning
+          instance: notebook-spawner
+      - alert: RHODS Jupyter Probe Success 6h and 3d Burn Rate high
+        annotations:
+          message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).'
+          triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Jupyter/rhods-jupyter-probe-success-burn-rate.md"
+          summary: RHODS Jupyter Probe Success 6h and 3d Burn Rate high
+        expr: |
+          sum(probe_success:burnrate6h{instance=~"notebook-spawner"}) by (instance) > (1.00 * (1-0.98000))
+          and
+          sum(probe_success:burnrate3d{instance=~"notebook-spawner"}) by (instance) > (1.00 * (1-0.98000))
+        for: 3h
+        labels:
+          severity: warning
+          instance: notebook-spawner
+
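Note: the files above land under tests/prometheus_unit_tests/ but this diff only adds the rule definitions, not the promtool unit tests that would exercise them. The following is a minimal sketch of such a test for the "RHODS Jupyter Probe Success 5m and 1h Burn Rate high" rule, assuming the test file sits next to workbenches-alerting.rules.yaml and that the probe_success:burnrate* recording rules can be injected directly as input series; the file name, sample values, and evaluation times are illustrative assumptions, not part of this change.

# workbenches-alerting.rules.test.yaml (hypothetical file name)
rule_files:
  - workbenches-alerting.rules.yaml

evaluation_interval: 1m

tests:
  - interval: 1m
    # Hold both burn-rate windows at a constant 1, well above the threshold
    # 14.40 * (1 - 0.98) = 0.288, so the alert goes pending immediately and
    # fires once its 2m "for" clause has elapsed.
    input_series:
      - series: 'probe_success:burnrate5m{instance="notebook-spawner"}'
        values: '1x60'
      - series: 'probe_success:burnrate1h{instance="notebook-spawner"}'
        values: '1x60'
    alert_rule_test:
      - eval_time: 10m
        alertname: RHODS Jupyter Probe Success 5m and 1h Burn Rate high
        exp_alerts:
          - exp_labels:
              instance: notebook-spawner
              severity: critical
            exp_annotations:
              message: 'High error budget burn for notebook-spawner (current value: 1).'
              triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Jupyter/rhods-jupyter-probe-success-burn-rate.md"
              summary: RHODS Jupyter Probe Success 5m and 1h Burn Rate high

A file like this would be run with `promtool test rules workbenches-alerting.rules.test.yaml`; keeping one test file per rule file mirrors the layout these additions establish.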