From 83e3e39913aa55957fe06359ae67fb0e4ded0dad Mon Sep 17 00:00:00 2001 From: Dhiraj Bokde Date: Thu, 1 Feb 2024 01:21:59 -0800 Subject: [PATCH] feat: Add ModelRegistry component (#775) (#776) Squashed commit due to buildability since ComponentInterface has changed. Other patches squashed as well to avoid double squashing due to merge policy. modelregistry: regenerate autogenerated files Run `make generate manifests` after all the changes Signed-off-by: Yauheni Kaliuta feat: Add ModelRegistry component (#775) (#776) * feat: Add ModelRegistry component (#775) * fix: Fix modelregistry odh overlays path * fix: fix dsc_create_test tests err nil check * fix: refactor ModelRegistry.ReconcileComponent for new parameters * chore: added modelregistry to README.md * fix: add missing rbac rules for deploymentconfigs and daemonsets * chore: code lint cleanup * fix: added check for nil DevFlags in model-registry component * fix: add nil check for dscispec.DevFlags in model-registry ReconcileComponent * fix: remove RBAC rules for daemonsets and deploymentconfigs * fix(chore): fix lint errors in dsc_deletion_test.go (cherry picked from commit 112d3f14cbe54361d936582dcb44fbd71a69f862) Signed-off-by: Yauheni Kaliuta modelregistry: partial: chore: removes SetImageParamsMap from ComponentInterface (#897) Partial application of already applied commit d10a7643c13db0a5dffdb1700f7dfa8e657907ba Author: Bartosz Majsak Date: Thu Mar 7 15:43:37 2024 +0100 chore: removes SetImageParamsMap from ComponentInterface (#897) As it's not used by any component, acting as a simple pass-return loop. This makes the API contract a bit cleaner. 
Signed-off-by: Yauheni Kaliuta modelregistry: partial: chore: remove the need of passing rest config (#895) Partial application of already applied commit ca7fa9868489fe8e1ec5f67e64773b2e84062cf6 Author: Bartosz Majsak Date: Fri Mar 8 17:40:54 2024 +0100 chore: remove the need of passing rest config (#895) * chore: fixes ComponentInterface docs By removing reference to non-existing func. This function has been in use outside of this component. * fix: removes rest config As we are already using client.Client interface we do not have to instantiate other typed clients to e.g. list resources using their own funcs. Generic client.Client is sufficient for these needs. Additionally this change adds ctx propagation for these calls. Signed-off-by: Yauheni Kaliuta modelregistry: partial: feat(logger): for both controller level and component level (#837) Partial application of already applied commit d8a83a28d2e3d312040886f9769c19c34fe12522 Author: Wen Zhou Date: Mon Apr 1 22:06:16 2024 +0200 feat(logger): for both controller level and component level (#837) * feat(logger): for both controller level and component level Signed-off-by: Wen Zhou * update(logger): use logr instead of uber's zap Signed-off-by: Wen Zhou * update: do not log error only print Signed-off-by: Wen Zhou * update: use zap.Options for both and tune levels Signed-off-by: Wen Zhou * update: move setting into common function Signed-off-by: Wen Zhou Signed-off-by: Zhou, Wen --------- Signed-off-by: Wen Zhou Signed-off-by: Zhou, Wen Signed-off-by: Yauheni Kaliuta update(modelregistry): rename image name (#877) Signed-off-by: Wen Zhou (cherry picked from commit b4e4d6f590f5ffae2282125e1f1b7753fd4cb962) modelregistry: partial: chore: cleanup duplicated functions packages and add more for godoc (#981) Partial application of already applied commit 96c85f2ab0324334f207fcd800e8998086ee9f8f Author: Wen Zhou Date: Tue Apr 23 14:05:24 2024 +0200 chore: cleanup duplicated functions packages and add more for godoc (#981) * 
chore: cleanup duplicated functions/package and add godoc - move GetPlatform() from deploy package to cluster package - move const ManagedRhods SelfManagedRhods OpenDataHub from deploy to cluster package - move WaitForDeploymentAvailable() from monitoring package to cluster package - remove monitoring package - move UpdatePodSecurityRolebinding() from common package to cluster package - deprecate GetDomain from common package, to only use GetDomain from cluster package. - remove gvk package, move its GVK to cluster package - move DeleteExistingSubscription() from deploy package to upgrade package - do not export getSubscription() Signed-off-by: Wen Zhou * update: remove gvk into one file but under cluster package Signed-off-by: Wen Zhou * update: rename variable, removing GVK from it Signed-off-by: Wen Zhou * update: move gvk into a sub package under cluster Signed-off-by: Wen Zhou --------- Signed-off-by: Wen Zhou Signed-off-by: Yauheni Kaliuta feat(mr): create namespace for Model Registry (#930) * feat(mr): create namespace for smm Signed-off-by: Wen Zhou * fix: rebase Signed-off-by: Zhou, Wen * update: code review comments Signed-off-by: Wen Zhou * fix(doc): wrong comments Signed-off-by: Wen Zhou * update: remove label to keep namespace even when the operator is uninstalled Signed-off-by: Wen Zhou --------- Signed-off-by: Wen Zhou Signed-off-by: Zhou, Wen (cherry picked from commit 1188ce1576078a5fc16872fca78c4c004aa0c721) feat(mr): add model registry odh extras manifests, fixes RHOAIENG-5112 (#953) (cherry picked from commit 7c3e81b29345ed3d34e2f5d3a3b5750a5aa90202) modelregistry: partial: chore: Open up util functions for context propagation (#1033) Partial application of already applied commit 105adaec65c94f60ebc32c27ff15a51c0e1a752c Author: Aslak Knutsen Date: Tue Jun 4 15:16:21 2024 +0200 chore: Open up util functions for context propagation (#1033) context should be determined by the caller and propagated down the call chain. 
Signed-off-by: Yauheni Kaliuta modelregistry: partial: chore: remove duplicated platform call in each component (#1055) Partial application of already applied commit 1b04761fb609ed1693956257f541f1e92e4aa2f9 Author: Wen Zhou Date: Fri Jun 14 14:47:33 2024 +0200 chore: remove duplicated platform call in each component (#1055) - get in DSC and pass into component Signed-off-by: Wen Zhou Signed-off-by: Yauheni Kaliuta modelregistry: update api docs run `make api-docs` add +groupName=datasciencecluster.opendatahub.io On backporting of 1b86e4214436 ("Update readme.md (#890)") Signed-off-by: Yauheni Kaliuta modelregistry: partial: chore(lint): enable contextcheck and containedctx (#1070) Partial application of already applied: commit 06e21a490df651c36474c90b9314fa7230fcb851 Author: Luca Burgazzoli Date: Tue Jun 25 17:15:13 2024 +0200 chore(lint): enable contextcheck and containedctx (#1070) * chore(lint): enable contextcheck Signed-off-by: Luca Burgazzoli * chore(lint): enable containedctx Signed-off-by: Luca Burgazzoli * Fix PR review findings * Fix rebase --------- Signed-off-by: Luca Burgazzoli Signed-off-by: Yauheni Kaliuta refactor: dashboard with new manifests structure (#1065) Partial application of already applied: commit 438f4c26cb4e6c31831aed9a338a86f50a9cc2a9 Author: Wen Zhou Date: Tue Jul 2 16:56:25 2024 +0200 refactor: dashboard with new manifests structure (#1065) * refactor: dashboard with new manifests structure - change type of platform, skip convert to string - add more support for ApplyParam() to not only take ENV but also anything from ExtraParamMaps * update: simplify override function * update: add value for Unknown platform --------- Signed-off-by: Wen Zhou Signed-off-by: Yauheni Kaliuta feat: add managed model registry prometheus config handling logic, part of RHOAIENG-4273 (#1150) (cherry picked from commit 72fc80f489b78879701349233ad45d30825a823e) Adjusted Kueue and TrainingOperator rules Signed-off-by: Yauheni Kaliuta feat: add default cert for 
model registry, fixes RHOAIENG-9909 (#1165) Conflicts: ApplyParams arguments due to missing: d84cd337ff1d ("update: remove unnecessary param from ApplyParams() (#1180)") * feat: add default cert for model registry, fixes RHOAIENG-9909 * fix: fixed lint errors * fix: add servicemesh feature check for MR, add MR enable check in e2e default cert test * fix: changed MR servicemesh status check to look for Managed state * fix: ignore missing model-registry default cert if already removed (cherry picked from commit 4c411a63bd67d0aec5a66a736a70db0b474230c3) feat: add servicemeshmember for model registry namespace, fixes RHOAIENG-11831 (#1202) * feat: add servicemeshmember for model registry namespace, fixes RHOAIENG-11831 * fix: ignore error if MR smm already exists * code cleanup for readability Co-authored-by: Bartosz Majsak * Avoid shadowing package name in variable Co-authored-by: Bartosz Majsak * chore: rename createServicemeshMember to enrollToServiceMesh, add log messages --------- Co-authored-by: Bartosz Majsak (cherry picked from commit 8f3d01395e615432f7e5bf898401e48c39a6e446) feat: add managed model registry prometheus job, metrics, and alerting rules, fixes RHOAIENG-4273 (cherry picked from commit f811d673ba5a1b16e2e6800d25e72925d4f9af87) --- README.md | 2 + .../v1/datasciencecluster_types.go | 4 + .../v1/zz_generated.deepcopy.go | 1 + ...er.opendatahub.io_datascienceclusters.yaml | 43 ++++ .../rhods-operator.clusterserviceversion.yaml | 29 +++ components/component.go | 2 + components/modelregistry/modelregistry.go | 211 ++++++++++++++++++ .../resources/servicemesh-member.tmpl.yaml | 9 + .../modelregistry/zz_generated.deepcopy.go | 40 ++++ ...er.opendatahub.io_datascienceclusters.yaml | 44 ++++ .../prometheus/apps/prometheus-configs.yaml | 114 ++++++++++ config/rbac/role.yaml | 26 +++ ...asciencecluster_v1_datasciencecluster.yaml | 2 + .../datasciencecluster/kubebuilder_rbac.go | 4 + docs/api-overview.md | 25 +++ get_all_manifests.sh | 3 +- 
pkg/upgrade/upgrade.go | 4 + tests/e2e/dsc_creation_test.go | 68 ++++++ tests/e2e/helper_test.go | 6 + 19 files changed, 636 insertions(+), 1 deletion(-) create mode 100644 components/modelregistry/modelregistry.go create mode 100644 components/modelregistry/resources/servicemesh-member.tmpl.yaml create mode 100644 components/modelregistry/zz_generated.deepcopy.go diff --git a/README.md b/README.md index 6fde201eb96..25ab15603a7 100644 --- a/README.md +++ b/README.md @@ -317,6 +317,8 @@ spec: managementState: Managed workbenches: managementState: Managed + modelregistry: + managementState: Managed ``` 2. Enable only Dashboard and Workbenches diff --git a/apis/datasciencecluster/v1/datasciencecluster_types.go b/apis/datasciencecluster/v1/datasciencecluster_types.go index b24633ad56b..dcdf4b9b057 100644 --- a/apis/datasciencecluster/v1/datasciencecluster_types.go +++ b/apis/datasciencecluster/v1/datasciencecluster_types.go @@ -31,6 +31,7 @@ import ( "github.com/opendatahub-io/opendatahub-operator/v2/components/kserve" "github.com/opendatahub-io/opendatahub-operator/v2/components/kueue" "github.com/opendatahub-io/opendatahub-operator/v2/components/modelmeshserving" + "github.com/opendatahub-io/opendatahub-operator/v2/components/modelregistry" "github.com/opendatahub-io/opendatahub-operator/v2/components/ray" "github.com/opendatahub-io/opendatahub-operator/v2/components/trainingoperator" "github.com/opendatahub-io/opendatahub-operator/v2/components/trustyai" @@ -80,6 +81,9 @@ type Components struct { //Training Operator component configuration. TrainingOperator trainingoperator.TrainingOperator `json:"trainingoperator,omitempty"` + + // ModelRegistry component configuration. + ModelRegistry modelregistry.ModelRegistry `json:"modelregistry,omitempty"` } // DataScienceClusterStatus defines the observed state of DataScienceCluster. 
diff --git a/apis/datasciencecluster/v1/zz_generated.deepcopy.go b/apis/datasciencecluster/v1/zz_generated.deepcopy.go index ccb61f7b60b..38fae1c3f09 100644 --- a/apis/datasciencecluster/v1/zz_generated.deepcopy.go +++ b/apis/datasciencecluster/v1/zz_generated.deepcopy.go @@ -40,6 +40,7 @@ func (in *Components) DeepCopyInto(out *Components) { in.Ray.DeepCopyInto(&out.Ray) in.TrustyAI.DeepCopyInto(&out.TrustyAI) in.TrainingOperator.DeepCopyInto(&out.TrainingOperator) + in.ModelRegistry.DeepCopyInto(&out.ModelRegistry) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Components. diff --git a/bundle/manifests/datasciencecluster.opendatahub.io_datascienceclusters.yaml b/bundle/manifests/datasciencecluster.opendatahub.io_datascienceclusters.yaml index 6972bf35135..0a9ba701939 100644 --- a/bundle/manifests/datasciencecluster.opendatahub.io_datascienceclusters.yaml +++ b/bundle/manifests/datasciencecluster.opendatahub.io_datascienceclusters.yaml @@ -385,6 +385,49 @@ spec: pattern: ^(Managed|Unmanaged|Force|Removed)$ type: string type: object + modelregistry: + description: ModelRegistry component configuration. + properties: + devFlags: + description: Add developer fields + properties: + manifests: + description: List of custom manifests for the given component + items: + properties: + contextDir: + default: "" + description: contextDir is the relative path to + the folder containing manifests in a repository + type: string + sourcePath: + default: "" + description: 'sourcePath is the subpath within contextDir + where kustomize builds start. Examples include + any sub-folder or path: `base`, `overlays/dev`, + `default`, `odh` etc' + type: string + uri: + default: "" + description: uri is the URI point to a git repo + with tag/branch. 
e.g https://github.com/org/repo/tarball/ + type: string + type: object + type: array + type: object + managementState: + description: "Set to one of the following values: \n - \"Managed\" + : the operator is actively managing the component and trying + to keep it active. It will only upgrade the component if + it is safe to do so \n - \"Removed\" : the operator is actively + managing the component and will not install it, or if it + is installed, the operator will try to remove it" + enum: + - Managed + - Removed + pattern: ^(Managed|Unmanaged|Force|Removed)$ + type: string + type: object ray: description: Ray component configuration. properties: diff --git a/bundle/manifests/rhods-operator.clusterserviceversion.yaml b/bundle/manifests/rhods-operator.clusterserviceversion.yaml index 3df62a61064..b35ff2da79a 100644 --- a/bundle/manifests/rhods-operator.clusterserviceversion.yaml +++ b/bundle/manifests/rhods-operator.clusterserviceversion.yaml @@ -46,6 +46,9 @@ metadata: "modelmeshserving": { "managementState": "Managed" }, + "modelregistry": { + "managementState": "Removed" + }, "ray": { "managementState": "Managed" }, @@ -1032,6 +1035,32 @@ spec: - update - use - watch + - apiGroups: + - modelregistry.opendatahub.io + resources: + - modelregistries + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - modelregistry.opendatahub.io + resources: + - modelregistries/finalizers + verbs: + - update + - apiGroups: + - modelregistry.opendatahub.io + resources: + - modelregistries/status + verbs: + - get + - patch + - update - apiGroups: - monitoring.coreos.com resources: diff --git a/components/component.go b/components/component.go index 39eb85a08d9..c8fe864ec12 100644 --- a/components/component.go +++ b/components/component.go @@ -133,6 +133,8 @@ func (c *Component) UpdatePrometheusConfig(_ client.Client, enable bool, compone TrustyAIARules string `yaml:"trustyai-alerting.rules"` KserveRRules string 
`yaml:"kserve-recording.rules"` KserveARules string `yaml:"kserve-alerting.rules"` + ModelRegistryRRules string `yaml:"model-registry-operator-recording.rules"` + ModelRegistryARules string `yaml:"model-registry-operator-alerting.rules"` } `yaml:"data"` } var configMap ConfigMap diff --git a/components/modelregistry/modelregistry.go b/components/modelregistry/modelregistry.go new file mode 100644 index 00000000000..89ded5e9fa2 --- /dev/null +++ b/components/modelregistry/modelregistry.go @@ -0,0 +1,211 @@ +// Package modelregistry provides utility functions to config ModelRegistry, an ML Model metadata repository service +// +groupName=datasciencecluster.opendatahub.io +package modelregistry + +import ( + "context" + "errors" + "fmt" + "path/filepath" + "strings" + "text/template" + + "github.com/go-logr/logr" + operatorv1 "github.com/openshift/api/operator/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + + dsciv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/dscinitialization/v1" + infrav1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/infrastructure/v1" + "github.com/opendatahub-io/opendatahub-operator/v2/components" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/conversion" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/deploy" + + _ "embed" +) + +const DefaultModelRegistryCert = "default-modelregistry-cert" + +var ( + ComponentName = "model-registry-operator" + Path = deploy.DefaultManifestPath + "/" + ComponentName + "/overlays/odh" + // we should not apply this label to the namespace, as it triggered namspace deletion during operator uninstall + // modelRegistryLabels = cluster.WithLabels( + // labels.ODH.OwnedNamespace, "true", + // ). + ModelRegistriesNamespace = "odh-model-registries" +) + +// Verifies that ModelRegistry implements ComponentInterface. 
+var _ components.ComponentInterface = (*ModelRegistry)(nil) + +// ModelRegistry struct holds the configuration for the ModelRegistry component. +// +kubebuilder:object:generate=true +type ModelRegistry struct { + components.Component `json:""` +} + +func (m *ModelRegistry) OverrideManifests(ctx context.Context, _ cluster.Platform) error { + // If devflags are set, update default manifests path + if len(m.DevFlags.Manifests) != 0 { + manifestConfig := m.DevFlags.Manifests[0] + if err := deploy.DownloadManifests(ctx, ComponentName, manifestConfig); err != nil { + return err + } + // If overlay is defined, update paths + defaultKustomizePath := "overlays/odh" + if manifestConfig.SourcePath != "" { + defaultKustomizePath = manifestConfig.SourcePath + } + Path = filepath.Join(deploy.DefaultManifestPath, ComponentName, defaultKustomizePath) + } + + return nil +} + +func (m *ModelRegistry) GetComponentName() string { + return ComponentName +} + +func (m *ModelRegistry) ReconcileComponent(ctx context.Context, cli client.Client, logger logr.Logger, + owner metav1.Object, dscispec *dsciv1.DSCInitializationSpec, platform cluster.Platform, _ bool) error { + l := m.ConfigComponentLogger(logger, ComponentName, dscispec) + var imageParamMap = map[string]string{ + "IMAGES_MODELREGISTRY_OPERATOR": "RELATED_IMAGE_ODH_MODEL_REGISTRY_OPERATOR_IMAGE", + "IMAGES_GRPC_SERVICE": "RELATED_IMAGE_ODH_MLMD_GRPC_SERVER_IMAGE", + "IMAGES_REST_SERVICE": "RELATED_IMAGE_ODH_MODEL_REGISTRY_IMAGE", + } + enabled := m.GetManagementState() == operatorv1.Managed + monitoringEnabled := dscispec.Monitoring.ManagementState == operatorv1.Managed + + if enabled { + // return error if ServiceMesh is not enabled, as it's a required feature + if dscispec.ServiceMesh == nil || dscispec.ServiceMesh.ManagementState != operatorv1.Managed { + return errors.New("ServiceMesh needs to be set to 'Managed' in DSCI CR, it is required by Model Registry") + } + + if err := m.createDependencies(ctx, cli, dscispec); err != 
nil { + return err + } + + if m.DevFlags != nil { + // Download manifests and update paths + if err := m.OverrideManifests(ctx, platform); err != nil { + return err + } + } + + // Update image parameters only when we do not have customized manifests set + if (dscispec.DevFlags == nil || dscispec.DevFlags.ManifestsUri == "") && (m.DevFlags == nil || len(m.DevFlags.Manifests) == 0) { + extraParamsMap := map[string]string{ + "DEFAULT_CERT": DefaultModelRegistryCert, + } + if err := deploy.ApplyParams(Path, imageParamMap, false, extraParamsMap); err != nil { + return fmt.Errorf("failed to update image from %s : %w", Path, err) + } + } + + // Create model registries namespace + // We do not delete this namespace even when ModelRegistry is Removed or when operator is uninstalled. + ns, err := cluster.CreateNamespace(ctx, cli, ModelRegistriesNamespace) + if err != nil { + return err + } + l.Info("created model registry namespace", "namespace", ModelRegistriesNamespace) + // create servicemeshmember here, for now until post MVP solution + err = enrollToServiceMesh(ctx, cli, dscispec, ns) + if err != nil { + return err + } + l.Info("created model registry servicemesh member", "namespace", ModelRegistriesNamespace) + } else { + err := m.removeDependencies(ctx, cli, dscispec) + if err != nil { + return err + } + } + + // Deploy ModelRegistry Operator + if err := deploy.DeployManifestsFromPath(ctx, cli, owner, Path, dscispec.ApplicationsNamespace, m.GetComponentName(), enabled); err != nil { + return err + } + l.Info("apply manifests done") + + // Create additional model registry resources, componentEnabled=true because these extras are never deleted! 
+ if err := deploy.DeployManifestsFromPath(ctx, cli, owner, Path+"/extras", dscispec.ApplicationsNamespace, m.GetComponentName(), true); err != nil { + return err + } + l.Info("apply extra manifests done") + + // CloudService Monitoring handling + if platform == cluster.ManagedRhods { + if enabled { + if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 10, 1); err != nil { + return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err) + } + l.Info("deployment is done, updating monitoring rules") + } + if err := m.UpdatePrometheusConfig(cli, enabled && monitoringEnabled, ComponentName); err != nil { + return err + } + if err := deploy.DeployManifestsFromPath(ctx, cli, owner, + filepath.Join(deploy.DefaultManifestPath, "monitoring", "prometheus", "apps"), + dscispec.Monitoring.Namespace, + "prometheus", true); err != nil { + return err + } + l.Info("updating SRE monitoring done") + } + return nil +} + +func (m *ModelRegistry) createDependencies(ctx context.Context, cli client.Client, dscispec *dsciv1.DSCInitializationSpec) error { + // create DefaultModelRegistryCert + if err := cluster.PropagateDefaultIngressCertificate(ctx, cli, DefaultModelRegistryCert, dscispec.ServiceMesh.ControlPlane.Namespace); err != nil { + return err + } + return nil +} + +func (m *ModelRegistry) removeDependencies(ctx context.Context, cli client.Client, dscispec *dsciv1.DSCInitializationSpec) error { + // delete DefaultModelRegistryCert + certSecret := corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: DefaultModelRegistryCert, + Namespace: dscispec.ServiceMesh.ControlPlane.Namespace, + }, + } + // ignore error if the secret has already been removed + if err := cli.Delete(ctx, &certSecret); client.IgnoreNotFound(err) != nil { + return err + } + return nil +} + +//go:embed resources/servicemesh-member.tmpl.yaml +var smmTemplate string + +func enrollToServiceMesh(ctx context.Context, cli client.Client, dscispec 
*dsciv1.DSCInitializationSpec, namespace *corev1.Namespace) error { + tmpl, err := template.New("servicemeshmember").Parse(smmTemplate) + if err != nil { + return fmt.Errorf("error parsing servicemeshmember template: %w", err) + } + builder := strings.Builder{} + controlPlaneData := struct { + Namespace string + ControlPlane *infrav1.ControlPlaneSpec + }{Namespace: namespace.Name, ControlPlane: &dscispec.ServiceMesh.ControlPlane} + + if err = tmpl.Execute(&builder, controlPlaneData); err != nil { + return fmt.Errorf("error executing servicemeshmember template: %w", err) + } + + unstrObj, err := conversion.StrToUnstructured(builder.String()) + if err != nil || len(unstrObj) != 1 { + return fmt.Errorf("error converting servicemeshmember template: %w", err) + } + + return client.IgnoreAlreadyExists(cli.Create(ctx, unstrObj[0])) +} diff --git a/components/modelregistry/resources/servicemesh-member.tmpl.yaml b/components/modelregistry/resources/servicemesh-member.tmpl.yaml new file mode 100644 index 00000000000..8665f2ba54f --- /dev/null +++ b/components/modelregistry/resources/servicemesh-member.tmpl.yaml @@ -0,0 +1,9 @@ +apiVersion: maistra.io/v1 +kind: ServiceMeshMember +metadata: + name: default + namespace: {{.Namespace}} +spec: + controlPlaneRef: + namespace: {{ .ControlPlane.Namespace }} + name: {{ .ControlPlane.Name }} diff --git a/components/modelregistry/zz_generated.deepcopy.go b/components/modelregistry/zz_generated.deepcopy.go new file mode 100644 index 00000000000..3ed241dd7f1 --- /dev/null +++ b/components/modelregistry/zz_generated.deepcopy.go @@ -0,0 +1,40 @@ +//go:build !ignore_autogenerated +// +build !ignore_autogenerated + +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by controller-gen. DO NOT EDIT. + +package modelregistry + +import () + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ModelRegistry) DeepCopyInto(out *ModelRegistry) { + *out = *in + in.Component.DeepCopyInto(&out.Component) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ModelRegistry. +func (in *ModelRegistry) DeepCopy() *ModelRegistry { + if in == nil { + return nil + } + out := new(ModelRegistry) + in.DeepCopyInto(out) + return out +} diff --git a/config/crd/bases/datasciencecluster.opendatahub.io_datascienceclusters.yaml b/config/crd/bases/datasciencecluster.opendatahub.io_datascienceclusters.yaml index 601118bb015..a50f2d24dd2 100644 --- a/config/crd/bases/datasciencecluster.opendatahub.io_datascienceclusters.yaml +++ b/config/crd/bases/datasciencecluster.opendatahub.io_datascienceclusters.yaml @@ -386,6 +386,50 @@ spec: pattern: ^(Managed|Unmanaged|Force|Removed)$ type: string type: object + modelregistry: + description: ModelRegistry component configuration. 
+ properties: + devFlags: + description: Add developer fields + properties: + manifests: + description: List of custom manifests for the given component + items: + properties: + contextDir: + default: manifests + description: contextDir is the relative path to + the folder containing manifests in a repository, + default value "manifests" + type: string + sourcePath: + default: "" + description: 'sourcePath is the subpath within contextDir + where kustomize builds start. Examples include + any sub-folder or path: `base`, `overlays/dev`, + `default`, `odh` etc.' + type: string + uri: + default: "" + description: uri is the URI point to a git repo + with tag/branch. e.g. https://github.com/org/repo/tarball/ + type: string + type: object + type: array + type: object + managementState: + description: "Set to one of the following values: \n - \"Managed\" + : the operator is actively managing the component and trying + to keep it active. It will only upgrade the component if + it is safe to do so \n - \"Removed\" : the operator is actively + managing the component and will not install it, or if it + is installed, the operator will try to remove it" + enum: + - Managed + - Removed + pattern: ^(Managed|Unmanaged|Force|Removed)$ + type: string + type: object ray: description: Ray component configuration. 
properties: diff --git a/config/monitoring/prometheus/apps/prometheus-configs.yaml b/config/monitoring/prometheus/apps/prometheus-configs.yaml index b19ec0394ba..933421afca7 100644 --- a/config/monitoring/prometheus/apps/prometheus-configs.yaml +++ b/config/monitoring/prometheus/apps/prometheus-configs.yaml @@ -372,6 +372,31 @@ data: target_label: __address__ replacement: ${1}:8080 + - job_name: 'Model Registry Operator' + honor_labels: true + metrics_path: /metrics + scheme: https + tls_config: + insecure_skip_verify: true + params: + module: [http_2xx] + authorization: + credentials_file: /run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - + relabel_configs: + - source_labels: [__meta_kubernetes_service_name] + regex: ^(model-registry-operator-controller-manager-metrics-service)$ + target_label: kubernetes_name + action: keep + - source_labels: [__address__] + regex: (.+):(\d+) + target_label: __address__ + replacement: ${1}:8443 + - job_name: 'RHOAI Metrics' honor_labels: true scheme: http @@ -1577,3 +1602,92 @@ data: labels: severity: warning instance: trustyai-service-operator-controller-manager + + model-registry-operator-recording.rules: | + groups: + - name: SLOs - Model Registry Operator + rules: + - expr: | + absent(up{job=~'Model Registry Operator'}) * 0 or vector(1) + labels: + instance: model-registry-operator + record: probe_success + - expr: | + 1 - min(avg_over_time(probe_success{instance="model-registry-operator"}[1d])) + labels: + instance: model-registry-operator + record: probe_success:burnrate1d + - expr: | + 1 - min(avg_over_time(probe_success{instance="model-registry-operator"}[1h])) + labels: + instance: model-registry-operator + record: probe_success:burnrate1h + - expr: | + 1 - min(avg_over_time(probe_success{instance="model-registry-operator"}[2h])) + labels: + instance: model-registry-operator + record: probe_success:burnrate2h + - expr: | + 1 - 
min(avg_over_time(probe_success{instance="model-registry-operator"}[30m])) + labels: + instance: model-registry-operator + record: probe_success:burnrate30m + - expr: | + 1 - min(avg_over_time(probe_success{instance="model-registry-operator"}[3d])) + labels: + instance: model-registry-operator + record: probe_success:burnrate3d + - expr: | + 1 - min(avg_over_time(probe_success{instance="model-registry-operator"}[5m])) + labels: + instance: model-registry-operator + record: probe_success:burnrate5m + - expr: | + 1 - min(avg_over_time(probe_success{instance="model-registry-operator"}[6h])) + labels: + instance: model-registry-operator + record: probe_success:burnrate6h + + model-registry-operator-alerting.rules: | + groups: + - name: SLOs-probe_success_model_controller + rules: + - alert: Model Registry Operator Probe Success Burn Rate + annotations: + message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' + triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-model-registry-operator-probe-success-burn-rate.md" + summary: Model Registry Operator Probe Success Burn Rate + expr: | + sum(probe_success:burnrate5m{instance=~"model-registry-operator"}) by (instance) > (14.40 * (1-0.98000)) + and + sum(probe_success:burnrate1h{instance=~"model-registry-operator"}) by (instance) > (14.40 * (1-0.98000)) + for: 2m + labels: + severity: critical + namespace: redhat-ods-applications + - alert: Model Registry Operator Probe Success Burn Rate + annotations: + message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' 
+ triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-model-registry-operator-probe-success-burn-rate.md" + summary: Model Registry Operator Probe Success Burn Rate + expr: | + sum(probe_success:burnrate30m{instance=~"model-registry-operator"}) by (instance) > (6.00 * (1-0.98000)) + and + sum(probe_success:burnrate6h{instance=~"model-registry-operator"}) by (instance) > (6.00 * (1-0.98000)) + for: 15m + labels: + severity: critical + namespace: redhat-ods-applications + - alert: Model Registry Operator Probe Success Burn Rate + annotations: + message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' + triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-model-registry-operator-probe-success-burn-rate.md" + summary: Model Registry Operator Probe Success Burn Rate + expr: | + sum(probe_success:burnrate2h{instance=~"model-registry-operator"}) by (instance) > (3.00 * (1-0.98000)) + and + sum(probe_success:burnrate1d{instance=~"model-registry-operator"}) by (instance) > (3.00 * (1-0.98000)) + for: 1h + labels: + severity: warning + namespace: redhat-ods-applications diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index eeac23e42d1..dc61fbb68e1 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -785,6 +785,32 @@ rules: - update - use - watch +- apiGroups: + - modelregistry.opendatahub.io + resources: + - modelregistries + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - modelregistry.opendatahub.io + resources: + - modelregistries/finalizers + verbs: + - update +- apiGroups: + - modelregistry.opendatahub.io + resources: + - modelregistries/status + verbs: + - get + - patch + - update - apiGroups: - monitoring.coreos.com resources: diff --git a/config/samples/datasciencecluster_v1_datasciencecluster.yaml 
b/config/samples/datasciencecluster_v1_datasciencecluster.yaml index 4ffd8b417e8..7838ccfcda6 100644 --- a/config/samples/datasciencecluster_v1_datasciencecluster.yaml +++ b/config/samples/datasciencecluster_v1_datasciencecluster.yaml @@ -39,4 +39,6 @@ spec: workbenches: managementState: "Managed" trustyai: + managementState: "Removed" + modelregistry: managementState: "Removed" diff --git a/controllers/datasciencecluster/kubebuilder_rbac.go b/controllers/datasciencecluster/kubebuilder_rbac.go index d23cd070851..017c752f7f7 100644 --- a/controllers/datasciencecluster/kubebuilder_rbac.go +++ b/controllers/datasciencecluster/kubebuilder_rbac.go @@ -122,6 +122,10 @@ package datasciencecluster // +kubebuilder:rbac:groups="monitoring.coreos.com",resources=probes,verbs=get;create;patch;delete;deletecollection // +kubebuilder:rbac:groups="monitoring.coreos.com",resources=prometheusrules,verbs=get;create;patch;delete;deletecollection +// +kubebuilder:rbac:groups=modelregistry.opendatahub.io,resources=modelregistries,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=modelregistry.opendatahub.io,resources=modelregistries/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=modelregistry.opendatahub.io,resources=modelregistries/finalizers,verbs=update + // +kubebuilder:rbac:groups="monitoring.coreos.com",resources=prometheuses/finalizers,verbs=get;create;patch;delete;deletecollection // +kubebuilder:rbac:groups="monitoring.coreos.com",resources=prometheuses/status,verbs=get;create;patch;delete;deletecollection diff --git a/docs/api-overview.md b/docs/api-overview.md index 57c93b9c08f..ead3204d4fb 100644 --- a/docs/api-overview.md +++ b/docs/api-overview.md @@ -49,6 +49,7 @@ _Appears in:_ - [Kserve](#kserve) - [Kueue](#kueue) - [ModelMeshServing](#modelmeshserving) +- [ModelRegistry](#modelregistry) - [Ray](#ray) - [TrainingOperator](#trainingoperator) - [TrustyAI](#trustyai) @@ -230,6 +231,29 @@ _Appears in:_ +## 
datasciencecluster.opendatahub.io/modelregistry + +Package modelregistry provides utility functions to config ModelRegistry, an ML Model metadata repository service + + + +#### ModelRegistry + + + +ModelRegistry struct holds the configuration for the ModelRegistry component. + + + +_Appears in:_ +- [Components](#components) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `Component` _[Component](#component)_ | | | | + + + ## datasciencecluster.opendatahub.io/ray Package ray provides utility functions to config Ray as part of the stack @@ -356,6 +380,7 @@ _Appears in:_ | `ray` _[Ray](#ray)_ | Ray component configuration. | | | | `trustyai` _[TrustyAI](#trustyai)_ | TrustyAI component configuration. | | | | `trainingoperator` _[TrainingOperator](#trainingoperator)_ | Training Operator component configuration. | | | +| `modelregistry` _[ModelRegistry](#modelregistry)_ | ModelRegistry component configuration. | | | #### ControlPlaneSpec diff --git a/get_all_manifests.sh b/get_all_manifests.sh index a1f81ec925e..548adf9ac1e 100755 --- a/get_all_manifests.sh +++ b/get_all_manifests.sh @@ -5,7 +5,7 @@ GITHUB_URL="https://github.com/" # update to use different git repo for legacy manifests MANIFEST_ORG="red-hat-data-services" -# component: notebook, dsp, kserve, dashbaord, cf/ray, trustyai, modelmesh. +# component: notebook, dsp, kserve, dashboard, cf/ray, trustyai, modelmesh, modelregistry. # in the format of "repo-org:repo-name:branch-name:source-folder:target-folder".
declare -A COMPONENT_MANIFESTS=( ["codeflare"]="red-hat-data-services:codeflare-operator:rhoai-2.12:config:codeflare" @@ -21,6 +21,7 @@ declare -A COMPONENT_MANIFESTS=( ["kserve"]="red-hat-data-services:kserve:rhoai-2.12:config:kserve" ["odh-dashboard"]="red-hat-data-services:odh-dashboard:rhoai-2.12:manifests:dashboard" ["trainingoperator"]="red-hat-data-services:training-operator:rhoai-2.12:manifests:trainingoperator" + ["modelregistry"]="opendatahub-io:model-registry-operator:main:config:model-registry-operator" ) # Allow overwriting repo using flags component=repo diff --git a/pkg/upgrade/upgrade.go b/pkg/upgrade/upgrade.go index eba6c148123..9f073bdcdc6 100644 --- a/pkg/upgrade/upgrade.go +++ b/pkg/upgrade/upgrade.go @@ -34,6 +34,7 @@ import ( "github.com/opendatahub-io/opendatahub-operator/v2/components/kserve" "github.com/opendatahub-io/opendatahub-operator/v2/components/kueue" "github.com/opendatahub-io/opendatahub-operator/v2/components/modelmeshserving" + "github.com/opendatahub-io/opendatahub-operator/v2/components/modelregistry" "github.com/opendatahub-io/opendatahub-operator/v2/components/ray" "github.com/opendatahub-io/opendatahub-operator/v2/components/trainingoperator" "github.com/opendatahub-io/opendatahub-operator/v2/components/trustyai" @@ -95,6 +96,9 @@ func CreateDefaultDSC(ctx context.Context, cli client.Client) error { TrustyAI: trustyai.TrustyAI{ Component: components.Component{ManagementState: operatorv1.Removed}, }, + ModelRegistry: modelregistry.ModelRegistry{ + Component: components.Component{ManagementState: operatorv1.Removed}, + }, }, }, } diff --git a/tests/e2e/dsc_creation_test.go b/tests/e2e/dsc_creation_test.go index cd90a259538..3239a7d3be7 100644 --- a/tests/e2e/dsc_creation_test.go +++ b/tests/e2e/dsc_creation_test.go @@ -18,11 +18,13 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/util/retry" + "sigs.k8s.io/controller-runtime/pkg/client" dscv1 
"github.com/opendatahub-io/opendatahub-operator/v2/apis/datasciencecluster/v1" dsciv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/dscinitialization/v1" infrav1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/infrastructure/v1" "github.com/opendatahub-io/opendatahub-operator/v2/components" + "github.com/opendatahub-io/opendatahub-operator/v2/components/modelregistry" "github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster" "github.com/opendatahub-io/opendatahub-operator/v2/pkg/feature/serverless" ) @@ -75,6 +77,14 @@ func creationTestSuite(t *testing.T) { err = testCtx.testDefaultCertsAvailable() require.NoError(t, err, "error getting default cert secrets for Kserve") }) + t.Run("Validate default model registry cert available", func(t *testing.T) { + err = testCtx.testDefaultModelRegistryCertAvailable() + require.NoError(t, err, "error getting default cert secret for ModelRegistry") + }) + t.Run("Validate model registry servicemeshmember available", func(t *testing.T) { + err = testCtx.testMRServiceMeshMember() + require.NoError(t, err, "error getting servicemeshmember for Model Registry") + }) t.Run("Validate Controller reconcile", func(t *testing.T) { // only test Dashboard component for now err = testCtx.testUpdateComponentReconcile() @@ -417,6 +427,67 @@ func (tc *testContext) testDefaultCertsAvailable() error { return nil } +// testDefaultModelRegistryCertAvailable checks that the secret named modelregistry.DefaultModelRegistryCert in the service mesh control plane namespace has the same type, tls.crt and tls.key as the default ingress certificate secret in openshift-ingress; it returns nil without checking anything unless ModelRegistry is Managed. +func (tc *testContext) testDefaultModelRegistryCertAvailable() error { + // Nothing to verify unless ModelRegistry is Managed + if tc.testDsc.Spec.Components.ModelRegistry.ManagementState != operatorv1.Managed { + return nil + } + + // Look up the default ingress certificate secret that the MR cert secret is compared against + defaultIngressCtrl, err := cluster.FindAvailableIngressController(tc.ctx, tc.customClient) + if err != nil { + return fmt.Errorf("failed to get ingress controller: %w", err) + } + + defaultIngressCertName := cluster.GetDefaultIngressCertSecretName(defaultIngressCtrl) + + defaultIngressSecret, err := cluster.GetSecret(tc.ctx, tc.customClient, "openshift-ingress",
defaultIngressCertName) + if err != nil { + return err + } + + // Verify the MR cert secret in the Control Plane namespace matches the default ingress cert secret + defaultMRSecretName := modelregistry.DefaultModelRegistryCert + defaultMRSecret, err := cluster.GetSecret(tc.ctx, tc.customClient, tc.testDSCI.Spec.ServiceMesh.ControlPlane.Namespace, + defaultMRSecretName) + if err != nil { + return err + } + + if defaultMRSecret.Type != defaultIngressSecret.Type { + return fmt.Errorf("wrong type of MR cert secret is created for %v. Expected %v, Got %v", defaultMRSecretName, defaultIngressSecret.Type, defaultMRSecret.Type) + } + + if string(defaultIngressSecret.Data["tls.crt"]) != string(defaultMRSecret.Data["tls.crt"]) { + return fmt.Errorf("default MR cert secret not expected. Expected %v, Got %v", defaultIngressSecret.Data["tls.crt"], defaultMRSecret.Data["tls.crt"]) + } + + if string(defaultIngressSecret.Data["tls.key"]) != string(defaultMRSecret.Data["tls.key"]) { + // Report only the secret names: tls.key is private key material and must not be written to test logs + return fmt.Errorf("default MR cert secret not expected.
tls.key of %v does not match %v", defaultMRSecretName, defaultIngressCertName) + } + return nil +} + +// testMRServiceMeshMember checks that a maistra.io/v1 ServiceMeshMember named "default" exists in modelregistry.ModelRegistriesNamespace; it returns nil without checking anything unless ModelRegistry is Managed. +func (tc *testContext) testMRServiceMeshMember() error { + if tc.testDsc.Spec.Components.ModelRegistry.ManagementState != operatorv1.Managed { + return nil + } + + // Fetch the ServiceMeshMember as an unstructured object; only a successful Get is required + smm := unstructured.Unstructured{} + smm.SetAPIVersion("maistra.io/v1") + smm.SetKind("ServiceMeshMember") + err := tc.customClient.Get(tc.ctx, + client.ObjectKey{Namespace: modelregistry.ModelRegistriesNamespace, Name: "default"}, &smm) + if err != nil { + return fmt.Errorf("failed to get servicemesh member: %w", err) + } + return nil +} + func (tc *testContext) testUpdateComponentReconcile() error { // Test Updating Dashboard Replicas diff --git a/tests/e2e/helper_test.go b/tests/e2e/helper_test.go index a16866b83f2..c313fa33432 100644 --- a/tests/e2e/helper_test.go +++ b/tests/e2e/helper_test.go @@ -30,6 +30,7 @@ import ( "github.com/opendatahub-io/opendatahub-operator/v2/components/kserve" "github.com/opendatahub-io/opendatahub-operator/v2/components/kueue" "github.com/opendatahub-io/opendatahub-operator/v2/components/modelmeshserving" + "github.com/opendatahub-io/opendatahub-operator/v2/components/modelregistry" "github.com/opendatahub-io/opendatahub-operator/v2/components/ray" "github.com/opendatahub-io/opendatahub-operator/v2/components/trainingoperator" "github.com/opendatahub-io/opendatahub-operator/v2/components/trustyai" @@ -156,6 +157,11 @@ func setupDSCInstance(name string) *dscv1.DataScienceCluster { ManagementState: operatorv1.Removed, }, }, + ModelRegistry: modelregistry.ModelRegistry{ + Component: components.Component{ + ManagementState: operatorv1.Managed, + }, + }, }, }, }