From ec8d4fb482fcea42d9a46d2dd8787841d29284aa Mon Sep 17 00:00:00 2001 From: Laurentiu Bradin <109964136+z103cb@users.noreply.github.com> Date: Tue, 2 Apr 2024 13:41:33 +0300 Subject: [PATCH] CARRY: Add RHOAI manifests (#3) --- manifests/rhoai/kubeflow-training-roles.yaml | 73 +++++++++++++++++++ manifests/rhoai/kustomization.yaml | 45 ++++++++++++ manifests/rhoai/manager_config_patch.yaml | 12 +++ .../manager_delete_metrics_service_patch.yaml | 6 ++ manifests/rhoai/manager_metrics_patch.yaml | 12 +++ manifests/rhoai/monitor.yaml | 12 +++ manifests/rhoai/params.env | 1 + manifests/rhoai/params.yaml | 3 + 8 files changed, 164 insertions(+) create mode 100644 manifests/rhoai/kubeflow-training-roles.yaml create mode 100644 manifests/rhoai/kustomization.yaml create mode 100644 manifests/rhoai/manager_config_patch.yaml create mode 100644 manifests/rhoai/manager_delete_metrics_service_patch.yaml create mode 100644 manifests/rhoai/manager_metrics_patch.yaml create mode 100644 manifests/rhoai/monitor.yaml create mode 100644 manifests/rhoai/params.env create mode 100644 manifests/rhoai/params.yaml diff --git a/manifests/rhoai/kubeflow-training-roles.yaml b/manifests/rhoai/kubeflow-training-roles.yaml new file mode 100644 index 0000000000..579316ac6c --- /dev/null +++ b/manifests/rhoai/kubeflow-training-roles.yaml @@ -0,0 +1,73 @@ +#This file has been copied from ../overlays/kubeflow +#The original labels have been commented out for documentation purposes +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: training-edit + labels: +# rbac.authorization.kubeflow.org/aggregate-to-kubeflow-edit: "true" +# rbac.authorization.kubeflow.org/aggregate-to-kubeflow-training-admin: "true" + rbac.authorization.k8s.io/aggregate-to-edit: "true" + rbac.authorization.k8s.io/aggregate-to-admin: "true" +rules: + - apiGroups: + - kubeflow.org + resources: + - mpijobs + - tfjobs + - pytorchjobs + - mxjobs + - xgboostjobs + - paddlejobs + verbs: + - create + - delete + 
- get + - list + - patch + - update + - watch + - apiGroups: + - kubeflow.org + resources: + - mpijobs/status + - tfjobs/status + - pytorchjobs/status + - mxjobs/status + - xgboostjobs/status + - paddlejobs/status + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: training-view + labels: +# rbac.authorization.kubeflow.org/aggregate-to-kubeflow-view: "true" + rbac.authorization.k8s.io/aggregate-to-view: "true" +rules: + - apiGroups: + - kubeflow.org + resources: + - mpijobs + - tfjobs + - pytorchjobs + - mxjobs + - xgboostjobs + - paddlejobs + verbs: + - get + - list + - watch + - apiGroups: + - kubeflow.org + resources: + - mpijobs/status + - tfjobs/status + - pytorchjobs/status + - mxjobs/status + - xgboostjobs/status + - paddlejobs/status + verbs: + - get diff --git a/manifests/rhoai/kustomization.yaml b/manifests/rhoai/kustomization.yaml new file mode 100644 index 0000000000..702fe6e4e6 --- /dev/null +++ b/manifests/rhoai/kustomization.yaml @@ -0,0 +1,45 @@ +# RHOAI configuration for Kubeflow Training Operator (KFTO) + +# Adds namespace to all resources. +namespace: opendatahub + +# Value of this field is prepended to the +# names of all resources, e.g. a deployment named +# "wordpress" becomes "alices-wordpress". +# Note that it should also match with the prefix (text before '-') of the namespace +# field above. +namePrefix: kubeflow- + +configMapGenerator: +- name: rhoai-config + envs: + - params.env + +configurations: + - params.yaml + +vars: +- name: image + objref: + kind: ConfigMap + name: rhoai-config + apiVersion: v1 + fieldref: + fieldpath: data.odh-training-operator-controller-image + +# Labels to add to all resources and selectors. 
+commonLabels: + app.kubernetes.io/name: training-operator + app.kubernetes.io/component: controller + +resources: +- ../base +- kubeflow-training-roles.yaml +- monitor.yaml + +patches: +# Mount the controller config file for loading manager configurations +# through a ComponentConfig type +- path: manager_config_patch.yaml +- path: manager_metrics_patch.yaml +- path: manager_delete_metrics_service_patch.yaml diff --git a/manifests/rhoai/manager_config_patch.yaml b/manifests/rhoai/manager_config_patch.yaml new file mode 100644 index 0000000000..3a7ec25e18 --- /dev/null +++ b/manifests/rhoai/manager_config_patch.yaml @@ -0,0 +1,12 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: training-operator +spec: + template: + spec: + containers: + - name: training-operator + image: $(image) + args: + - "--zap-log-level=2" diff --git a/manifests/rhoai/manager_delete_metrics_service_patch.yaml b/manifests/rhoai/manager_delete_metrics_service_patch.yaml new file mode 100644 index 0000000000..e373d0c38c --- /dev/null +++ b/manifests/rhoai/manager_delete_metrics_service_patch.yaml @@ -0,0 +1,6 @@ +# Delete the service created in base +$patch: delete +apiVersion: v1 +kind: Service +metadata: + name: training-operator diff --git a/manifests/rhoai/manager_metrics_patch.yaml b/manifests/rhoai/manager_metrics_patch.yaml new file mode 100644 index 0000000000..2d53ca8ade --- /dev/null +++ b/manifests/rhoai/manager_metrics_patch.yaml @@ -0,0 +1,12 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: training-operator +spec: + template: + spec: + containers: + - name: training-operator + ports: + - containerPort: 8080 + name: metrics diff --git a/manifests/rhoai/monitor.yaml b/manifests/rhoai/monitor.yaml new file mode 100644 index 0000000000..ca8fac49d1 --- /dev/null +++ b/manifests/rhoai/monitor.yaml @@ -0,0 +1,12 @@ +# Prometheus Pod Monitor (Metrics) +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: training-operator-metrics-monitor +spec: 
+ selector: + matchLabels: + app.kubernetes.io/name: training-operator + app.kubernetes.io/component: controller + podMetricsEndpoints: + - port: metrics diff --git a/manifests/rhoai/params.env b/manifests/rhoai/params.env new file mode 100644 index 0000000000..04a2aa7a83 --- /dev/null +++ b/manifests/rhoai/params.env @@ -0,0 +1 @@ +odh-training-operator-controller-image=docker.io/kubeflow/training-operator:v1-855e096 diff --git a/manifests/rhoai/params.yaml b/manifests/rhoai/params.yaml new file mode 100644 index 0000000000..43509ff293 --- /dev/null +++ b/manifests/rhoai/params.yaml @@ -0,0 +1,3 @@ +varReference: + - path: spec/template/spec/containers[]/image + kind: Deployment