diff --git a/bootstrap/ic-shared-llm/deployment-hftgi.yaml b/bootstrap/ic-shared-llm/deployment-hftgi.yaml index 32d9d93f..845af92a 100644 --- a/bootstrap/ic-shared-llm/deployment-hftgi.yaml +++ b/bootstrap/ic-shared-llm/deployment-hftgi.yaml @@ -5,6 +5,8 @@ metadata: namespace: ic-shared-llm labels: app: llm-flant5 + annotations: + argocd.argoproj.io/sync-wave: "2" spec: replicas: 1 selector: diff --git a/bootstrap/ic-shared-llm/deployment.yaml b/bootstrap/ic-shared-llm/deployment.yaml deleted file mode 100644 index 6ba8c63c..00000000 --- a/bootstrap/ic-shared-llm/deployment.yaml +++ /dev/null @@ -1,111 +0,0 @@ -kind: Deployment -apiVersion: apps/v1 -metadata: - name: llm - namespace: ic-shared-llm - labels: - app: llm -spec: - replicas: 1 - selector: - matchLabels: - app: llm - template: - metadata: - creationTimestamp: null - labels: - app: llm - spec: - restartPolicy: Always - schedulerName: default-scheduler - affinity: {} - terminationGracePeriodSeconds: 120 - securityContext: {} - containers: - - resources: - limits: - cpu: '8' - memory: 24Gi - nvidia.com/gpu: '1' - requests: - cpu: '6' - readinessProbe: - httpGet: - path: /health - port: http - scheme: HTTP - timeoutSeconds: 5 - periodSeconds: 30 - successThreshold: 1 - failureThreshold: 3 - terminationMessagePath: /dev/termination-log - name: server - livenessProbe: - httpGet: - path: /health - port: http - scheme: HTTP - timeoutSeconds: 8 - periodSeconds: 100 - successThreshold: 1 - failureThreshold: 3 - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hftoken - key: token - args: [ - "--model", - "mistralai/Mistral-7B-Instruct-v0.2", - "--download-dir", - "/models-cache", - "--dtype", "float16", - "--max-model-len", "6144" ] - securityContext: - capabilities: - drop: - - ALL - runAsNonRoot: true - allowPrivilegeEscalation: false - seccompProfile: - type: RuntimeDefault - ports: - - name: http - containerPort: 8000 - protocol: TCP - imagePullPolicy: IfNotPresent - startupProbe: - 
httpGet: - path: /health - port: http - scheme: HTTP - timeoutSeconds: 1 - periodSeconds: 30 - successThreshold: 1 - failureThreshold: 24 - initialDelaySeconds: 60 - volumeMounts: - - name: models-cache - mountPath: /models-cache - - name: shm - mountPath: /dev/shm - terminationMessagePolicy: File - image: 'quay.io/rh-aiservices-bu/vllm-openai-ubi9:0.4.2' - volumes: - - name: models-cache - persistentVolumeClaim: - claimName: models-cache - - name: shm - emptyDir: - medium: Memory - sizeLimit: 1Gi - dnsPolicy: ClusterFirst - tolerations: - - key: nvidia.com/gpu - operator: Exists - effect: NoSchedule - strategy: - type: Recreate - revisionHistoryLimit: 10 - progressDeadlineSeconds: 600 \ No newline at end of file diff --git a/bootstrap/ic-shared-llm/fix-odf-config.yaml b/bootstrap/ic-shared-llm/fix-odf-config.yaml index c2e6284a..2dfd4175 100644 --- a/bootstrap/ic-shared-llm/fix-odf-config.yaml +++ b/bootstrap/ic-shared-llm/fix-odf-config.yaml @@ -3,6 +3,8 @@ apiVersion: v1 metadata: name: rook-ceph-operator-config namespace: openshift-storage + annotations: + argocd.argoproj.io/sync-wave: "0" data: CSI_PLUGIN_TOLERATIONS: | - key: nvidia.com/gpu diff --git a/bootstrap/ic-shared-llm/inference-service-granite-modelcar.yaml b/bootstrap/ic-shared-llm/inference-service-granite-modelcar.yaml new file mode 100644 index 00000000..b4817bc0 --- /dev/null +++ b/bootstrap/ic-shared-llm/inference-service-granite-modelcar.yaml @@ -0,0 +1,38 @@ +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + annotations: + openshift.io/display-name: granite-7b-instruct + serving.knative.openshift.io/enablePassthrough: 'true' + sidecar.istio.io/inject: 'true' + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + argocd.argoproj.io/sync-wave: "2" + argocd.argoproj.io/compare-options: IgnoreExtraneous + argocd.argoproj.io/sync-options: Prune=false + name: granite-7b-instruct + namespace: ic-shared-llm + labels: + opendatahub.io/dashboard: 'true' +spec: + predictor: + 
maxReplicas: 1 + minReplicas: 1 + model: + modelFormat: + name: vLLM + name: '' + resources: + limits: + cpu: '6' + memory: 24Gi + nvidia.com/gpu: '1' + requests: + cpu: '1' + memory: 8Gi + nvidia.com/gpu: '1' + runtime: vllm + storageUri: oci://quay.io/rh-aiservices-bu/granite-7b-instruct-modelcar:0.1 + tolerations: + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists \ No newline at end of file diff --git a/bootstrap/ic-shared-llm/job-enable-modelcar.yaml b/bootstrap/ic-shared-llm/job-enable-modelcar.yaml new file mode 100644 index 00000000..7a90611c --- /dev/null +++ b/bootstrap/ic-shared-llm/job-enable-modelcar.yaml @@ -0,0 +1,49 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: patch-inferenceservice-config + namespace: ic-shared-llm +  annotations: +    argocd.argoproj.io/sync-wave: "1" +    argocd.argoproj.io/hook: Sync +    argocd.argoproj.io/hook-delete-policy: HookSucceeded +spec: + backoffLimit: 4 + template: + spec: + serviceAccount: modelcar-enable-sa + serviceAccountName: modelcar-enable-sa + containers: + - name: patch-configmap + image: registry.redhat.io/openshift4/ose-cli:v4.15.0 + command: ["/bin/sh", "-c"] + args: + - | + # Wait for the operator to be in "Ready" state + echo "Waiting for the operator to be Ready..." + until [ "$(oc get dsci -n redhat-ods-applications default-dsci -o jsonpath='{.status.phase}')" = "Ready" ]; do + echo "Operator not ready, retrying in 10s..." + sleep 10 + done + echo "Operator is Ready!" + + # Fetch current storageInitializer config + config=$(oc get configmap inferenceservice-config -n redhat-ods-applications -o jsonpath='{.data.storageInitializer}') + + # Check if "enableModelcar" is already enabled + if echo "$config" | grep '"enableModelcar": false'; then + echo "Patching configmap to enable modelcar..." 
+ + # Modify the config to enable modelcar using sed + newValue=$(echo "$config" | sed 's/"enableModelcar": false/"enableModelcar": true/') + newValueEscaped=$(echo "$newValue" | sed 's/\"/\\\"/g') + + # Patch the configmap with the new value + oc patch configmap inferenceservice-config -n redhat-ods-applications --type='json' -p "[{\"op\": \"replace\", \"path\": \"/data/storageInitializer\", \"value\": \"$newValueEscaped\"}]" + else + echo "Modelcar is already enabled, no patching needed." + fi + + # Restart the KServe controller to apply changes + oc delete pod -n redhat-ods-applications -l control-plane=kserve-controller-manager + restartPolicy: OnFailure diff --git a/bootstrap/ic-shared-llm/kustomization.yaml b/bootstrap/ic-shared-llm/kustomization.yaml index 46daabff..02d3b969 100644 --- a/bootstrap/ic-shared-llm/kustomization.yaml +++ b/bootstrap/ic-shared-llm/kustomization.yaml @@ -9,12 +9,12 @@ resources: # wave 0 - namespace.yaml - fix-odf-config.yaml -- token.yaml +- rbac-job-enable-modelcar.yaml # wave 1 -- pvc.yaml +- job-enable-modelcar.yaml +# wave 2 +- service-runtime-vllm-granite-modelcar.yaml - pvc-hftgi.yaml -- deployment.yaml -- service.yaml +- inference-service-granite-modelcar.yaml - deployment-hftgi.yaml -- service-hftgi.yaml -# wave 2 +- service-hftgi.yaml \ No newline at end of file diff --git a/bootstrap/ic-shared-llm/pvc-hftgi.yaml b/bootstrap/ic-shared-llm/pvc-hftgi.yaml index 0a1950bb..d503bdc3 100644 --- a/bootstrap/ic-shared-llm/pvc-hftgi.yaml +++ b/bootstrap/ic-shared-llm/pvc-hftgi.yaml @@ -7,7 +7,7 @@ metadata: labels: app: ic-shared-llm annotations: - argocd.argoproj.io/sync-wave: "0" + argocd.argoproj.io/sync-wave: "2" spec: accessModes: - ReadWriteMany diff --git a/bootstrap/ic-shared-llm/pvc.yaml b/bootstrap/ic-shared-llm/pvc.yaml deleted file mode 100644 index be8f7a86..00000000 --- a/bootstrap/ic-shared-llm/pvc.yaml +++ /dev/null @@ -1,18 +0,0 @@ ---- -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: models-cache 
- namespace: ic-shared-llm - labels: - app: ic-shared-llm - annotations: - argocd.argoproj.io/sync-wave: "0" -spec: - accessModes: - - ReadWriteMany - resources: - requests: - storage: 50Gi - storageClassName: ocs-storagecluster-cephfs - volumeMode: Filesystem \ No newline at end of file diff --git a/bootstrap/ic-shared-llm/rbac-job-enable-modelcar.yaml b/bootstrap/ic-shared-llm/rbac-job-enable-modelcar.yaml new file mode 100644 index 00000000..33c1e457 --- /dev/null +++ b/bootstrap/ic-shared-llm/rbac-job-enable-modelcar.yaml @@ -0,0 +1,68 @@ +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: modelcar-enable-sa + namespace: ic-shared-llm + annotations: + argocd.argoproj.io/sync-wave: "0" +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: modelcar-enable-patch-role + namespace: redhat-ods-applications + annotations: + argocd.argoproj.io/sync-wave: "0" +rules: +- apiGroups: ["redhat.com"] + resources: ["dsci"] + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: ["configmaps"] + verbs: ["get", "patch"] +- apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: modelcar-enable-patch-rolebinding + namespace: redhat-ods-applications + annotations: + argocd.argoproj.io/sync-wave: "0" +subjects: +- kind: ServiceAccount + name: modelcar-enable-sa + namespace: ic-shared-llm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: modelcar-enable-patch-role # Fixed to bind the correct Role +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: modelcar-dsc-read + annotations: + argocd.argoproj.io/sync-wave: "0" +rules: +- apiGroups: ["dscinitialization.opendatahub.io"] + resources: ["dscinitializations"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: modelcar-dsc-read-binding + annotations: + 
argocd.argoproj.io/sync-wave: "0" +subjects: +- kind: ServiceAccount + name: modelcar-enable-sa + namespace: ic-shared-llm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: modelcar-dsc-read diff --git a/bootstrap/ic-shared-llm/service-hftgi.yaml b/bootstrap/ic-shared-llm/service-hftgi.yaml index 08b3b8ba..01700f92 100644 --- a/bootstrap/ic-shared-llm/service-hftgi.yaml +++ b/bootstrap/ic-shared-llm/service-hftgi.yaml @@ -1,10 +1,13 @@ kind: Service apiVersion: v1 + metadata: name: llm-flant5 namespace: ic-shared-llm labels: app: llm-flant5 + annotations: + argocd.argoproj.io/sync-wave: "2" spec: clusterIP: None ipFamilies: diff --git a/bootstrap/ic-shared-llm/service-runtime-vllm-granite-modelcar.yaml b/bootstrap/ic-shared-llm/service-runtime-vllm-granite-modelcar.yaml new file mode 100644 index 00000000..26c01bf1 --- /dev/null +++ b/bootstrap/ic-shared-llm/service-runtime-vllm-granite-modelcar.yaml @@ -0,0 +1,50 @@ +--- +apiVersion: serving.kserve.io/v1alpha1 +kind: ServingRuntime +metadata: + annotations: + opendatahub.io/accelerator-name: migrated-gpu + opendatahub.io/apiProtocol: REST + opendatahub.io/recommended-accelerators: '["nvidia.com/gpu"]' + opendatahub.io/template-display-name: vLLM ServingRuntime for KServe + opendatahub.io/template-name: vllm-runtime + openshift.io/display-name: vllm + argocd.argoproj.io/sync-wave: "2" + name: vllm + namespace: ic-shared-llm + labels: + opendatahub.io/dashboard: 'true' +spec: + annotations: + prometheus.io/path: /metrics + prometheus.io/port: '8080' + containers: + - args: + - '--port=8080' + - '--model=/mnt/models' + - '--served-model-name={{.Name}}' + - '--distributed-executor-backend=mp' + command: + - python + - '-m' + - vllm.entrypoints.openai.api_server + env: + - name: HF_HOME + value: /tmp/hf_home + image: 'quay.io/modh/vllm@sha256:b51fde66f162f1a78e8c027320dddf214732d5345953b1599a84fe0f0168c619' + name: kserve-container + ports: + - containerPort: 8080 + protocol: TCP + 
volumeMounts: + - mountPath: /dev/shm + name: shm + multiModel: false + supportedModelFormats: + - autoSelect: true + name: vLLM + volumes: + - emptyDir: + medium: Memory + sizeLimit: 2Gi + name: shm \ No newline at end of file diff --git a/bootstrap/ic-shared-llm/service.yaml b/bootstrap/ic-shared-llm/service.yaml deleted file mode 100644 index 10a107a9..00000000 --- a/bootstrap/ic-shared-llm/service.yaml +++ /dev/null @@ -1,21 +0,0 @@ -kind: Service -apiVersion: v1 -metadata: - name: llm - namespace: ic-shared-llm - labels: - app: llm -spec: - clusterIP: None - ipFamilies: - - IPv4 - ports: - - name: http - protocol: TCP - port: 8000 - targetPort: http - type: ClusterIP - ipFamilyPolicy: SingleStack - sessionAffinity: None - selector: - app: llm \ No newline at end of file diff --git a/bootstrap/ic-shared-llm/token.yaml b/bootstrap/ic-shared-llm/token.yaml deleted file mode 100644 index bc705ffb..00000000 --- a/bootstrap/ic-shared-llm/token.yaml +++ /dev/null @@ -1,8 +0,0 @@ -kind: Secret -apiVersion: v1 -metadata: - name: hftoken - namespace: ic-shared-llm -data: - token: aGZfUkhKeElqSElXcGJXb3NKVlJsa2VLQ2VVcmlxZ2JsS0VDRgo= -type: Opaque \ No newline at end of file