Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CI] Package AI Runtime #118

Merged
merged 13 commits into from
Sep 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Image URL to use all building/pushing image targets
AIBRIX_REPO ?= aibrix
IMG ?= controller:latest
PLUGINS_IMG ?= aibrix/plugins:v0.1.0
RUNTIME_IMG ?= ${AIBRIX_REPO}/runtime:latest
# ENVTEST_K8S_VERSION refers to the version of kubebuilder assets to be downloaded by envtest binary.
ENVTEST_K8S_VERSION = 1.29.0

Expand Down Expand Up @@ -114,6 +116,10 @@ docker-build: ## Build docker image with the manager.
docker-build-plugins: ## Build docker image with the gateway plugins.
	$(CONTAINER_TOOL) build -t ${PLUGINS_IMG} -f gateway.Dockerfile .

.PHONY: docker-build-runtime
docker-build-runtime: ## Build docker image with the AI Runtime.
	$(CONTAINER_TOOL) build -t ${RUNTIME_IMG} -f runtime.Dockerfile .

.PHONY: docker-push
docker-push: ## Push docker image with the manager.
$(CONTAINER_TOOL) push ${IMG}
Expand Down Expand Up @@ -141,6 +147,10 @@ build-installer: manifests generate kustomize ## Generate a consolidated YAML wi
cd config/manager && $(KUSTOMIZE) edit set image controller=${IMG}
$(KUSTOMIZE) build config/default > dist/install.yaml

.PHONY: docker-buildx-runtime
# NOTE: RUNTIME_IMG already embeds the ${AIBRIX_REPO} prefix (see
# RUNTIME_IMG ?= ${AIBRIX_REPO}/runtime:latest), so tag with ${RUNTIME_IMG}
# directly. Using ${AIBRIX_REPO}/${RUNTIME_IMG} here would double the repo
# path and push to "aibrix/aibrix/runtime:latest".
docker-buildx-runtime: ## Build and push multi-platform docker image for the AI Runtime.
	$(CONTAINER_TOOL) buildx build --push --platform=${PLATFORMS} -f runtime.Dockerfile . -t ${RUNTIME_IMG}

##@ Deployment

ifndef ignore-not-found
Expand Down
21 changes: 21 additions & 0 deletions docs/tutorial/runtime/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# AIBrix Runtime Demo

## Model Download
The AIBrix runtime supports downloading models from different sources.

- Download model from HuggingFace
```shell
kubectl apply -f runtime-hf-download.yaml
```

- Download model from AWS S3
```shell
kubectl apply -f runtime-s3-download.yaml
```

- Download model from TOS
```shell
kubectl apply -f runtime-tos-download.yaml
```

## Metrics Merge
146 changes: 146 additions & 0 deletions docs/tutorial/runtime/runtime-hf-download.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
# Deployment: vLLM OpenAI-compatible server for deepseek-coder-6.7b-instruct,
# with the AIBrix runtime as a sidecar and an init container that downloads
# the model from HuggingFace into the shared /models volume.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    models.aibricks.ai: deepseek-coder-6.7b-instruct
    models.aibricks.com/model-name: deepseek-coder-6.7b-instruct
  name: aibricks-model-deepseek-coder-6.7b-instruct
  namespace: default
spec:
  replicas: 1
  selector:
    matchLabels:
      models.aibricks.ai: deepseek-coder-6.7b-instruct
  strategy:
    rollingUpdate:
      maxSurge: 25%
      maxUnavailable: 25%
    type: RollingUpdate
  template:
    metadata:
      labels:
        models.aibricks.ai: deepseek-coder-6.7b-instruct
    spec:
      containers:
        # Main inference server: vLLM serving an OpenAI-compatible API on :8000.
        - command:
            - python3
            - -m
            - vllm.entrypoints.openai.api_server
            - --host
            - "0.0.0.0"
            - --port
            - "8000"
            - --model
            - /models/deepseek-ai/deepseek-coder-6.7b-instruct
            - --served-model-name
            - deepseek-ai/deepseek-coder-6.7b-instruct
            - --distributed-executor-backend
            - ray
            - --trust-remote-code
          image: vllm/vllm-openai:v0.5.5
          imagePullPolicy: Always
          livenessProbe:
            failureThreshold: 3
            httpGet:
              path: /health
              port: 8000
              scheme: HTTP
            initialDelaySeconds: 90
            periodSeconds: 5
            successThreshold: 1
            timeoutSeconds: 1
          name: vllm-openai
          ports:
            - containerPort: 8000
              protocol: TCP
          readinessProbe:
            failureThreshold: 3
            httpGet:
              path: /health
              port: 8000
              scheme: HTTP
            initialDelaySeconds: 90
            periodSeconds: 5
            successThreshold: 1
            timeoutSeconds: 1
          resources:
            limits:
              nvidia.com/gpu: "1"
            requests:
              nvidia.com/gpu: "1"
          # We need to use dataset cache
          volumeMounts:
            - mountPath: /models
              name: model-hostpath
            - name: dshm
              mountPath: /dev/shm
        # AIBrix runtime sidecar; exposes its HTTP endpoint on :8080.
        - name: aibrix-runtime
          image: aibrix/runtime:latest
          command:
            - bash
            - entrypoint.sh
          ports:
            - containerPort: 8080
              protocol: TCP
          volumeMounts:
            - mountPath: /models
              name: model-hostpath
      initContainers:
        # Downloads the model from HuggingFace into /models before the
        # server container starts.
        - name: model-init
          image: aibrix/runtime:latest
          command:
            - python
            - -m
            - aibrix.downloader
            - --model-uri
            - deepseek-ai/deepseek-coder-6.7b-instruct
            - --local-dir
            - /models/
          env:
            - name: DOWNLOADER_MODEL_NAME
              value: deepseek-ai/deepseek-coder-6.7b-instruct
            - name: DOWNLOADER_ALLOW_FILE_SUFFIX
              value: json, safetensors
            - name: HF_TOKEN
              value: <input your hf token, if needed>
            - name: HF_ENDPOINT
              value: <input your hf endpoint, if needed>
            - name: HF_REVISION
              value: <input your model revision, if needed>
          volumeMounts:
            - mountPath: /models
              name: model-hostpath
      volumes:
        - emptyDir: {}
          name: model-hostpath
        # In-memory shared volume mounted at /dev/shm for the server container.
        - name: dshm
          emptyDir:
            medium: Memory
            sizeLimit: "10Gi"

---

# Service exposing the vLLM API (:8000) and the runtime endpoint (:8080),
# annotated for Prometheus scraping of port 8080.
apiVersion: v1
kind: Service
metadata:
  labels:
    models.aibricks.ai: deepseek-coder-6.7b-instruct
    prometheus-discovery: "true"
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/port: "8080"
  name: aibricks-model-deepseek-coder-6-7b-instruct-svc
  namespace: default
spec:
  ports:
    - name: serve
      port: 8000
      protocol: TCP
      targetPort: 8000
    - name: http
      port: 8080
      protocol: TCP
      targetPort: 8080
  selector:
    models.aibricks.ai: deepseek-coder-6.7b-instruct
  type: ClusterIP
148 changes: 148 additions & 0 deletions docs/tutorial/runtime/runtime-s3-download.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
# Deployment: vLLM OpenAI-compatible server for deepseek-coder-6.7b-instruct,
# with the AIBrix runtime as a sidecar and an init container that downloads
# the model from AWS S3 into the shared /models volume.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    models.aibricks.ai: deepseek-coder-6.7b-instruct
    models.aibricks.com/model-name: deepseek-coder-6.7b-instruct
  name: aibricks-model-deepseek-coder-6.7b-instruct
  namespace: default
spec:
  replicas: 1
  selector:
    matchLabels:
      models.aibricks.ai: deepseek-coder-6.7b-instruct
  strategy:
    rollingUpdate:
      maxSurge: 25%
      maxUnavailable: 25%
    type: RollingUpdate
  template:
    metadata:
      labels:
        models.aibricks.ai: deepseek-coder-6.7b-instruct
    spec:
      containers:
        # Main inference server: vLLM serving an OpenAI-compatible API on :8000.
        - command:
            - python3
            - -m
            - vllm.entrypoints.openai.api_server
            - --host
            - "0.0.0.0"
            - --port
            - "8000"
            - --model
            - /models/deepseek-ai/deepseek-coder-6.7b-instruct
            - --served-model-name
            - deepseek-ai/deepseek-coder-6.7b-instruct
            - --distributed-executor-backend
            - ray
            - --trust-remote-code
          image: vllm/vllm-openai:v0.5.5
          imagePullPolicy: Always
          livenessProbe:
            failureThreshold: 3
            httpGet:
              path: /health
              port: 8000
              scheme: HTTP
            initialDelaySeconds: 90
            periodSeconds: 5
            successThreshold: 1
            timeoutSeconds: 1
          name: vllm-openai
          ports:
            - containerPort: 8000
              protocol: TCP
          readinessProbe:
            failureThreshold: 3
            httpGet:
              path: /health
              port: 8000
              scheme: HTTP
            initialDelaySeconds: 90
            periodSeconds: 5
            successThreshold: 1
            timeoutSeconds: 1
          resources:
            limits:
              nvidia.com/gpu: "1"
            requests:
              nvidia.com/gpu: "1"
          # We need to use dataset cache
          volumeMounts:
            - mountPath: /models
              name: model-hostpath
            - name: dshm
              mountPath: /dev/shm
        # AIBrix runtime sidecar; exposes its HTTP endpoint on :8080.
        - name: aibrix-runtime
          image: aibrix/runtime:latest
          command:
            - bash
            - entrypoint.sh
          ports:
            - containerPort: 8080
              protocol: TCP
          volumeMounts:
            - mountPath: /models
              name: model-hostpath
      initContainers:
        # Downloads the model from S3 into /models before the server starts.
        # NOTE(review): AWS credentials are plaintext env values here; prefer
        # referencing a Kubernetes Secret via valueFrom.secretKeyRef.
        - name: model-init
          image: aibrix/runtime:latest
          command:
            - python
            - -m
            - aibrix.downloader
            - --model-uri
            - s3://<input your s3 bucket name>/<input your s3 bucket path>
            - --local-dir
            - /models/
          env:
            - name: DOWNLOADER_MODEL_NAME
              value: deepseek-ai/deepseek-coder-6.7b-instruct
            - name: DOWNLOADER_ALLOW_FILE_SUFFIX
              value: json, safetensors
            - name: AWS_ACCESS_KEY_ID
              value: <input your s3 access key>
            - name: AWS_SECRET_ACCESS_KEY
              value: <input your s3 secret key>
            - name: AWS_ENDPOINT_URL
              value: <input your s3 endpoint>
            - name: AWS_REGION
              value: <input your s3 region>
          volumeMounts:
            - mountPath: /models
              name: model-hostpath
      volumes:
        - emptyDir: {}
          name: model-hostpath
        # In-memory shared volume mounted at /dev/shm for the server container.
        - name: dshm
          emptyDir:
            medium: Memory
            sizeLimit: "10Gi"

---

# Service exposing the vLLM API (:8000) and the runtime endpoint (:8080),
# annotated for Prometheus scraping of port 8080.
apiVersion: v1
kind: Service
metadata:
  labels:
    models.aibricks.ai: deepseek-coder-6.7b-instruct
    prometheus-discovery: "true"
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/port: "8080"
  name: aibricks-model-deepseek-coder-6-7b-instruct-svc
  namespace: default
spec:
  ports:
    - name: serve
      port: 8000
      protocol: TCP
      targetPort: 8000
    - name: http
      port: 8080
      protocol: TCP
      targetPort: 8080
  selector:
    models.aibricks.ai: deepseek-coder-6.7b-instruct
  type: ClusterIP
Loading
Loading