Skip to content

Commit

Permalink
Add health check api interface for runtime (#451)
Browse files Browse the repository at this point in the history
* Add health check api interface for runtime

1. Add /healthz and /ready in runtime app server
2. Update existing test deployments to include the health check and necessary env

* Fix the format issue

* Remove model mount in the mocked app deployment

* Use nightly tag to test current image

* Update ci workflow to load nightly images
  • Loading branch information
Jeffwan authored Dec 2, 2024
1 parent f1659ab commit 808d71f
Show file tree
Hide file tree
Showing 8 changed files with 144 additions and 0 deletions.
7 changes: 7 additions & 0 deletions .github/workflows/installation-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,14 @@ jobs:
run: |
for image in controller-manager plugins runtime users; do
docker load < ${image}-image/${image}.tar
# Retag the image
# This is for application integration, since it is not easy to override the commit-SHA tag in every manifest
# It is better to use nightly to represent the latest image
docker tag aibrix/${image}:${{ github.sha }} aibrix/${image}:nightly
kind load docker-image aibrix/${image}:${{ github.sha }} --name installation-test
kind load docker-image aibrix/${image}:nightly --name installation-test
done
- name: Deploy controller with the built image
Expand Down
17 changes: 17 additions & 0 deletions benchmarks/autoscaling/7b.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,12 +83,29 @@ spec:
- aibrix_runtime
- --port
- "8080"
env:
- name: INFERENCE_ENGINE
value: vllm
- name: INFERENCE_ENGINE_ENDPOINT
value: http://localhost:8000
ports:
- containerPort: 8080
protocol: TCP
volumeMounts:
- mountPath: /models
name: model-hostpath
livenessProbe:
httpGet:
path: /healthz
port: 8080
initialDelaySeconds: 3
periodSeconds: 2
readinessProbe:
httpGet:
path: /ready
port: 8080
initialDelaySeconds: 5
periodSeconds: 10
initContainers:
- name: init-model
image: aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/runtime:v0.1.1
Expand Down
17 changes: 17 additions & 0 deletions benchmarks/gateway/7b.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,12 +83,29 @@ spec:
- aibrix_runtime
- --port
- "8080"
env:
- name: INFERENCE_ENGINE
value: vllm
- name: INFERENCE_ENGINE_ENDPOINT
value: http://localhost:8000
ports:
- containerPort: 8080
protocol: TCP
volumeMounts:
- mountPath: /models
name: model-hostpath
livenessProbe:
httpGet:
path: /healthz
port: 8080
initialDelaySeconds: 3
periodSeconds: 2
readinessProbe:
httpGet:
path: /ready
port: 8080
initialDelaySeconds: 5
periodSeconds: 10
initContainers:
- name: init-model
image: aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/runtime:v0.1.1
Expand Down
26 changes: 26 additions & 0 deletions docs/development/app/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,32 @@ spec:
valueFrom:
fieldRef:
fieldPath: status.podIP
- name: aibrix-runtime
image: aibrix/runtime:nightly
command:
- aibrix_runtime
- --port
- "8080"
env:
- name: INFERENCE_ENGINE
value: vllm
- name: INFERENCE_ENGINE_ENDPOINT
value: http://localhost:8000
ports:
- containerPort: 8080
protocol: TCP
livenessProbe:
httpGet:
path: /healthz
port: 8080
initialDelaySeconds: 3
periodSeconds: 2
readinessProbe:
httpGet:
path: /ready
port: 8080
initialDelaySeconds: 5
periodSeconds: 10
---
# Debug only: Make sure pod can be visited from controller that deployed in mac.
apiVersion: v1
Expand Down
17 changes: 17 additions & 0 deletions docs/tutorial/runtime/runtime-hf-download.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -81,12 +81,29 @@ spec:
- aibrix_runtime
- --port
- "8080"
env:
- name: INFERENCE_ENGINE
value: vllm
- name: INFERENCE_ENGINE_ENDPOINT
value: http://localhost:8000
ports:
- containerPort: 8080
protocol: TCP
volumeMounts:
- mountPath: /models
name: model-hostpath
livenessProbe:
httpGet:
path: /healthz
port: 8080
initialDelaySeconds: 3
periodSeconds: 2
readinessProbe:
httpGet:
path: /ready
port: 8080
initialDelaySeconds: 5
periodSeconds: 10
initContainers:
- name: model-init
image: aibrix/runtime:latest
Expand Down
17 changes: 17 additions & 0 deletions docs/tutorial/runtime/runtime-s3-download.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -81,12 +81,29 @@ spec:
- aibrix_runtime
- --port
- "8080"
env:
- name: INFERENCE_ENGINE
value: vllm
- name: INFERENCE_ENGINE_ENDPOINT
value: http://localhost:8000
ports:
- containerPort: 8080
protocol: TCP
volumeMounts:
- mountPath: /models
name: model-hostpath
livenessProbe:
httpGet:
path: /healthz
port: 8080
initialDelaySeconds: 3
periodSeconds: 2
readinessProbe:
httpGet:
path: /ready
port: 8080
initialDelaySeconds: 5
periodSeconds: 10
initContainers:
- name: model-init
image: aibrix/runtime:latest
Expand Down
17 changes: 17 additions & 0 deletions docs/tutorial/runtime/runtime-tos-download.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -81,12 +81,29 @@ spec:
- aibrix_runtime
- --port
- "8080"
env:
- name: INFERENCE_ENGINE
value: vllm
- name: INFERENCE_ENGINE_ENDPOINT
value: http://localhost:8000
ports:
- containerPort: 8080
protocol: TCP
volumeMounts:
- mountPath: /models
name: model-hostpath
livenessProbe:
httpGet:
path: /healthz
port: 8080
initialDelaySeconds: 3
periodSeconds: 2
readinessProbe:
httpGet:
path: /ready
port: 8080
initialDelaySeconds: 5
periodSeconds: 10
initContainers:
- name: model-init
image: aibrix/runtime:latest
Expand Down
26 changes: 26 additions & 0 deletions python/aibrix/aibrix/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,18 @@ def init_app_state(state: State) -> None:
)


def inference_engine_ready() -> bool:
    """Return True when the inference engine configuration is present.

    The main (engine) container runs its own health checks, so this does not
    probe the engine itself; it only verifies that both engine environment
    settings are non-empty, i.e. the runtime knows which engine to talk to.

    Returns:
        bool: True if both INFERENCE_ENGINE and INFERENCE_ENGINE_ENDPOINT
        are set to truthy values, False otherwise (including on any error).
    """
    try:
        # bool(...) of the short-circuit 'and' replaces the redundant
        # 'True if ... else False' ternary with the idiomatic form.
        return bool(envs.INFERENCE_ENGINE and envs.INFERENCE_ENGINE_ENDPOINT)
    except Exception as e:
        # Best-effort: a readiness probe must never raise; report not-ready.
        logger.error(f"Readiness check failed: {e}")
        return False


@router.post("/v1/lora_adapter/load")
async def load_lora_adapter(request: LoadLoraAdapterRequest, raw_request: Request):
response = await inference_engine(raw_request).load_lora_adapter(request)
Expand All @@ -108,6 +120,20 @@ async def unload_lora_adapter(request: UnloadLoraAdapterRequest, raw_request: Re
return Response(status_code=200, content=response)


@router.get("/healthz")
async def liveness_check():
    """Liveness probe endpoint.

    Always reports the process as alive with HTTP 200; no engine state is
    consulted here (readiness is checked separately via /ready).
    """
    payload = {"status": "ok"}
    return JSONResponse(content=payload, status_code=200)


@router.get("/ready")
async def readiness_check():
    """Readiness probe endpoint.

    Returns HTTP 200 when the inference engine configuration is present
    (per inference_engine_ready), otherwise HTTP 503 so Kubernetes keeps
    the pod out of service endpoints.
    """
    # Guard clause: bail out early when the engine is not ready.
    if not inference_engine_ready():
        return JSONResponse(content={"status": "not ready"}, status_code=503)
    return JSONResponse(content={"status": "ready"}, status_code=200)


def build_app(args: argparse.Namespace):
if args.enable_fastapi_docs:
app = FastAPI(debug=False)
Expand Down

0 comments on commit 808d71f

Please sign in to comment.