feat: Update Llama Endpoint (#738)
**Reason for Change**:
Update llama endpoint from /healthz -> /health

---------

Signed-off-by: Ishaan Sehgal <ishaanforthewin@gmail.com>
ishaansehgal99 authored Nov 30, 2024
1 parent 3dbb660 commit f5d0958
Showing 10 changed files with 23 additions and 26 deletions.
10 changes: 3 additions & 7 deletions .github/workflows/e2e-preset-test.yml
@@ -257,13 +257,9 @@ jobs:
- name: Test healthz endpoint
run: |
-if [[ "${{ matrix.model.name }}" == *"llama"* ]]; then
-kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
-curl -s http://localhost:5000/healthz
-else
-kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
-curl -s http://localhost:5000/health
-fi
+kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+curl -s http://localhost:5000/health
- name: Test inference endpoint
run: |
echo "Testing inference for ${{ matrix.model.name }}"
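The consolidated step now probes every model, llama included, at the same `/health` endpoint. A sketch of that invocation as a command builder (a hypothetical helper, not part of the repo):

```python
# Hypothetical sketch of the consolidated e2e health-check step: the llama
# special case is gone, so one kubectl exec + curl command covers all models.
def health_check_command(resource_type: str, model_name: str) -> list[str]:
    """Build the kubectl exec + curl invocation used by the workflow step."""
    return [
        "kubectl", "exec", f"{resource_type}/{model_name}", "--",
        "curl", "-s", "http://localhost:5000/health",
    ]

cmd = health_check_command("deployment", "llama-2-7b")
print(" ".join(cmd))
```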
2 changes: 1 addition & 1 deletion docker/presets/models/llama-2/Dockerfile
@@ -30,4 +30,4 @@ ARG VERSION
RUN echo $VERSION > /workspace/llama/version.txt

ADD ${WEIGHTS_PATH} /workspace/llama/llama-2/weights
-ADD presets/workspace/inference/${MODEL_TYPE} /workspace/llama/llama-2
+ADD kaito/presets/workspace/inference/${MODEL_TYPE} /workspace/llama/llama-2
2 changes: 1 addition & 1 deletion pkg/workspace/inference/preset-inferences.go
@@ -23,7 +23,7 @@ import (
)

const (
-ProbePath = "/healthz"
+ProbePath = "/health"
Port5000 = 5000
InferenceFile = "inference_api.py"
)
2 changes: 1 addition & 1 deletion presets/workspace/inference/llama2-chat/inference_api.py
@@ -101,7 +101,7 @@ def setup_main_routes():
def home():
return "Server is running", 200

-@app_main.get("/healthz")
+@app_main.get("/health")
def health_check():
if not torch.cuda.is_available():
raise HTTPException(status_code=500, detail="No GPU available")
@@ -101,7 +101,7 @@ def setup_main_routes():
def home():
return "Server is running", 200

-@app_main.get("/healthz")
+@app_main.get("/health")
def health_check():
if not torch.cuda.is_available():
raise HTTPException(status_code=500, detail="No GPU available")
@@ -181,7 +181,7 @@ def get_metrics():
return {"error": str(e)}

def setup_worker_routes():
-@app_worker.get("/healthz")
+@app_worker.get("/health")
def health_check():
if not torch.cuda.is_available():
raise HTTPException(status_code=500, detail="No GPU available")
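The renamed handlers above can be illustrated with a dependency-free sketch. FastAPI's `HTTPException` is stubbed and GPU availability is passed in rather than read from `torch.cuda`; the success payload is an illustrative assumption, since the diff truncates before the handler's return statement:

```python
# Dependency-free sketch of the /health handler logic shown in the diff.
# HTTPException stands in for fastapi.HTTPException; the success payload
# is an assumption, as the diff cuts off before the return.
class HTTPException(Exception):
    def __init__(self, status_code: int, detail: str):
        super().__init__(detail)
        self.status_code = status_code
        self.detail = detail

def health_check(cuda_available: bool) -> dict:
    # The real handler consults torch.cuda.is_available() here.
    if not cuda_available:
        raise HTTPException(status_code=500, detail="No GPU available")
    return {"status": "Healthy"}  # assumed success payload
```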
13 changes: 7 additions & 6 deletions presets/workspace/models/supported_models.yaml
@@ -3,28 +3,29 @@ models:
- name: llama-2-7b
type: llama2-completion
runtime: llama-2
-tag: 0.0.3
+tag: 0.0.4
- name: llama-2-7b-chat
type: llama2-chat
runtime: llama-2
-tag: 0.0.3
+tag: 0.0.4
- name: llama-2-13b
type: llama2-completion
runtime: llama-2
-tag: 0.0.3
+tag: 0.0.4
- name: llama-2-13b-chat
type: llama2-chat
runtime: llama-2
-tag: 0.0.3
+tag: 0.0.4
- name: llama-2-70b
type: llama2-completion
runtime: llama-2
-tag: 0.0.3
+tag: 0.0.4
- name: llama-2-70b-chat
type: llama2-chat
runtime: llama-2
-tag: 0.0.3
+tag: 0.0.4
# Tag history:
+# 0.0.4 - Update endpoint /healthz -> /health (#738)
# 0.0.3 - Inference API Cleanup (#233)
# 0.0.2 - Eliminate Unnecessary Process Group Creation in Worker Initialization (#244)
# 0.0.1 - Initial Release
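All six llama-2 presets are bumped to 0.0.4 in lockstep. A small check like the following (hypothetical, not part of the repo) could guard that invariant; the entries mirror the YAML above as plain dicts:

```python
# Hypothetical consistency check: every preset sharing the llama-2 runtime
# in supported_models.yaml should carry the same image tag after a bump.
MODELS = [
    {"name": "llama-2-7b", "runtime": "llama-2", "tag": "0.0.4"},
    {"name": "llama-2-7b-chat", "runtime": "llama-2", "tag": "0.0.4"},
    {"name": "llama-2-13b", "runtime": "llama-2", "tag": "0.0.4"},
    {"name": "llama-2-13b-chat", "runtime": "llama-2", "tag": "0.0.4"},
    {"name": "llama-2-70b", "runtime": "llama-2", "tag": "0.0.4"},
    {"name": "llama-2-70b-chat", "runtime": "llama-2", "tag": "0.0.4"},
]

def runtime_tags(models: list[dict], runtime: str) -> set[str]:
    """Collect the distinct tags used by presets of a given runtime."""
    return {m["tag"] for m in models if m["runtime"] == runtime}
```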
@@ -43,13 +43,13 @@ spec:
nvidia.com/gpu: "1"
livenessProbe:
httpGet:
-path: /healthz
+path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
-path: /healthz
+path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
4 changes: 2 additions & 2 deletions presets/workspace/test/manifests/llama-2-13b/llama-2-13b.yaml
@@ -43,13 +43,13 @@ spec:
nvidia.com/gpu: "1"
livenessProbe:
httpGet:
-path: /healthz
+path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
-path: /healthz
+path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
@@ -27,13 +27,13 @@ spec:
nvidia.com/gpu: "1"
livenessProbe:
httpGet:
-path: /healthz
+path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
-path: /healthz
+path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
4 changes: 2 additions & 2 deletions presets/workspace/test/manifests/llama-2-7b/llama-2-7b.yaml
@@ -27,13 +27,13 @@ spec:
nvidia.com/gpu: "1"
livenessProbe:
httpGet:
-path: /healthz
+path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
-path: /healthz
+path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
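The probe blocks repeated across the test manifests differ only in their initial delay. A sketch of generating them from the shared values (a hypothetical helper, assuming the path, port, and delays shown above):

```python
# Hypothetical generator for the httpGet probe blocks in the test manifests.
# Path and port match the new values in this commit; delays match the YAML.
PROBE_PATH = "/health"  # was /healthz before this commit
PORT = 5000

def http_probe(initial_delay_seconds: int, period_seconds: int = 10) -> dict:
    """Build an httpGet probe block like those in the manifests above."""
    return {
        "httpGet": {"path": PROBE_PATH, "port": PORT},
        "initialDelaySeconds": initial_delay_seconds,
        "periodSeconds": period_seconds,
    }

liveness = http_probe(600)   # 10 min, as in the manifests
readiness = http_probe(30)
```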
