feat: Update Llama Endpoint #738

Merged (7 commits) on Nov 30, 2024
10 changes: 3 additions & 7 deletions .github/workflows/e2e-preset-test.yml
@@ -257,13 +257,9 @@ jobs:

- name: Test healthz endpoint
run: |
if [[ "${{ matrix.model.name }}" == *"llama"* ]]; then
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
curl -s http://localhost:5000/healthz
else
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
curl -s http://localhost:5000/health
fi
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
curl -s http://localhost:5000/health

- name: Test inference endpoint
run: |
echo "Testing inference for ${{ matrix.model.name }}"
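The step above collapses the llama-specific `/healthz` branch into a single `curl` against `/health` for every model. Outside the cluster, the same probe can be sketched in plain Python (the `localhost:5000` base URL is an assumption carried over from the workflow):

```python
import urllib.request

def probe_health(base_url: str = "http://localhost:5000") -> int:
    """Return the HTTP status code of the unified /health endpoint."""
    with urllib.request.urlopen(f"{base_url}/health", timeout=5) as resp:
        return resp.status
```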
2 changes: 1 addition & 1 deletion docker/presets/models/llama-2/Dockerfile
@@ -30,4 +30,4 @@ ARG VERSION
RUN echo $VERSION > /workspace/llama/version.txt

ADD ${WEIGHTS_PATH} /workspace/llama/llama-2/weights
ADD presets/workspace/inference/${MODEL_TYPE} /workspace/llama/llama-2
ADD kaito/presets/workspace/inference/${MODEL_TYPE} /workspace/llama/llama-2
2 changes: 1 addition & 1 deletion pkg/workspace/inference/preset-inferences.go
@@ -23,7 +23,7 @@ import (
)

const (
ProbePath = "/healthz"
ProbePath = "/health"
Port5000 = 5000
InferenceFile = "inference_api.py"
)
2 changes: 1 addition & 1 deletion presets/workspace/inference/llama2-chat/inference_api.py
@@ -101,7 +101,7 @@ def setup_main_routes():
def home():
return "Server is running", 200

@app_main.get("/healthz")
@app_main.get("/health")
def health_check():
if not torch.cuda.is_available():
raise HTTPException(status_code=500, detail="No GPU available")
@@ -101,7 +101,7 @@ def setup_main_routes():
def home():
return "Server is running", 200

@app_main.get("/healthz")
@app_main.get("/health")
def health_check():
if not torch.cuda.is_available():
raise HTTPException(status_code=500, detail="No GPU available")
@@ -181,7 +181,7 @@ def get_metrics():
return {"error": str(e)}

def setup_worker_routes():
@app_worker.get("/healthz")
@app_worker.get("/health")
def health_check():
if not torch.cuda.is_available():
raise HTTPException(status_code=500, detail="No GPU available")
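After this change, the main and worker apps expose the same route name. A stripped-down sketch of the handler's logic, with a hypothetical `gpu_available()` standing in for `torch.cuda.is_available()` so the shape is visible without torch:

```python
def gpu_available() -> bool:
    # Stand-in for torch.cuda.is_available(); hard-coded True here.
    return True

def health_check() -> tuple[str, int]:
    """Mirror the /health handler: 500 without a GPU, 200 otherwise."""
    if not gpu_available():
        return ("No GPU available", 500)
    return ("Healthy", 200)
```

In the real FastAPI handler the failure path raises `HTTPException(status_code=500, ...)` rather than returning a tuple.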
13 changes: 7 additions & 6 deletions presets/workspace/models/supported_models.yaml
@@ -3,28 +3,29 @@ models:
- name: llama-2-7b
type: llama2-completion
runtime: llama-2
tag: 0.0.3
tag: 0.0.4
- name: llama-2-7b-chat
type: llama2-chat
runtime: llama-2
tag: 0.0.3
tag: 0.0.4
- name: llama-2-13b
type: llama2-completion
runtime: llama-2
tag: 0.0.3
tag: 0.0.4
- name: llama-2-13b-chat
type: llama2-chat
runtime: llama-2
tag: 0.0.3
tag: 0.0.4
- name: llama-2-70b
type: llama2-completion
runtime: llama-2
tag: 0.0.3
tag: 0.0.4
- name: llama-2-70b-chat
type: llama2-chat
runtime: llama-2
tag: 0.0.3
tag: 0.0.4
# Tag history:
# 0.0.4 - Update endpoint /healthz -> /health (#738)
# 0.0.3 - Inference API Cleanup (#233)
# 0.0.2 - Eliminate Unnecessary Process Group Creation in Worker Initialization (#244)
# 0.0.1 - Initial Release
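Every llama-2 preset must move to the new image together, since they all serve the renamed endpoint. A quick sanity check over the list above can be sketched as follows (entries abbreviated; this helper is illustrative, not part of the repo):

```python
MODELS = [
    {"name": "llama-2-7b", "runtime": "llama-2", "tag": "0.0.4"},
    {"name": "llama-2-7b-chat", "runtime": "llama-2", "tag": "0.0.4"},
    {"name": "llama-2-13b", "runtime": "llama-2", "tag": "0.0.4"},
    # ... remaining entries follow the same shape
]

def tags_consistent(models: list[dict], runtime: str = "llama-2") -> bool:
    """True when all presets for a runtime share a single image tag."""
    return len({m["tag"] for m in models if m["runtime"] == runtime}) <= 1
```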
@@ -43,13 +43,13 @@ spec:
nvidia.com/gpu: "1"
livenessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
@@ -43,13 +43,13 @@ spec:
nvidia.com/gpu: "1"
livenessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
@@ -27,13 +27,13 @@ spec:
nvidia.com/gpu: "1"
livenessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
4 changes: 2 additions & 2 deletions presets/workspace/test/manifests/llama-2-7b/llama-2-7b.yaml
@@ -27,13 +27,13 @@ spec:
nvidia.com/gpu: "1"
livenessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
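Each test manifest swaps the probe path the same way. The repeated probe stanza can be sketched as a small builder (values mirror the YAML above; the helper itself is hypothetical, not part of the repo):

```python
def http_probe(path: str = "/health", port: int = 5000,
               initial_delay: int = 30, period: int = 10) -> dict:
    """Build the httpGet probe stanza used by the test manifests."""
    return {
        "httpGet": {"path": path, "port": port},
        "initialDelaySeconds": initial_delay,
        "periodSeconds": period,
    }

liveness = http_probe(initial_delay=600)   # 10 min, as in the manifests
readiness = http_probe(initial_delay=30)
```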