Skip to content

Commit

Permalink
feat: Add Phi-3 Manifests and Custom E2E Run Flag (#491)
Browse files Browse the repository at this point in the history
**Reason for Change**:
Adds Phi-3 manifests and adds a workflow flag that allows running the E2E tests for just the Phi-3 models
  • Loading branch information
ishaansehgal99 authored Jul 2, 2024
1 parent 21d7768 commit 421bd5f
Show file tree
Hide file tree
Showing 10 changed files with 281 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .github/workflows/e2e-preset-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,16 @@ on:
type: boolean
default: false
description: "Test all models for E2E"
force-run-all-phi-models:
type: boolean
default: false
description: "Test all Phi models for E2E"

env:
GO_VERSION: "1.22"
BRANCH_NAME: ${{ github.head_ref || github.ref_name}}
FORCE_RUN_ALL: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.force-run-all == 'true' }}
FORCE_RUN_ALL_PHI: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.force-run-all-phi-models== 'true' }}

permissions:
id-token: write
Expand Down Expand Up @@ -43,6 +48,7 @@ jobs:
run: |
PR_BRANCH=${{ env.BRANCH_NAME }} \
FORCE_RUN_ALL=${{ env.FORCE_RUN_ALL }} \
FORCE_RUN_ALL_PHI=${{ env.FORCE_RUN_ALL_PHI }} \
python3 .github/workflows/kind-cluster/determine_models.py
- name: Print Determined Models
Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/kind-cluster/determine_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,10 +118,13 @@ def check_modified_models(pr_branch):
def main():
pr_branch = os.environ.get("PR_BRANCH", "main") # If not specified default to 'main'
force_run_all = os.environ.get("FORCE_RUN_ALL", "false") # If not specified default to False
force_run_all_phi = os.environ.get("FORCE_RUN_ALL_PHI", "false") # If not specified default to False

affected_models = []
if force_run_all != "false":
affected_models = [model['name'] for model in YAML_PR['models']]
elif force_run_all_phi != "false":
affected_models = [model['name'] for model in YAML_PR['models'] if 'phi-3' in model['name']]
else:
# Logic to determine affected models
# Example: affected_models = ['model1', 'model2', 'model3']
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Service exposing the phi-3-medium-128k-instruct inference deployment
# via an external LoadBalancer. Traffic on port 80 is forwarded to the
# inference server listening on port 5000 inside the pod.
apiVersion: v1
kind: Service
metadata:
  name: phi-3-medium-128k-instruct
spec:
  selector:
    app: phi-3-medium-128k-instruct
  ports:
    - protocol: TCP
      port: 80          # externally exposed port
      targetPort: 5000  # inference_api.py server port
  type: LoadBalancer
  # Register endpoints before the pod passes its readiness probe so the
  # E2E harness can reach the service while the model is still loading.
  publishNotReadyAddresses: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Single-replica Deployment running the phi-3-medium-128k-instruct preset
# image on one GPU. Image registry/tag placeholders (REPO_HERE/TAG_HERE)
# are substituted by the E2E workflow before apply.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: phi-3-medium-128k-instruct
spec:
  replicas: 1
  selector:
    matchLabels:
      app: phi-3-medium-128k-instruct
  template:
    metadata:
      labels:
        app: phi-3-medium-128k-instruct
    spec:
      containers:
        - name: phi-3-medium-128k-instruct-container
          image: REPO_HERE.azurecr.io/phi-3-medium-128k-instruct:TAG_HERE
          command:
            - /bin/sh
            - -c
            - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code
          resources:
            requests:
              nvidia.com/gpu: 1
            limits:
              nvidia.com/gpu: 1 # Requesting 1 GPU
          livenessProbe:
            httpGet:
              path: /healthz
              port: 5000
            initialDelaySeconds: 600 # 10 Min — allow time for model weights to load
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /healthz
              port: 5000
            initialDelaySeconds: 30
            periodSeconds: 10
          volumeMounts:
            - name: dshm
              mountPath: /dev/shm
      volumes:
        # Memory-backed /dev/shm; default shm size is too small for
        # multi-process inference tooling.
        - name: dshm
          emptyDir:
            medium: Memory
      tolerations:
        - effect: NoSchedule
          key: sku
          operator: Equal
          value: gpu
        - effect: NoSchedule
          key: nvidia.com/gpu
          operator: Exists
      nodeSelector:
        # NOTE(review): pool name appears truncated to 12 chars — presumably
        # the AKS agent-pool name limit; confirm it matches the created pool.
        pool: phi3medium12
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Service exposing the phi-3-medium-4k-instruct inference deployment
# via an external LoadBalancer. Traffic on port 80 is forwarded to the
# inference server listening on port 5000 inside the pod.
apiVersion: v1
kind: Service
metadata:
  name: phi-3-medium-4k-instruct
spec:
  selector:
    app: phi-3-medium-4k-instruct
  ports:
    - protocol: TCP
      port: 80          # externally exposed port
      targetPort: 5000  # inference_api.py server port
  type: LoadBalancer
  # Register endpoints before the pod passes its readiness probe so the
  # E2E harness can reach the service while the model is still loading.
  publishNotReadyAddresses: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Single-replica Deployment running the phi-3-medium-4k-instruct preset
# image on one GPU. Image registry/tag placeholders (REPO_HERE/TAG_HERE)
# are substituted by the E2E workflow before apply.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: phi-3-medium-4k-instruct
spec:
  replicas: 1
  selector:
    matchLabels:
      app: phi-3-medium-4k-instruct
  template:
    metadata:
      labels:
        app: phi-3-medium-4k-instruct
    spec:
      containers:
        - name: phi-3-medium-4k-instruct-container
          image: REPO_HERE.azurecr.io/phi-3-medium-4k-instruct:TAG_HERE
          command:
            - /bin/sh
            - -c
            - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code
          resources:
            requests:
              nvidia.com/gpu: 1
            limits:
              nvidia.com/gpu: 1 # Requesting 1 GPU
          livenessProbe:
            httpGet:
              path: /healthz
              port: 5000
            initialDelaySeconds: 600 # 10 Min — allow time for model weights to load
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /healthz
              port: 5000
            initialDelaySeconds: 30
            periodSeconds: 10
          volumeMounts:
            - name: dshm
              mountPath: /dev/shm
      volumes:
        # Memory-backed /dev/shm; default shm size is too small for
        # multi-process inference tooling.
        - name: dshm
          emptyDir:
            medium: Memory
      tolerations:
        - effect: NoSchedule
          key: sku
          operator: Equal
          value: gpu
        - effect: NoSchedule
          key: nvidia.com/gpu
          operator: Exists
      nodeSelector:
        # NOTE(review): 12-char pool name — presumably the AKS agent-pool
        # name limit; confirm it matches the created pool.
        pool: phi3medium4k
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Service exposing the phi-3-small-128k-instruct inference deployment
# via an external LoadBalancer. Traffic on port 80 is forwarded to the
# inference server listening on port 5000 inside the pod.
apiVersion: v1
kind: Service
metadata:
  name: phi-3-small-128k-instruct
spec:
  selector:
    app: phi-3-small-128k-instruct
  ports:
    - protocol: TCP
      port: 80          # externally exposed port
      targetPort: 5000  # inference_api.py server port
  type: LoadBalancer
  # Register endpoints before the pod passes its readiness probe so the
  # E2E harness can reach the service while the model is still loading.
  publishNotReadyAddresses: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Single-replica Deployment running the phi-3-small-128k-instruct preset
# image on one GPU. Image registry/tag placeholders (REPO_HERE/TAG_HERE)
# are substituted by the E2E workflow before apply.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: phi-3-small-128k-instruct
spec:
  replicas: 1
  selector:
    matchLabels:
      app: phi-3-small-128k-instruct
  template:
    metadata:
      labels:
        app: phi-3-small-128k-instruct
    spec:
      containers:
        - name: phi-3-small-128k-instruct-container
          image: REPO_HERE.azurecr.io/phi-3-small-128k-instruct:TAG_HERE
          command:
            - /bin/sh
            - -c
            - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code
          resources:
            requests:
              nvidia.com/gpu: 1
            limits:
              nvidia.com/gpu: 1 # Requesting 1 GPU
          livenessProbe:
            httpGet:
              path: /healthz
              port: 5000
            initialDelaySeconds: 600 # 10 Min — allow time for model weights to load
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /healthz
              port: 5000
            initialDelaySeconds: 30
            periodSeconds: 10
          volumeMounts:
            - name: dshm
              mountPath: /dev/shm
      volumes:
        # Memory-backed /dev/shm; default shm size is too small for
        # multi-process inference tooling.
        - name: dshm
          emptyDir:
            medium: Memory
      tolerations:
        - effect: NoSchedule
          key: sku
          operator: Equal
          value: gpu
        - effect: NoSchedule
          key: nvidia.com/gpu
          operator: Exists
      nodeSelector:
        # NOTE(review): pool name appears truncated to 12 chars — presumably
        # the AKS agent-pool name limit; confirm it matches the created pool.
        pool: phi3small128
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Service exposing the phi-3-small-8k-instruct inference deployment
# via an external LoadBalancer. Traffic on port 80 is forwarded to the
# inference server listening on port 5000 inside the pod.
apiVersion: v1
kind: Service
metadata:
  name: phi-3-small-8k-instruct
spec:
  selector:
    app: phi-3-small-8k-instruct
  ports:
    - protocol: TCP
      port: 80          # externally exposed port
      targetPort: 5000  # inference_api.py server port
  type: LoadBalancer
  # Register endpoints before the pod passes its readiness probe so the
  # E2E harness can reach the service while the model is still loading.
  publishNotReadyAddresses: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: phi-3-small-8k-instruct
spec:
replicas: 1
selector:
matchLabels:
app: phi-3-small-8k-instruct
template:
metadata:
labels:
app: phi-3-small-8k-instruct
spec:
containers:
- name: phi-3-small-8k-instruct-container
image: REPO_HERE.azurecr.io/phi-3-small-8k-instruct:TAG_HERE
command:
- /bin/sh
- -c
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code
resources:
requests:
nvidia.com/gpu: 1
limits:
nvidia.com/gpu: 1 # Requesting 1 GPU
livenessProbe:
httpGet:
path: /healthz
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /healthz
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
volumeMounts:
- name: dshm
mountPath: /dev/shm
volumes:
- name: dshm
emptyDir:
medium: Memory
tolerations:
- effect: NoSchedule
key: sku
operator: Equal
value: gpu
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
nodeSelector:
pool: phi3small8ki

0 comments on commit 421bd5f

Please sign in to comment.