From a7f18970c315b097f97ab04209ef5e5f965e6d1a Mon Sep 17 00:00:00 2001
From: jerryzhuang
Date: Wed, 27 Nov 2024 21:56:56 +1100
Subject: [PATCH 1/2] feat: add tuning test to preset test

Signed-off-by: jerryzhuang
---
 .github/workflows/e2e-preset-test.yml         |   1 -
 .github/workflows/e2e-preset-tuning-test.yml  | 136 ++++++++++++++++++
 .github/workflows/e2e-workflow.yml            |   9 --
 .../falcon-40b-instruct.yaml                  |  56 --------
 .../test/manifests/falcon-40b/falcon-40b.yaml |  56 --------
 .../falcon-7b-instruct.yaml                   |  55 -------
 .../test/manifests/falcon-7b/falcon-7b.yaml   |  55 -------
 .../mistral-7b-instruct.yaml                  |  55 -------
 .../test/manifests/mistral-7b/mistral-7b.yaml |  55 -------
 .../workspace/test/manifests/phi-2/phi-2.yaml |  55 -------
 .../phi-3-medium-128k-instruct.yaml           |  55 -------
 .../phi-3-medium-4k-instruct.yaml             |  55 -------
 .../phi-3-mini-128k-instruct.yaml             |  55 -------
 .../phi-3-mini-4k-instruct.yaml               |  55 -------
 presets/workspace/test/tuning/tuning-job.yaml | 113 +++++++++++++++
 15 files changed, 249 insertions(+), 617 deletions(-)
 create mode 100644 .github/workflows/e2e-preset-tuning-test.yml
 delete mode 100644 presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml
 delete mode 100644 presets/workspace/test/manifests/falcon-40b/falcon-40b.yaml
 delete mode 100644 presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct.yaml
 delete mode 100644 presets/workspace/test/manifests/falcon-7b/falcon-7b.yaml
 delete mode 100644 presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct.yaml
 delete mode 100644 presets/workspace/test/manifests/mistral-7b/mistral-7b.yaml
 delete mode 100644 presets/workspace/test/manifests/phi-2/phi-2.yaml
 delete mode 100644 presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct.yaml
 delete mode 100644 presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct.yaml
 delete mode 100644 presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct.yaml
 delete mode 100644 presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct.yaml
 create mode 100644 presets/workspace/test/tuning/tuning-job.yaml

diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml
index 8c79783da..8583985b3 100644
--- a/.github/workflows/e2e-preset-test.yml
+++ b/.github/workflows/e2e-preset-test.yml
@@ -397,4 +397,3 @@ jobs:
             --resource-group llm-test
           fi
         fi
-
diff --git a/.github/workflows/e2e-preset-tuning-test.yml b/.github/workflows/e2e-preset-tuning-test.yml
new file mode 100644
index 000000000..6446c466c
--- /dev/null
+++ b/.github/workflows/e2e-preset-tuning-test.yml
@@ -0,0 +1,136 @@
+name: E2E Preset Tuning Test
+
+on:
+  workflow_run:
+    workflows: ["Build and Push Preset Models"]
+    types:
+      - completed
+  workflow_dispatch: {}
+
+env:
+  GO_VERSION: "1.22"
+
+permissions:
+  id-token: write
+  contents: read
+
+jobs:
+  e2e-preset-tuning-tests:
+    if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success'
+    runs-on: ubuntu-latest
+    environment: preset-env
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4.2.2
+        with:
+          submodules: true
+          fetch-depth: 0
+
+      - name: 'Az CLI login'
+        uses: azure/login@v2.2.0
+        with:
+          client-id: ${{ secrets.AZURE_CLIENT_ID }}
+          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
+          allow-no-subscriptions: true
+
+      - name: 'Set ACR Subscription'
+        run: az account set --subscription ${{ secrets.AZURE_SUBSCRIPTION_ID }}
+
+      - name: Set up kubectl context
+        run: |
+          az aks get-credentials --resource-group llm-test --name GitRunner
+
+      - name: Get test meta
+        id: get_test_meta
+        run: |
+          echo "TAG=0.0.7" >> $GITHUB_OUTPUT
+          echo "NODEPOOL_NAME=tuning" >> $GITHUB_OUTPUT
+          echo "NODE_COUNT=1" >> $GITHUB_OUTPUT
+          echo "NODE_VM_SIZE=Standard_NC6s_v3" >> $GITHUB_OUTPUT
+          echo "NODE_OSDISK_SIZE=100" >> $GITHUB_OUTPUT
+
+      - name: Create Nodepool
+        run: |
+          NODEPOOL_EXIST=$(az aks nodepool show \
+            --name ${{ steps.get_test_meta.outputs.NODEPOOL_NAME }} \
+            --cluster-name GitRunner \
+            --resource-group llm-test \
+            --query 'name' -o tsv || echo "")
+          echo "NODEPOOL_EXIST: $NODEPOOL_EXIST"
+          if [ -z "$NODEPOOL_EXIST" ]; then
+            az aks nodepool add \
+              --name ${{ steps.get_test_meta.outputs.NODEPOOL_NAME }} \
+              --cluster-name GitRunner \
+              --resource-group llm-test \
+              --node-count ${{ steps.get_test_meta.outputs.NODE_COUNT }} \
+              --node-vm-size ${{ steps.get_test_meta.outputs.NODE_VM_SIZE }} \
+              --node-osdisk-size ${{ steps.get_test_meta.outputs.NODE_OSDISK_SIZE }} \
+              --labels pool=${{ steps.get_test_meta.outputs.NODEPOOL_NAME }} \
+              --node-taints sku=gpu:NoSchedule \
+              --aks-custom-headers UseGPUDedicatedVHD=true
+          else
+            NODEPOOL_STATE=$(az aks nodepool show \
+              --name ${{ steps.get_test_meta.outputs.NODEPOOL_NAME }} \
+              --cluster-name GitRunner \
+              --resource-group llm-test \
+              --query 'provisioningState' -o tsv)
+            echo "NODEPOOL_STATE: $NODEPOOL_STATE"
+            if [ "$NODEPOOL_STATE" != "Succeeded" ]; then
+              echo "Nodepool exists but is not in a Succeeded state. Please check manually."
+              exit 1
+            else
+              echo "Nodepool already exists and is in a running state."
+            fi
+          fi
+
+      - name: Replace repo and Deploy Resource to K8s
+        run: |
+          sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" presets/workspace/test/tuning/tuning-job.yaml
+          sed -i "s/TAG_HERE/${{ steps.get_test_meta.outputs.TAG }}/g" presets/workspace/test/tuning/tuning-job.yaml
+          kubectl apply -f presets/workspace/test/tuning/tuning-job.yaml
+
+      - name: Wait for tuning job to complete
+        shell: bash {0}
+        run: |
+          # Poll every 10s, up to 10 minutes; --timeout=0 makes each kubectl wait a non-blocking check.
+          retval_complete=1
+          retval_failed=1
+          count=0
+          max_retries=60
+          while [[ $retval_complete -ne 0 ]] && [[ $retval_failed -ne 0 ]] && [[ $count -lt $max_retries ]]; do
+            sleep 10
+            output=$(kubectl wait --for=condition=failed job/tuning-example --timeout=0 2>&1)
+            retval_failed=$?
+            output=$(kubectl wait --for=condition=complete job/tuning-example --timeout=0 2>&1)
+            retval_complete=$?
+            count=$((count + 1))
+          done
+
+          if [ $retval_failed -eq 0 ]; then
+            echo "Job failed. Please check logs."
+            exit 1
+          elif [ $retval_complete -ne 0 ]; then
+            echo "Job timed out."
+            exit 1
+          else
+            echo "Job succeeded."
+ fi + - name: Cleanup + if: always() + run: | + kubectl delete --wait=true -f presets/workspace/test/tuning/tuning-job.yaml + + # Check and Delete AKS Nodepool if it exists + NODEPOOL_EXIST=$(az aks nodepool show \ + --name ${{ steps.get_test_meta.outputs.NODEPOOL_NAME }} \ + --cluster-name GitRunner \ + --resource-group llm-test \ + --query 'name' -o tsv || echo "") + + if [ -n "$NODEPOOL_EXIST" ]; then + echo "deleting nodepool" + az aks nodepool delete \ + --name ${{ steps.get_test_meta.outputs.NODEPOOL_NAME }} \ + --cluster-name GitRunner \ + --resource-group llm-test + fi diff --git a/.github/workflows/e2e-workflow.yml b/.github/workflows/e2e-workflow.yml index 182143771..183e76231 100644 --- a/.github/workflows/e2e-workflow.yml +++ b/.github/workflows/e2e-workflow.yml @@ -189,15 +189,6 @@ jobs: KARPENTER_VERSION: ${{ vars.KARPENTER_VERSION }} KARPENTER_NAMESPACE: ${{ env.KARPENTER_NAMESPACE }} - - name: build KAITO image - if: ${{ !inputs.isRelease }} - shell: bash - run: | - make docker-build-workspace - env: - REGISTRY: ${{ env.REGISTRY }} - VERSION: ${{ env.VERSION }} - - name: Install KAITO Workspace helm chart shell: bash run: | diff --git a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml b/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml deleted file mode 100644 index a44043894..000000000 --- a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml +++ /dev/null @@ -1,56 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: falcon-40b-instruct -spec: - progressDeadlineSeconds: 1800 - replicas: 1 - selector: - matchLabels: - app: falcon - template: - metadata: - labels: - app: falcon - spec: - containers: - - name: falcon-container - image: REPO_HERE.azurecr.io/falcon-40b-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 - resources: - requests: - nvidia.com/gpu: 2 - limits: - nvidia.com/gpu: 2 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: falcon40bins diff --git a/presets/workspace/test/manifests/falcon-40b/falcon-40b.yaml b/presets/workspace/test/manifests/falcon-40b/falcon-40b.yaml deleted file mode 100644 index 514d12e60..000000000 --- a/presets/workspace/test/manifests/falcon-40b/falcon-40b.yaml +++ /dev/null @@ -1,56 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: falcon-40b -spec: - progressDeadlineSeconds: 1800 - replicas: 1 - selector: - matchLabels: - app: falcon - template: - metadata: - labels: - app: falcon - spec: - containers: - - name: falcon-container - image: REPO_HERE.azurecr.io/falcon-40b:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 - resources: - requests: - nvidia.com/gpu: 2 - limits: - nvidia.com/gpu: 2 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 
600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: falcon40b diff --git a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct.yaml b/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct.yaml deleted file mode 100644 index 399b78a3c..000000000 --- a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: falcon-7b-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: falcon - template: - metadata: - labels: - app: falcon - spec: - containers: - - name: falcon-container - image: REPO_HERE.azurecr.io/falcon-7b-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: falcon7binst diff --git a/presets/workspace/test/manifests/falcon-7b/falcon-7b.yaml b/presets/workspace/test/manifests/falcon-7b/falcon-7b.yaml deleted file mode 100644 index 8e5786c6e..000000000 --- a/presets/workspace/test/manifests/falcon-7b/falcon-7b.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: falcon-7b -spec: - replicas: 1 - selector: - matchLabels: - app: falcon - template: - metadata: - labels: - app: falcon - spec: - containers: - - name: falcon-container - image: REPO_HERE.azurecr.io/falcon-7b:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: falcon7b diff --git a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct.yaml b/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct.yaml deleted file mode 100644 index 75179683f..000000000 --- a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - 
name: mistral-7b-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: mistral - template: - metadata: - labels: - app: mistral - spec: - containers: - - name: mistral-instruct-container - image: REPO_HERE.azurecr.io/mistral-7b-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: mistral7bins diff --git a/presets/workspace/test/manifests/mistral-7b/mistral-7b.yaml b/presets/workspace/test/manifests/mistral-7b/mistral-7b.yaml deleted file mode 100644 index 3eff5594f..000000000 --- a/presets/workspace/test/manifests/mistral-7b/mistral-7b.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: mistral-7b -spec: - replicas: 1 - selector: - matchLabels: - app: mistral - template: - metadata: - labels: - app: mistral - spec: - containers: - - name: mistral-container - image: REPO_HERE.azurecr.io/mistral-7b:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: mistral7b diff --git a/presets/workspace/test/manifests/phi-2/phi-2.yaml b/presets/workspace/test/manifests/phi-2/phi-2.yaml deleted file mode 100644 index cbc6f94e7..000000000 --- a/presets/workspace/test/manifests/phi-2/phi-2.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-2 -spec: - replicas: 1 - selector: - matchLabels: - app: phi-2 - template: - metadata: - labels: - app: phi-2 - spec: - containers: - - name: phi-2-container - image: REPO_HERE.azurecr.io/phi-2:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 # Requesting 1 GPU - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: 
NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi2 diff --git a/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct.yaml b/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct.yaml deleted file mode 100644 index 0adb122e4..000000000 --- a/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-3-medium-128k-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: phi-3-medium-128k-instruct - template: - metadata: - labels: - app: phi-3-medium-128k-instruct - spec: - containers: - - name: phi-3-medium-128k-instruct-container - image: REPO_HERE.azurecr.io/phi-3-medium-128k-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 # Requesting 1 GPU - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi3medium12 \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct.yaml b/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct.yaml deleted file mode 100644 index 1d0d64e47..000000000 --- a/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-3-medium-4k-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: phi-3-medium-4k-instruct - template: - metadata: - labels: - app: phi-3-medium-4k-instruct - spec: - containers: - - name: phi-3-medium-4k-instruct-container - image: REPO_HERE.azurecr.io/phi-3-medium-4k-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 # Requesting 1 GPU - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi3medium4k \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct.yaml b/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct.yaml deleted file mode 100644 index cf8898015..000000000 
--- a/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-3-mini-128k-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: phi-3-mini-128k-instruct - template: - metadata: - labels: - app: phi-3-mini-128k-instruct - spec: - containers: - - name: phi-3-mini-128k-instruct-container - image: REPO_HERE.azurecr.io/phi-3-mini-128k-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 # Requesting 1 GPU - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi3mini128k \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct.yaml b/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct.yaml deleted file mode 100644 index 1d7069a38..000000000 --- a/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-3-mini-4k-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: phi-3-mini-4k-instruct - template: - metadata: - labels: - app: phi-3-mini-4k-instruct - spec: - containers: - - name: phi-3-mini-4k-instruct-container - image: REPO_HERE.azurecr.io/phi-3-mini-4k-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 # Requesting 1 GPU - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi3mini4kin \ No newline at end of file diff --git a/presets/workspace/test/tuning/tuning-job.yaml b/presets/workspace/test/tuning/tuning-job.yaml new file mode 100644 index 000000000..5357bbb67 --- /dev/null +++ b/presets/workspace/test/tuning/tuning-job.yaml @@ -0,0 +1,113 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: tuning-example +spec: + backoffLimit: 0 + completionMode: NonIndexed + completions: 1 + manualSelector: false + parallelism: 1 + podReplacementPolicy: TerminatingOrFailed + suspend: false + template: + spec: + containers: + - command: + - /bin/sh + - -c + - cd /workspace/tfs && python3 metrics_server.py & accelerate launch --num_processes=1 /workspace/tfs/fine_tuning.py + env: + - 
name: DEFAULT_TARGET_MODULES
+          value: query_key_value
+        - name: PYTORCH_CUDA_ALLOC_CONF
+          value: expandable_segments:True
+        image: REPO_HERE.azurecr.io/falcon-7b:TAG_HERE
+        imagePullPolicy: Always
+        name: tuning
+        ports:
+        - containerPort: 5000
+          protocol: TCP
+        resources:
+          limits:
+            nvidia.com/gpu: "1"
+          requests:
+            nvidia.com/gpu: "1"
+        volumeMounts:
+        - mountPath: /mnt/config
+          name: config-volume
+        - mountPath: /mnt/results
+          name: results-volume
+        - mountPath: /mnt/data
+          name: data-volume
+      dnsPolicy: ClusterFirst
+      initContainers:
+      - command:
+        - sh
+        - -c
+        - ls -la /data && cp -r /data/* /mnt/data && ls -la /mnt/data
+        image: REPO_HERE.azurecr.io/e2e-dataset:0.0.1
+        imagePullPolicy: IfNotPresent
+        name: data-extractor
+        volumeMounts:
+        - mountPath: /mnt/data
+          name: data-volume
+      restartPolicy: Never
+      terminationGracePeriodSeconds: 30
+      tolerations:
+      - effect: NoSchedule
+        key: gpu
+        operator: Equal
+      - effect: NoSchedule
+        key: sku
+        value: gpu
+      volumes:
+      - configMap:
+          defaultMode: 420
+          name: e2e-qlora-params-template
+        name: config-volume
+      - emptyDir: {}
+        name: results-volume
+      - emptyDir: {}
+        name: data-volume
+      nodeSelector:
+        pool: tuning
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: e2e-qlora-params-template
+data:
+  training_config.yaml: |
+    training_config:
+      ModelConfig:
+        torch_dtype: "bfloat16"
+        local_files_only: true
+        device_map: "auto"
+        chat_template: "/workspace/chat_templates/falcon-instruct.jinja"
+
+      QuantizationConfig:
+        load_in_4bit: true
+        bnb_4bit_quant_type: "nf4"
+        bnb_4bit_compute_dtype: "bfloat16"
+        bnb_4bit_use_double_quant: true
+
+      LoraConfig:
+        r: 8
+        lora_alpha: 8
+        lora_dropout: 0.0
+        target_modules: ['query_key_value']
+
+      TrainingArguments:
+        output_dir: "/mnt/results"
+        ddp_find_unused_parameters: false
+        save_strategy: "epoch"
+        per_device_train_batch_size: 1
+        max_steps: 2 # limit training to 2 steps to keep the e2e run short
+
+      DataCollator:
+        mlm: true
+
+      DatasetConfig:
+        shuffle_dataset: true
+        train_test_split: 1
\ No newline at end of file

From 882de5857429a3ef5892aff849b7b378563898ad Mon Sep 17 00:00:00 2001
From: jerryzhuang
Date: Thu, 28 Nov 2024 10:09:14 +1100
Subject: [PATCH 2/2] feat: use preset config for tuning test

Signed-off-by: jerryzhuang
---
 .github/e2e-preset-configs.json               |  8 ++++++
 .github/workflows/e2e-preset-tuning-test.yml  | 25 +++++++++----------
 presets/workspace/test/tuning/tuning-job.yaml |  2 +-
 3 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/.github/e2e-preset-configs.json b/.github/e2e-preset-configs.json
index 972c5e7ad..f247aa7ac 100644
--- a/.github/e2e-preset-configs.json
+++ b/.github/e2e-preset-configs.json
@@ -129,6 +129,14 @@
             "node-osdisk-size": 150,
             "OSS": false,
             "loads_adapter": false
+          },
+          {
+            "name": "tuning-example",
+            "node-count": 1,
+            "node-vm-size": "Standard_NC6s_v3",
+            "node-osdisk-size": 100,
+            "OSS": true,
+            "loads_adapter": false
           }
         ]
       }
diff --git a/.github/workflows/e2e-preset-tuning-test.yml b/.github/workflows/e2e-preset-tuning-test.yml
index 6446c466c..2379980ea 100644
--- a/.github/workflows/e2e-preset-tuning-test.yml
+++ b/.github/workflows/e2e-preset-tuning-test.yml
@@ -44,34 +44,33 @@ jobs:
       - name: Get test meta
         id: get_test_meta
         run: |
+          CONFIG=$(jq -c '.matrix.image[] | select(.name == "tuning-example")' .github/e2e-preset-configs.json)
+
           echo "TAG=0.0.7" >> $GITHUB_OUTPUT
-          echo "NODEPOOL_NAME=tuning" >> $GITHUB_OUTPUT
-          echo "NODE_COUNT=1" >> $GITHUB_OUTPUT
-          echo "NODE_VM_SIZE=Standard_NC6s_v3" >> $GITHUB_OUTPUT
-          echo "NODE_OSDISK_SIZE=100" >> $GITHUB_OUTPUT
+          echo "model=$CONFIG" >> $GITHUB_OUTPUT
"model=$CONFIG" >> $GITHUB_OUTPUT - name: Create Nodepool run: | NODEPOOL_EXIST=$(az aks nodepool show \ - --name ${{ steps.get_test_meta.outputs.NODEPOOL_NAME }} \ + --name ${{ steps.get_test_meta.outputs.model.name }} \ --cluster-name GitRunner \ --resource-group llm-test \ --query 'name' -o tsv || echo "") echo "NODEPOOL_EXIST: $NODEPOOL_EXIST" if [ -z "$NODEPOOL_EXIST" ]; then az aks nodepool add \ - --name ${{ steps.get_test_meta.outputs.NODEPOOL_NAME }} \ + --name ${{ steps.get_test_meta.outputs.model.name }} \ --cluster-name GitRunner \ --resource-group llm-test \ - --node-count ${{ steps.get_test_meta.outputs.NODE_COUNT }} \ - --node-vm-size ${{ steps.get_test_meta.outputs.NODE_VM_SIZE }} \ - --node-osdisk-size ${{ steps.get_test_meta.outputs.NODE_OSDISK_SIZE }} \ - --labels pool=${{ steps.get_test_meta.outputs.NODEPOOL_NAME }} \ + --node-count ${{ steps.get_test_meta.outputs.model.node-count }} \ + --node-vm-size ${{ steps.get_test_meta.outputs.model.node-vm-size }} \ + --node-osdisk-size ${{ steps.get_test_meta.outputs.model.node-osdisk-size }} \ + --labels pool=${{ steps.get_test_meta.outputs.model.name }} \ --node-taints sku=gpu:NoSchedule \ --aks-custom-headers UseGPUDedicatedVHD=true else NODEPOOL_STATE=$(az aks nodepool show \ - --name ${{ steps.get_test_meta.outputs.NODEPOOL_NAME }} \ + --name ${{ steps.get_test_meta.outputs.model.name }} \ --cluster-name GitRunner \ --resource-group llm-test \ --query 'provisioningState' -o tsv) @@ -122,7 +121,7 @@ jobs: # Check and Delete AKS Nodepool if it exists NODEPOOL_EXIST=$(az aks nodepool show \ - --name ${{ steps.get_test_meta.outputs.NODEPOOL_NAME }} \ + --name ${{ steps.get_test_meta.outputs.model.name }} \ --cluster-name GitRunner \ --resource-group llm-test \ --query 'name' -o tsv || echo "") @@ -130,7 +129,7 @@ jobs: if [ -n "$NODEPOOL_EXIST" ]; then echo "deleting nodepool" az aks nodepool delete \ - --name ${{ steps.get_test_meta.outputs.NODEPOOL_NAME }} \ + --name ${{ steps.get_test_meta.outputs.model.name }} \ --cluster-name GitRunner \ --resource-group llm-test fi diff --git a/presets/workspace/test/tuning/tuning-job.yaml b/presets/workspace/test/tuning/tuning-job.yaml index 5357bbb67..08298e1df 100644 --- a/presets/workspace/test/tuning/tuning-job.yaml +++ b/presets/workspace/test/tuning/tuning-job.yaml @@ -71,7 +71,7 @@ spec: - emptyDir: {} name: data-volume nodeSelector: - pool: tuning + pool: tuning-example --- apiVersion: v1 kind: ConfigMap