From a7f18970c315b097f97ab04209ef5e5f965e6d1a Mon Sep 17 00:00:00 2001
From: jerryzhuang
Date: Wed, 27 Nov 2024 21:56:56 +1100
Subject: [PATCH 1/2] feat: add tuning test to preset test

Signed-off-by: jerryzhuang
---
 .github/workflows/e2e-preset-test.yml         |   1 -
 .github/workflows/e2e-preset-tuning-test.yml  | 136 ++++++++++++++++++
 .github/workflows/e2e-workflow.yml            |   9 --
 .../falcon-40b-instruct.yaml                  |  56 --------
 .../test/manifests/falcon-40b/falcon-40b.yaml |  56 --------
 .../falcon-7b-instruct.yaml                   |  55 -------
 .../test/manifests/falcon-7b/falcon-7b.yaml   |  55 -------
 .../mistral-7b-instruct.yaml                  |  55 -------
 .../test/manifests/mistral-7b/mistral-7b.yaml |  55 -------
 .../workspace/test/manifests/phi-2/phi-2.yaml |  55 -------
 .../phi-3-medium-128k-instruct.yaml           |  55 -------
 .../phi-3-medium-4k-instruct.yaml             |  55 -------
 .../phi-3-mini-128k-instruct.yaml             |  55 -------
 .../phi-3-mini-4k-instruct.yaml               |  55 -------
 presets/workspace/test/tuning/tuning-job.yaml | 113 +++++++++++++++
 15 files changed, 249 insertions(+), 617 deletions(-)
 create mode 100644 .github/workflows/e2e-preset-tuning-test.yml
 delete mode 100644 presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml
 delete mode 100644 presets/workspace/test/manifests/falcon-40b/falcon-40b.yaml
 delete mode 100644 presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct.yaml
 delete mode 100644 presets/workspace/test/manifests/falcon-7b/falcon-7b.yaml
 delete mode 100644 presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct.yaml
 delete mode 100644 presets/workspace/test/manifests/mistral-7b/mistral-7b.yaml
 delete mode 100644 presets/workspace/test/manifests/phi-2/phi-2.yaml
 delete mode 100644 presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct.yaml
 delete mode 100644 presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct.yaml
 delete mode 100644 presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct.yaml
 delete mode 100644 presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct.yaml
 create mode 100644 presets/workspace/test/tuning/tuning-job.yaml

diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml
index 8c79783da..8583985b3 100644
--- a/.github/workflows/e2e-preset-test.yml
+++ b/.github/workflows/e2e-preset-test.yml
@@ -397,4 +397,3 @@ jobs:
             --resource-group llm-test
           fi
         fi
-
diff --git a/.github/workflows/e2e-preset-tuning-test.yml b/.github/workflows/e2e-preset-tuning-test.yml
new file mode 100644
index 000000000..6446c466c
--- /dev/null
+++ b/.github/workflows/e2e-preset-tuning-test.yml
@@ -0,0 +1,136 @@
+name: E2E Preset Tuning Test
+
+on:
+  workflow_run:
+    workflows: ["Build and Push Preset Models"]
+    types:
+      - completed
+  workflow_dispatch: {}
+
+env:
+  GO_VERSION: "1.22"
+
+permissions:
+  id-token: write
+  contents: read
+
+jobs:
+  e2e-preset-tuning-tests:
+    if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success'
+    runs-on: ubuntu-latest
+    environment: preset-env
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4.2.2
+        with:
+          submodules: true
+          fetch-depth: 0
+
+      - name: 'Az CLI login'
+        uses: azure/login@v2.2.0
+        with:
+          client-id: ${{ secrets.AZURE_CLIENT_ID }}
+          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
+          allow-no-subscriptions: true
+
+      - name: 'Set ACR Subscription'
+        run: az account set --subscription ${{ secrets.AZURE_SUBSCRIPTION_ID }}
+
+      - name: Set up kubectl context
+        run: |
+          az aks get-credentials --resource-group llm-test --name GitRunner
+
+      - name: Get test meta
+        id: get_test_meta
+        run: |
+          echo "TAG=0.0.7" >> $GITHUB_OUTPUT
+          echo "NODEPOOL_NAME=tuning" >> $GITHUB_OUTPUT
+          echo "NODE_COUNT=1" >> $GITHUB_OUTPUT
+          echo "NODE_VM_SIZE=Standard_NC6s_v3" >> $GITHUB_OUTPUT
+          echo "NODE_OSDISK_SIZE=100" >> $GITHUB_OUTPUT
+
+      - name: Create Nodepool
+        run: |
+          NODEPOOL_EXIST=$(az aks nodepool show \
+            --name ${{ steps.get_test_meta.outputs.NODEPOOL_NAME }} \
+            --cluster-name GitRunner \
+            --resource-group llm-test \
+            --query 'name' -o tsv || echo "")
+          echo "NODEPOOL_EXIST: $NODEPOOL_EXIST"
+          if [ -z "$NODEPOOL_EXIST" ]; then
+            az aks nodepool add \
+              --name ${{ steps.get_test_meta.outputs.NODEPOOL_NAME }} \
+              --cluster-name GitRunner \
+              --resource-group llm-test \
+              --node-count ${{ steps.get_test_meta.outputs.NODE_COUNT }} \
+              --node-vm-size ${{ steps.get_test_meta.outputs.NODE_VM_SIZE }} \
+              --node-osdisk-size ${{ steps.get_test_meta.outputs.NODE_OSDISK_SIZE }} \
+              --labels pool=${{ steps.get_test_meta.outputs.NODEPOOL_NAME }} \
+              --node-taints sku=gpu:NoSchedule \
+              --aks-custom-headers UseGPUDedicatedVHD=true
+          else
+            NODEPOOL_STATE=$(az aks nodepool show \
+              --name ${{ steps.get_test_meta.outputs.NODEPOOL_NAME }} \
+              --cluster-name GitRunner \
+              --resource-group llm-test \
+              --query 'provisioningState' -o tsv)
+            echo "NODEPOOL_STATE: $NODEPOOL_STATE"
+            if [ "$NODEPOOL_STATE" != "Succeeded" ]; then
+              echo "Nodepool exists but is not in a Succeeded state. Please check manually."
+              exit 1
+            else
+              echo "Nodepool already exists and is in a running state."
+            fi
+          fi
+
+      - name: Replace repo and Deploy Resource to K8s
+        run: |
+          sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" presets/workspace/test/tuning/tuning-job.yaml
+          sed -i "s/TAG_HERE/${{ steps.get_test_meta.outputs.TAG }}/g" presets/workspace/test/tuning/tuning-job.yaml
+          kubectl apply -f presets/workspace/test/tuning/tuning-job.yaml
+
+      - name: Wait for tuning job to complete
+        shell: bash {0}
+        run: |
+          # Poll every 10s, up to 10 minutes; --timeout=0 makes each kubectl wait a non-blocking check.
+          retval_complete=1
+          retval_failed=1
+          count=0
+          max_retries=60
+          while [[ $retval_complete -ne 0 ]] && [[ $retval_failed -ne 0 ]] && [[ $count -lt $max_retries ]]; do
+            sleep 10
+            output=$(kubectl wait --for=condition=failed job/tuning-example --timeout=0 2>&1)
+            retval_failed=$?
+            output=$(kubectl wait --for=condition=complete job/tuning-example --timeout=0 2>&1)
+            retval_complete=$?
+            count=$((count + 1))
+          done
+
+          if [ $retval_failed -eq 0 ]; then
+            echo "Job failed. Please check logs."
+            exit 1
+          elif [ $retval_complete -ne 0 ]; then
+            echo "Job timed out."
+            exit 1
+          else
+            echo "Job succeeded."
+ fi + - name: Cleanup + if: always() + run: | + kubectl delete --wait=true -f presets/workspace/test/tuning/tuning-job.yaml + + # Check and Delete AKS Nodepool if it exists + NODEPOOL_EXIST=$(az aks nodepool show \ + --name ${{ steps.get_test_meta.outputs.NODEPOOL_NAME }} \ + --cluster-name GitRunner \ + --resource-group llm-test \ + --query 'name' -o tsv || echo "") + + if [ -n "$NODEPOOL_EXIST" ]; then + echo "deleting nodepool" + az aks nodepool delete \ + --name ${{ steps.get_test_meta.outputs.NODEPOOL_NAME }} \ + --cluster-name GitRunner \ + --resource-group llm-test + fi diff --git a/.github/workflows/e2e-workflow.yml b/.github/workflows/e2e-workflow.yml index 182143771..183e76231 100644 --- a/.github/workflows/e2e-workflow.yml +++ b/.github/workflows/e2e-workflow.yml @@ -189,15 +189,6 @@ jobs: KARPENTER_VERSION: ${{ vars.KARPENTER_VERSION }} KARPENTER_NAMESPACE: ${{ env.KARPENTER_NAMESPACE }} - - name: build KAITO image - if: ${{ !inputs.isRelease }} - shell: bash - run: | - make docker-build-workspace - env: - REGISTRY: ${{ env.REGISTRY }} - VERSION: ${{ env.VERSION }} - - name: Install KAITO Workspace helm chart shell: bash run: | diff --git a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml b/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml deleted file mode 100644 index a44043894..000000000 --- a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml +++ /dev/null @@ -1,56 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: falcon-40b-instruct -spec: - progressDeadlineSeconds: 1800 - replicas: 1 - selector: - matchLabels: - app: falcon - template: - metadata: - labels: - app: falcon - spec: - containers: - - name: falcon-container - image: REPO_HERE.azurecr.io/falcon-40b-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 - resources: - requests: - nvidia.com/gpu: 2 - limits: - nvidia.com/gpu: 2 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: falcon40bins diff --git a/presets/workspace/test/manifests/falcon-40b/falcon-40b.yaml b/presets/workspace/test/manifests/falcon-40b/falcon-40b.yaml deleted file mode 100644 index 514d12e60..000000000 --- a/presets/workspace/test/manifests/falcon-40b/falcon-40b.yaml +++ /dev/null @@ -1,56 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: falcon-40b -spec: - progressDeadlineSeconds: 1800 - replicas: 1 - selector: - matchLabels: - app: falcon - template: - metadata: - labels: - app: falcon - spec: - containers: - - name: falcon-container - image: REPO_HERE.azurecr.io/falcon-40b:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 - resources: - requests: - nvidia.com/gpu: 2 - limits: - nvidia.com/gpu: 2 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 
600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: falcon40b diff --git a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct.yaml b/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct.yaml deleted file mode 100644 index 399b78a3c..000000000 --- a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: falcon-7b-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: falcon - template: - metadata: - labels: - app: falcon - spec: - containers: - - name: falcon-container - image: REPO_HERE.azurecr.io/falcon-7b-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: falcon7binst diff --git a/presets/workspace/test/manifests/falcon-7b/falcon-7b.yaml b/presets/workspace/test/manifests/falcon-7b/falcon-7b.yaml deleted file mode 100644 index 8e5786c6e..000000000 --- a/presets/workspace/test/manifests/falcon-7b/falcon-7b.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: falcon-7b -spec: - replicas: 1 - selector: - matchLabels: - app: falcon - template: - metadata: - labels: - app: falcon - spec: - containers: - - name: falcon-container - image: REPO_HERE.azurecr.io/falcon-7b:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: falcon7b diff --git a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct.yaml b/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct.yaml deleted file mode 100644 index 75179683f..000000000 --- a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - 
name: mistral-7b-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: mistral - template: - metadata: - labels: - app: mistral - spec: - containers: - - name: mistral-instruct-container - image: REPO_HERE.azurecr.io/mistral-7b-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: mistral7bins diff --git a/presets/workspace/test/manifests/mistral-7b/mistral-7b.yaml b/presets/workspace/test/manifests/mistral-7b/mistral-7b.yaml deleted file mode 100644 index 3eff5594f..000000000 --- a/presets/workspace/test/manifests/mistral-7b/mistral-7b.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: mistral-7b -spec: - replicas: 1 - selector: - matchLabels: - app: mistral - template: - metadata: - labels: - app: mistral - spec: - containers: - - name: mistral-container - image: REPO_HERE.azurecr.io/mistral-7b:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: mistral7b diff --git a/presets/workspace/test/manifests/phi-2/phi-2.yaml b/presets/workspace/test/manifests/phi-2/phi-2.yaml deleted file mode 100644 index cbc6f94e7..000000000 --- a/presets/workspace/test/manifests/phi-2/phi-2.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-2 -spec: - replicas: 1 - selector: - matchLabels: - app: phi-2 - template: - metadata: - labels: - app: phi-2 - spec: - containers: - - name: phi-2-container - image: REPO_HERE.azurecr.io/phi-2:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 # Requesting 1 GPU - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: 
NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi2 diff --git a/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct.yaml b/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct.yaml deleted file mode 100644 index 0adb122e4..000000000 --- a/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-3-medium-128k-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: phi-3-medium-128k-instruct - template: - metadata: - labels: - app: phi-3-medium-128k-instruct - spec: - containers: - - name: phi-3-medium-128k-instruct-container - image: REPO_HERE.azurecr.io/phi-3-medium-128k-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 # Requesting 1 GPU - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi3medium12 \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct.yaml b/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct.yaml deleted file mode 100644 index 1d0d64e47..000000000 --- a/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-3-medium-4k-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: phi-3-medium-4k-instruct - template: - metadata: - labels: - app: phi-3-medium-4k-instruct - spec: - containers: - - name: phi-3-medium-4k-instruct-container - image: REPO_HERE.azurecr.io/phi-3-medium-4k-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 # Requesting 1 GPU - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi3medium4k \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct.yaml b/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct.yaml deleted file mode 100644 index cf8898015..000000000 
--- a/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-3-mini-128k-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: phi-3-mini-128k-instruct - template: - metadata: - labels: - app: phi-3-mini-128k-instruct - spec: - containers: - - name: phi-3-mini-128k-instruct-container - image: REPO_HERE.azurecr.io/phi-3-mini-128k-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 # Requesting 1 GPU - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi3mini128k \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct.yaml b/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct.yaml deleted file mode 100644 index 1d7069a38..000000000 --- a/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-3-mini-4k-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: phi-3-mini-4k-instruct - template: - metadata: - labels: - app: phi-3-mini-4k-instruct - spec: - containers: - - name: phi-3-mini-4k-instruct-container - image: REPO_HERE.azurecr.io/phi-3-mini-4k-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 # Requesting 1 GPU - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi3mini4kin \ No newline at end of file diff --git a/presets/workspace/test/tuning/tuning-job.yaml b/presets/workspace/test/tuning/tuning-job.yaml new file mode 100644 index 000000000..5357bbb67 --- /dev/null +++ b/presets/workspace/test/tuning/tuning-job.yaml @@ -0,0 +1,113 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: tuning-example +spec: + backoffLimit: 0 + completionMode: NonIndexed + completions: 1 + manualSelector: false + parallelism: 1 + podReplacementPolicy: TerminatingOrFailed + suspend: false + template: + spec: + containers: + - command: + - /bin/sh + - -c + - cd /workspace/tfs && python3 metrics_server.py & accelerate launch --num_processes=1 /workspace/tfs/fine_tuning.py + env: + - 
name: DEFAULT_TARGET_MODULES
+          value: query_key_value
+        - name: PYTORCH_CUDA_ALLOC_CONF
+          value: expandable_segments:True
+        image: REPO_HERE.azurecr.io/falcon-7b:TAG_HERE
+        imagePullPolicy: Always
+        name: tuning
+        ports:
+        - containerPort: 5000
+          protocol: TCP
+        resources:
+          limits:
+            nvidia.com/gpu: "1"
+          requests:
+            nvidia.com/gpu: "1"
+        volumeMounts:
+        - mountPath: /mnt/config
+          name: config-volume
+        - mountPath: /mnt/results
+          name: results-volume
+        - mountPath: /mnt/data
+          name: data-volume
+      dnsPolicy: ClusterFirst
+      initContainers:
+      - command:
+        - sh
+        - -c
+        - ls -la /data && cp -r /data/* /mnt/data && ls -la /mnt/data
+        image: REPO_HERE.azurecr.io/e2e-dataset:0.0.1
+        imagePullPolicy: IfNotPresent
+        name: data-extractor
+        volumeMounts:
+        - mountPath: /mnt/data
+          name: data-volume
+      restartPolicy: Never
+      terminationGracePeriodSeconds: 30
+      tolerations:
+      - effect: NoSchedule
+        key: gpu
+        operator: Equal
+      - effect: NoSchedule
+        key: sku
+        value: gpu
+      volumes:
+      - configMap:
+          defaultMode: 420
+          name: e2e-qlora-params-template
+        name: config-volume
+      - emptyDir: {}
+        name: results-volume
+      - emptyDir: {}
+        name: data-volume
+      nodeSelector:
+        pool: tuning
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: e2e-qlora-params-template
+data:
+  training_config.yaml: |
+    training_config:
+      ModelConfig:
+        torch_dtype: "bfloat16"
+        local_files_only: true
+        device_map: "auto"
+        chat_template: "/workspace/chat_templates/falcon-instruct.jinja"
+
+      QuantizationConfig:
+        load_in_4bit: true
+        bnb_4bit_quant_type: "nf4"
+        bnb_4bit_compute_dtype: "bfloat16"
+        bnb_4bit_use_double_quant: true
+
+      LoraConfig:
+        r: 8
+        lora_alpha: 8
+        lora_dropout: 0.0
+        target_modules: ['query_key_value']
+
+      TrainingArguments:
+        output_dir: "/mnt/results"
+        ddp_find_unused_parameters: false
+        save_strategy: "epoch"
+        per_device_train_batch_size: 1
+        max_steps: 2 # limit training to 2 steps to keep the e2e run short
+
+      DataCollator:
+        mlm: true
+
+      DatasetConfig:
+        shuffle_dataset: true
+        train_test_split: 1
\ No newline at end of file

From 882de5857429a3ef5892aff849b7b378563898ad Mon Sep 17 00:00:00 2001
From: jerryzhuang
Date: Thu, 28 Nov 2024 10:09:14 +1100
Subject: [PATCH 2/2] feat: use preset config for tuning test

Signed-off-by: jerryzhuang
---
 .github/e2e-preset-configs.json               |  8 ++++++
 .github/workflows/e2e-preset-tuning-test.yml  | 25 +++++++++----------
 presets/workspace/test/tuning/tuning-job.yaml |  2 +-
 3 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/.github/e2e-preset-configs.json b/.github/e2e-preset-configs.json
index 972c5e7ad..f247aa7ac 100644
--- a/.github/e2e-preset-configs.json
+++ b/.github/e2e-preset-configs.json
@@ -129,6 +129,14 @@
             "node-osdisk-size": 150,
             "OSS": false,
             "loads_adapter": false
+          },
+          {
+            "name": "tuning-example",
+            "node-count": 1,
+            "node-vm-size": "Standard_NC6s_v3",
+            "node-osdisk-size": 100,
+            "OSS": true,
+            "loads_adapter": false
           }
         ]
       }
diff --git a/.github/workflows/e2e-preset-tuning-test.yml b/.github/workflows/e2e-preset-tuning-test.yml
index 6446c466c..2379980ea 100644
--- a/.github/workflows/e2e-preset-tuning-test.yml
+++ b/.github/workflows/e2e-preset-tuning-test.yml
@@ -44,34 +44,33 @@ jobs:
       - name: Get test meta
         id: get_test_meta
         run: |
+          CONFIG=$(jq -c '.matrix.image[] | select(.name == "tuning-example")' .github/e2e-preset-configs.json)
+
           echo "TAG=0.0.7" >> $GITHUB_OUTPUT
-          echo "NODEPOOL_NAME=tuning" >> $GITHUB_OUTPUT
-          echo "NODE_COUNT=1" >> $GITHUB_OUTPUT
-          echo "NODE_VM_SIZE=Standard_NC6s_v3" >> $GITHUB_OUTPUT
-          echo "NODE_OSDISK_SIZE=100" >> $GITHUB_OUTPUT
+          echo "model=$CONFIG" >> $GITHUB_OUTPUT
"model=$CONFIG" >> $GITHUB_OUTPUT - name: Create Nodepool run: | NODEPOOL_EXIST=$(az aks nodepool show \ - --name ${{ steps.get_test_meta.outputs.NODEPOOL_NAME }} \ + --name ${{ steps.get_test_meta.outputs.model.name }} \ --cluster-name GitRunner \ --resource-group llm-test \ --query 'name' -o tsv || echo "") echo "NODEPOOL_EXIST: $NODEPOOL_EXIST" if [ -z "$NODEPOOL_EXIST" ]; then az aks nodepool add \ - --name ${{ steps.get_test_meta.outputs.NODEPOOL_NAME }} \ + --name ${{ steps.get_test_meta.outputs.model.name }} \ --cluster-name GitRunner \ --resource-group llm-test \ - --node-count ${{ steps.get_test_meta.outputs.NODE_COUNT }} \ - --node-vm-size ${{ steps.get_test_meta.outputs.NODE_VM_SIZE }} \ - --node-osdisk-size ${{ steps.get_test_meta.outputs.NODE_OSDISK_SIZE }} \ - --labels pool=${{ steps.get_test_meta.outputs.NODEPOOL_NAME }} \ + --node-count ${{ steps.get_test_meta.outputs.model.node-count }} \ + --node-vm-size ${{ steps.get_test_meta.outputs.model.node-vm-size }} \ + --node-osdisk-size ${{ steps.get_test_meta.outputs.model.node-osdisk-size }} \ + --labels pool=${{ steps.get_test_meta.outputs.model.name }} \ --node-taints sku=gpu:NoSchedule \ --aks-custom-headers UseGPUDedicatedVHD=true else NODEPOOL_STATE=$(az aks nodepool show \ - --name ${{ steps.get_test_meta.outputs.NODEPOOL_NAME }} \ + --name ${{ steps.get_test_meta.outputs.model.name }} \ --cluster-name GitRunner \ --resource-group llm-test \ --query 'provisioningState' -o tsv) @@ -122,7 +121,7 @@ jobs: # Check and Delete AKS Nodepool if it exists NODEPOOL_EXIST=$(az aks nodepool show \ - --name ${{ steps.get_test_meta.outputs.NODEPOOL_NAME }} \ + --name ${{ steps.get_test_meta.outputs.model.name }} \ --cluster-name GitRunner \ --resource-group llm-test \ --query 'name' -o tsv || echo "") @@ -130,7 +129,7 @@ jobs: if [ -n "$NODEPOOL_EXIST" ]; then echo "deleting nodepool" az aks nodepool delete \ - --name ${{ steps.get_test_meta.outputs.NODEPOOL_NAME }} \ + --name ${{ steps.get_test_meta.outputs.model.name }} \ --cluster-name GitRunner \ --resource-group llm-test fi diff --git a/presets/workspace/test/tuning/tuning-job.yaml b/presets/workspace/test/tuning/tuning-job.yaml index 5357bbb67..08298e1df 100644 --- a/presets/workspace/test/tuning/tuning-job.yaml +++ b/presets/workspace/test/tuning/tuning-job.yaml @@ -71,7 +71,7 @@ spec: - emptyDir: {} name: data-volume nodeSelector: - pool: tuning + pool: tuning-example --- apiVersion: v1 kind: ConfigMap