-
Notifications
You must be signed in to change notification settings - Fork 70
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add tuning test to preset test (#741)
**Reason for Change**: <!-- What does this PR improve or fix in Kaito? Why is it needed? --> - add tuning test to preset test - remove extra preset manifests --------- Signed-off-by: jerryzhuang <zhuangqhc@gmail.com>
- Loading branch information
Showing
16 changed files
with
256 additions
and
617 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -397,4 +397,3 @@ jobs: | |
--resource-group llm-test | ||
fi | ||
fi | ||
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
# End-to-end test for preset tuning. The single job below adds a tainted GPU
# node pool to the GitRunner AKS cluster, applies the tuning job manifest,
# polls the Kubernetes Job until it completes or fails, and finally removes
# both the Job and the node pool.
name: E2E Preset tuning Test

on:
  workflow_run:
    workflows: ["Build and Push Preset Models"]
    types:
      - completed
  workflow_dispatch: {}

env:
  GO_VERSION: "1.22"

permissions:
  id-token: write
  contents: read

jobs:
  e2e-preset-tuning-tests:
    # This workflow defines only this job, so it must not declare `needs:` on
    # a job that does not exist here (that makes the whole workflow invalid).
    # Run on manual dispatch, or when the triggering workflow run succeeded.
    if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success'
    runs-on: ubuntu-latest
    environment: preset-env
    steps:
      - name: Checkout
        uses: actions/checkout@v4.2.2
        with:
          submodules: true
          fetch-depth: 0

      - name: 'Az CLI login'
        uses: azure/login@v2.2.0
        with:
          client-id: ${{ secrets.AZURE_CLIENT_ID }}
          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
          allow-no-subscriptions: true

      - name: 'Set ACR Subscription'
        run: az account set --subscription ${{secrets.AZURE_SUBSCRIPTION_ID}}

      - name: Set up kubectl context
        run: |
          az aks get-credentials --resource-group llm-test --name GitRunner

      # Publishes two step outputs:
      #   TAG   - image tag substituted into the tuning job manifest
      #   model - the "tuning-example" entry of the e2e config, as one-line JSON
      - name: Get test meta
        id: get_test_meta
        run: |
          # -c emits compact single-line JSON; a plain `name=value` entry in
          # $GITHUB_OUTPUT cannot carry a multi-line value.
          CONFIG=$(jq -c '.matrix.image[] | select(.name == "tuning-example")' .github/e2e-preset-configs.json)
          if [ -z "$CONFIG" ]; then
            echo "No 'tuning-example' entry found in .github/e2e-preset-configs.json"
            exit 1
          fi
          echo "TAG=0.0.7" >> "$GITHUB_OUTPUT"
          echo "model=$CONFIG" >> "$GITHUB_OUTPUT"

      # Creates the node pool when it is missing; if it already exists it must
      # be in the Succeeded provisioning state, otherwise the step fails.
      - name: Create Nodepool
        env:
          # Step outputs are strings, so the JSON has to be parsed with
          # fromJson before its fields can be dereferenced.
          NODEPOOL_NAME: ${{ fromJson(steps.get_test_meta.outputs.model).name }}
          NODE_COUNT: ${{ fromJson(steps.get_test_meta.outputs.model).node-count }}
          NODE_VM_SIZE: ${{ fromJson(steps.get_test_meta.outputs.model).node-vm-size }}
          NODE_OSDISK_SIZE: ${{ fromJson(steps.get_test_meta.outputs.model).node-osdisk-size }}
        run: |
          # `|| echo ""` turns a failed lookup into an empty name instead of
          # aborting the script.
          NODEPOOL_EXIST=$(az aks nodepool show \
            --name "$NODEPOOL_NAME" \
            --cluster-name GitRunner \
            --resource-group llm-test \
            --query 'name' -o tsv || echo "")
          echo "NODEPOOL_EXIST: $NODEPOOL_EXIST"
          if [ -z "$NODEPOOL_EXIST" ]; then
            az aks nodepool add \
              --name "$NODEPOOL_NAME" \
              --cluster-name GitRunner \
              --resource-group llm-test \
              --node-count "$NODE_COUNT" \
              --node-vm-size "$NODE_VM_SIZE" \
              --node-osdisk-size "$NODE_OSDISK_SIZE" \
              --labels pool="$NODEPOOL_NAME" \
              --node-taints sku=gpu:NoSchedule \
              --aks-custom-headers UseGPUDedicatedVHD=true
          else
            NODEPOOL_STATE=$(az aks nodepool show \
              --name "$NODEPOOL_NAME" \
              --cluster-name GitRunner \
              --resource-group llm-test \
              --query 'provisioningState' -o tsv)
            echo "NODEPOOL_STATE: $NODEPOOL_STATE"
            if [ "$NODEPOOL_STATE" != "Succeeded" ]; then
              echo "Nodepool exists but is not in a Succeeded state. Please check manually."
              exit 1
            else
              echo "Nodepool already exists and is in a running state."
            fi
          fi

      # Fills the REPO_HERE / TAG_HERE placeholders of the manifest in place,
      # then applies it to the cluster.
      - name: Replace repo and Deploy Resource to K8s
        run: |
          sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" presets/workspace/test/tuning/tuning-job.yaml
          sed -i "s/TAG_HERE/${{ steps.get_test_meta.outputs.TAG }}/g" presets/workspace/test/tuning/tuning-job.yaml
          kubectl apply -f presets/workspace/test/tuning/tuning-job.yaml

      # Polls every 10 seconds, at most 60 times, until the Job reports either
      # the `failed` or the `complete` condition.
      - name: Wait for tuning job to be ready
        # `bash {0}` runs without -e, so the non-zero exit codes that
        # `kubectl wait --timeout=0` returns while polling do not end the step.
        shell: bash {0}
        run: |
          retval_complete=1
          retval_failed=1
          count=0
          max_retries=60
          while [[ $retval_complete -ne 0 ]] && [[ $retval_failed -ne 0 ]] && [[ $count -lt $max_retries ]]; do
            sleep 10
            output=$(kubectl wait --for=condition=failed job/tuning-example --timeout=0 2>&1)
            retval_failed=$?
            output=$(kubectl wait --for=condition=complete job/tuning-example --timeout=0 2>&1)
            retval_complete=$?
            count=$((count + 1))
          done
          if [ $retval_failed -eq 0 ]; then
            echo "Job failed. Please check logs."
            exit 1
          elif [ $retval_complete -ne 0 ]; then
            echo "Job timeout."
            exit 1
          else
            echo "Job succeeded."
          fi

      - name: Cleanup
        if: always()
        env:
          # Cleanup also runs when an earlier step failed, in which case the
          # `model` output may be empty; fall back to an empty object so the
          # expression still evaluates (to an empty name) instead of erroring.
          NODEPOOL_NAME: ${{ fromJson(steps.get_test_meta.outputs.model || '{}').name }}
        run: |
          # Best effort: a failed Job deletion (e.g. the Job was never
          # created) must not prevent the node pool from being removed.
          kubectl delete --wait=true --ignore-not-found -f presets/workspace/test/tuning/tuning-job.yaml \
            || echo "kubectl delete failed; continuing with nodepool cleanup"
          # Check and Delete AKS Nodepool if it exists
          NODEPOOL_EXIST=$(az aks nodepool show \
            --name "$NODEPOOL_NAME" \
            --cluster-name GitRunner \
            --resource-group llm-test \
            --query 'name' -o tsv || echo "")
          if [ -n "$NODEPOOL_EXIST" ]; then
            echo "deleting nodepool"
            az aks nodepool delete \
              --name "$NODEPOOL_NAME" \
              --cluster-name GitRunner \
              --resource-group llm-test
          fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
56 changes: 0 additions & 56 deletions
56
presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml
This file was deleted.
Oops, something went wrong.
56 changes: 0 additions & 56 deletions
56
presets/workspace/test/manifests/falcon-40b/falcon-40b.yaml
This file was deleted.
Oops, something went wrong.
55 changes: 0 additions & 55 deletions
55
presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct.yaml
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.