# E2E Preset Test #362
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# End-to-end test workflow for preset model images.
name: E2E Preset Test

# Triggers:
#   - workflow_run: fires when the "Build and Push Preset Models" workflow
#     completes (the jobs themselves check that its conclusion was 'success').
#   - workflow_dispatch: manual run from the Actions UI.
on:
  workflow_run:
    workflows: ["Build and Push Preset Models"]
    types:
      - completed
  workflow_dispatch:

# Quoted so YAML keeps the string "1.20" instead of parsing the float 1.2.
# NOTE(review): GO_VERSION is not referenced by any step in this workflow.
env:
  GO_VERSION: "1.20"

# id-token: write is presumably needed for the OIDC-based azure/login step
# (it logs in with client-id/tenant-id and no client secret) — TODO confirm.
# contents: read allows the checkout steps to read the repository.
permissions:
  id-token: write
  contents: read
jobs:
  # Builds the job matrix: every model listed in
  # presets/models/supported_models.yaml merged with its matching entry
  # (same "name") from .github/e2e-preset-configs.json.
  determine-models:
    if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success'
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.images.outputs.matrix }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: true
          fetch-depth: 0

      - name: Determine Images for Testing
        id: images
        run: |
          echo "Setting image tag based on presets/models/supported_models.yaml"
          MATRIX=$(yq e -o=json '.models' presets/models/supported_models.yaml | jq -c '.')

          # Read the additional configurations from e2e-preset-configs.json
          CONFIGS=$(jq -c '.matrix.image' .github/e2e-preset-configs.json)

          # Pseudocode for combining matrices
          # COMBINED_MATRIX = []
          # for model in MATRIX:
          #   for config in CONFIGS:
          #     if config['name'] == model['name']:
          #       combined = {**model, **config}
          #       COMBINED_MATRIX.append(combined)
          #       break
          # Note: unlike the pseudocode's `break`, the jq filter emits one
          # combined entry for every config whose name matches the model.
          # Models without a matching config are dropped from the matrix.
          COMBINED_MATRIX=$(echo "$MATRIX" | jq --argjson configs "$CONFIGS" -c '
            map(. as $model | $configs[] | select(.name == $model.name) | $model + .)
          ')

          echo "matrix=$COMBINED_MATRIX" >> "$GITHUB_OUTPUT"

      - name: Print Combined Matrix
        # Passed through env rather than interpolated into the script so a
        # quote character inside the JSON cannot break the shell command.
        env:
          COMBINED_MATRIX: ${{ steps.images.outputs.matrix }}
        run: |
          echo "Combined Matrix:"
          echo "$COMBINED_MATRIX"

  # For each model in the matrix: if its image tag exists in the Test ACR but
  # not yet in the Prod ACR, deploy it on a dedicated AKS nodepool, exercise
  # its HTTP endpoints and (manual runs only) import the image into Prod ACR.
  e2e-preset-tests:
    if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success'
    needs: determine-models
    runs-on: ubuntu-latest
    environment: e2e-test
    strategy:
      # Let the remaining models keep running when one model fails.
      fail-fast: false
      matrix:
        # Ex matrix element:
        # {"name":"falcon-40b","type":"text-generation","version":"#",
        # "runtime":"tfs","tag":"0.0.1","node-count":1,
        # "node-vm-size":"Standard_NC96ads_A100_v4", "node-osdisk-size":400}
        model: ${{ fromJson(needs.determine-models.outputs.matrix) }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: true
          fetch-depth: 0

      - name: Install Azure CLI latest
        run: |
          if ! which az > /dev/null; then
            echo "Azure CLI not found. Installing..."
            curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
          else
            echo "Azure CLI already installed."
          fi

      - name: 'Az CLI login'
        uses: azure/login@v1.6.1
        with:
          client-id: ${{ secrets.AZURE_CLIENT_ID }}
          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
          allow-no-subscriptions: true

      - name: 'Set subscription'
        run: az account set --subscription ${{ secrets.AZURE_SUBSCRIPTION_ID }}

      # Sets output IMAGE_EXISTS=true|false for <name>:<tag> in the Test ACR.
      - name: 'Check if Image exists in Test ACR'
        id: check_test_image
        run: |
          ACR_NAME=${{ secrets.ACR_AMRT_USERNAME }}
          IMAGE_NAME=${{ matrix.model.name }}
          TAG=${{ matrix.model.tag }}

          # Use '|| true' to prevent script from exiting with an error if the repository is not found
          TAGS=$(az acr repository show-tags -n "$ACR_NAME" --repository "$IMAGE_NAME" --output tsv || true)

          if [[ -z "$TAGS" ]]; then
            echo "Image $IMAGE_NAME:$TAG or repository not found in $ACR_NAME."
            echo "IMAGE_EXISTS=false" >> "$GITHUB_OUTPUT"
          # -F: compare the tag literally (the dots in "0.0.1" must not act as
          # regex wildcards); -x: the whole line has to equal the tag.
          elif grep -Fxq -- "$TAG" <<< "$TAGS"; then
            echo "IMAGE_EXISTS=true" >> "$GITHUB_OUTPUT"
          else
            echo "IMAGE_EXISTS=false" >> "$GITHUB_OUTPUT"
            echo "Image $IMAGE_NAME:$TAG not found in $ACR_NAME."
          fi

      # Same check as above, against the Prod ACR.
      - name: 'Check if Image exists in Prod ACR'
        id: check_prod_image
        run: |
          ACR_NAME=${{ secrets.ACR_AMR_USERNAME }}
          IMAGE_NAME=${{ matrix.model.name }}
          TAG=${{ matrix.model.tag }}

          # Use '|| true' to prevent script from exiting with an error if the repository is not found
          TAGS=$(az acr repository show-tags -n "$ACR_NAME" --repository "$IMAGE_NAME" --output tsv || true)

          if [[ -z "$TAGS" ]]; then
            echo "Image $IMAGE_NAME:$TAG or repository not found in $ACR_NAME."
            echo "IMAGE_EXISTS=false" >> "$GITHUB_OUTPUT"
          # -F: compare the tag literally (the dots in "0.0.1" must not act as
          # regex wildcards); -x: the whole line has to equal the tag.
          elif grep -Fxq -- "$TAG" <<< "$TAGS"; then
            echo "IMAGE_EXISTS=true" >> "$GITHUB_OUTPUT"
          else
            echo "IMAGE_EXISTS=false" >> "$GITHUB_OUTPUT"
            echo "Image $IMAGE_NAME:$TAG not found in $ACR_NAME."
          fi

      - name: Check if Image is Test and Prod ACRs
        if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && steps.check_prod_image.outputs.IMAGE_EXISTS == 'true'
        run: |
          echo "Skipping: Image already exists in both Test and Prod ACRs, remember to bump tag"

      # All deployment/test steps below run only when the image is in the
      # Test ACR and not yet in the Prod ACR.
      - name: Set up kubectl context
        if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && steps.check_prod_image.outputs.IMAGE_EXISTS == 'false'
        run: |
          az aks get-credentials --resource-group llm-test --name GitRunner

      # Derives the nodepool name from the model name: dashes removed, then
      # only the last 12 characters kept when the result is longer than 12.
      - name: Get Nodepool Name
        if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && steps.check_prod_image.outputs.IMAGE_EXISTS == 'false'
        id: get_nodepool_name
        run: |
          NAME_SUFFIX=${{ matrix.model.name }}
          NAME_SUFFIX_WITHOUT_DASHES=${NAME_SUFFIX//-/}  # Removing all '-' symbols

          if [ ${#NAME_SUFFIX_WITHOUT_DASHES} -gt 12 ]; then
            TRUNCATED_NAME_SUFFIX=${NAME_SUFFIX_WITHOUT_DASHES: -12}
          else
            TRUNCATED_NAME_SUFFIX=$NAME_SUFFIX_WITHOUT_DASHES
          fi
          echo "Nodepool Name: $TRUNCATED_NAME_SUFFIX"
          echo "NODEPOOL_NAME=$TRUNCATED_NAME_SUFFIX" >> "$GITHUB_OUTPUT"

      # Creates the nodepool when it does not exist; when it does, requires
      # its provisioningState to be "Succeeded" and fails the job otherwise.
      - name: Create Nodepool
        if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && steps.check_prod_image.outputs.IMAGE_EXISTS == 'false'
        run: |
          # '|| echo ""' turns a failed lookup (nodepool missing) into an
          # empty string instead of aborting the step.
          NODEPOOL_EXIST=$(az aks nodepool show \
                          --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \
                          --cluster-name GitRunner \
                          --resource-group llm-test \
                          --query 'name' -o tsv || echo "")
          echo "NODEPOOL_EXIST: $NODEPOOL_EXIST"
          if [ -z "$NODEPOOL_EXIST" ]; then
            az aks nodepool add \
              --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \
              --cluster-name GitRunner \
              --resource-group llm-test \
              --node-count ${{ matrix.model.node-count }} \
              --node-vm-size ${{ matrix.model.node-vm-size }} \
              --node-osdisk-size ${{ matrix.model.node-osdisk-size }} \
              --labels pool=${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \
              --node-taints sku=gpu:NoSchedule \
              --aks-custom-headers UseGPUDedicatedVHD=true
          else
            NODEPOOL_STATE=$(az aks nodepool show \
                            --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \
                            --cluster-name GitRunner \
                            --resource-group llm-test \
                            --query 'provisioningState' -o tsv)
            echo "NODEPOOL_STATE: $NODEPOOL_STATE"
            if [ "$NODEPOOL_STATE" != "Succeeded" ]; then
              echo "Nodepool exists but is not in a Succeeded state. Please check manually."
              exit 1
            else
              echo "Nodepool already exists and is in a running state."
            fi
          fi

      - name: Create Service
        if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && steps.check_prod_image.outputs.IMAGE_EXISTS == 'false'
        run: kubectl apply -f presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}-service.yaml

      # Polls the Service until its load balancer reports an ingress IP and
      # exposes it as output SERVICE_IP.
      - name: Retrieve External Service IP
        if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && steps.check_prod_image.outputs.IMAGE_EXISTS == 'false'
        id: get_ip
        run: |
          SERVICE_IP=""
          # Bounded wait: 120 attempts x 5s (about 10 minutes) so a load
          # balancer that never gets an IP fails the step instead of
          # spinning until the job-level timeout.
          for attempt in $(seq 1 120); do
            SERVICE_IP=$(kubectl get svc ${{ matrix.model.name }} -o=jsonpath='{.status.loadBalancer.ingress[0].ip}')
            if [[ -n "$SERVICE_IP" ]]; then
              break
            fi
            sleep 5
          done
          if [[ -z "$SERVICE_IP" ]]; then
            echo "Timed out waiting for an external IP on service ${{ matrix.model.name }}." >&2
            exit 1
          fi
          echo "Service IP is $SERVICE_IP"
          echo "SERVICE_IP=$SERVICE_IP" >> "$GITHUB_OUTPUT"

      # Fills the MASTER_ADDR_HERE / TAG_HERE / REPO_HERE placeholders in the
      # model's statefulset manifest, then applies it.
      - name: Replace IP and Deploy Statefulset to K8s
        if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && steps.check_prod_image.outputs.IMAGE_EXISTS == 'false'
        run: |
          MANIFEST=presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}-statefulset.yaml
          sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" "$MANIFEST"
          sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" "$MANIFEST"
          sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" "$MANIFEST"
          kubectl apply -f "$MANIFEST"

      - name: Wait for Statefulset to be ready
        if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && steps.check_prod_image.outputs.IMAGE_EXISTS == 'false'
        run: |
          kubectl rollout status statefulset/${{ matrix.model.name }}

      # The endpoint tests use `curl --fail` so an HTTP 4xx/5xx response
      # makes curl exit non-zero and fails the step.
      - name: Test home endpoint
        if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && steps.check_prod_image.outputs.IMAGE_EXISTS == 'false'
        run: |
          curl --fail http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/

      - name: Test healthz endpoint
        if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && steps.check_prod_image.outputs.IMAGE_EXISTS == 'false'
        run: |
          curl --fail http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/healthz

      # Request shape and path depend on the model name:
      #   *llama* and *-chat*  -> POST /chat with input_data.input_string
      #   *llama* (other)      -> POST /generate with prompts/parameters
      #   anything else        -> POST /chat with prompt/generate_kwargs
      - name: Test inference endpoint
        if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && steps.check_prod_image.outputs.IMAGE_EXISTS == 'false'
        run: |
          if [[ "${{ matrix.model.name }}" == *"llama"* && "${{ matrix.model.name }}" == *"-chat"* ]]; then
            echo "Testing inference for ${{ matrix.model.name }}"
            curl --fail -X POST \
              -H "Content-Type: application/json" \
              -d '{
                "input_data": {
                  "input_string": [
                    [
                      {
                        "role": "system",
                        "content": "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe."
                      },
                      {
                        "role": "user",
                        "content": "Write a brief birthday message to John"
                      }
                    ]
                  ]
                }
              }' \
              http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/chat
          elif [[ "${{ matrix.model.name }}" == *"llama"* ]]; then
            echo "Testing inference for ${{ matrix.model.name }}"
            curl --fail -X POST \
              -H "Content-Type: application/json" \
              -d '{
                "prompts": [
                  "I believe the meaning of life is",
                  "Simply put, the theory of relativity states that ",
                  "A brief message congratulating the team on the launch: Hi everyone, I just ",
                  "Translate English to French: sea otter => loutre de mer, peppermint => menthe poivrée, plush girafe => girafe peluche, cheese =>"
                ],
                "parameters": {
                  "max_gen_len": 128
                }
              }' \
              http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/generate
          else
            echo "Testing inference for ${{ matrix.model.name }}"
            curl --fail -X POST \
              -H "accept: application/json" \
              -H "Content-Type: application/json" \
              -d '{
                "prompt":"Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\nDaniel: Hello, Girafatron!\nGirafatron:",
                "return_full_text": false,
                "clean_up_tokenization_spaces": false,
                "prefix": null,
                "handle_long_generation": null,
                "generate_kwargs": {
                  "max_length":200,
                  "min_length":0,
                  "do_sample":true,
                  "early_stopping":false,
                  "num_beams":1,
                  "num_beam_groups":1,
                  "diversity_penalty":0.0,
                  "temperature":1.0,
                  "top_k":10,
                  "top_p":1,
                  "typical_p":1,
                  "repetition_penalty":1,
                  "length_penalty":1,
                  "no_repeat_ngram_size":0,
                  "encoder_no_repeat_ngram_size":0,
                  "bad_words_ids":null,
                  "num_return_sequences":1,
                  "output_scores":false,
                  "return_dict_in_generate":false,
                  "forced_bos_token_id":null,
                  "forced_eos_token_id":null,
                  "remove_invalid_values":null
                }
              }' \
              http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/chat
          fi

      - name: Move from Test to Prod ACR
        if: steps.check_test_image.outputs.IMAGE_EXISTS == 'true' && steps.check_prod_image.outputs.IMAGE_EXISTS == 'false' && github.event_name == 'workflow_dispatch'
        run: |
          # This should only run if:
          # 1. All prior steps have succeeded (Given)
          # 2. Image exists in test ACR repo but not Prod
          # 3. Workflow was triggered manually (workflow_dispatch)

          TEST_ACR_NAME=${{ secrets.ACR_AMRT_USERNAME }}
          PROD_ACR_NAME=${{ secrets.ACR_AMR_USERNAME }}
          IMAGE_NAME=${{ matrix.model.name }}
          TAG=${{ matrix.model.tag }}

          # Formulate the source image reference
          SOURCE_IMAGE="$TEST_ACR_NAME.azurecr.io/$IMAGE_NAME:$TAG"

          # Import the image from Test ACR to Prod ACR
          az acr import --name "$PROD_ACR_NAME" --source "$SOURCE_IMAGE" --image "$IMAGE_NAME:$TAG"

      # Runs regardless of earlier failures. Each resource is probed first, so
      # the step does not fail when a resource was never created (or when no
      # kubectl context was set up).
      - name: Cleanup
        if: always()
        run: |
          # Check and Delete K8s Service if it exists
          if kubectl get svc ${{ matrix.model.name }} > /dev/null 2>&1; then
            kubectl delete svc ${{ matrix.model.name }}
          fi

          # Check and Delete K8s StatefulSet if it exists
          if kubectl get statefulset ${{ matrix.model.name }} > /dev/null 2>&1; then
            kubectl delete statefulset ${{ matrix.model.name }}
          fi

          # Check and Delete AKS Nodepool if it exists
          if [ -n "${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }}" ]; then
            NODEPOOL_EXIST=$(az aks nodepool show \
                            --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \
                            --cluster-name GitRunner \
                            --resource-group llm-test \
                            --query 'name' -o tsv || echo "")

            if [ -n "$NODEPOOL_EXIST" ]; then
              az aks nodepool delete \
                --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \
                --cluster-name GitRunner \
                --resource-group llm-test
            fi
          fi