Skip to content

Commit

Permalink
ci: Add support for Karpenter in the kaito pipelines (#569)
Browse files Browse the repository at this point in the history
**Reason for Change**:
- Update the e2e pipelines so they can test both the gpu-provisioner and
Karpenter use cases. This is done by adding a new `nodeprovisioner` input
variable to the `workflow_call` trigger.
- Add a make target to install Karpenter.
- Unify the variable names used in e2e tests.
- Simplify the e2e pipeline by using the `generate-identities` make target.
- Use the GitHub variable `AKS_K8S_VERSION` to pass the k8s version.

**Requirements**

- [ ] added unit tests and e2e tests (if applicable).

**Issue Fixed**:
<!-- If this PR fixes GitHub issue 4321, add "Fixes #4321" to the next
line. -->

**Notes for Reviewers**:

---------

Signed-off-by: Heba Elayoty <hebaelayoty@gmail.com>
  • Loading branch information
helayoty authored Aug 20, 2024
1 parent 4e370fc commit 3db011e
Show file tree
Hide file tree
Showing 5 changed files with 196 additions and 95 deletions.
97 changes: 68 additions & 29 deletions .github/workflows/e2e-workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ on:
git_sha:
type: string
required: true
nodeprovisioner:
type: string
required: true
tag:
type: string
isRelease:
Expand All @@ -19,7 +22,6 @@ on:
default: "eastus"
k8s_version:
type: string
default: "1.29.2"
secrets:
E2E_CLIENT_ID:
required: true
Expand All @@ -34,19 +36,18 @@ on:
E2E_ACR_AMRT_PASSWORD:
required: true

permissions:
contents: read # This is required for actions/checkout

jobs:
e2e-tests:
runs-on: ubuntu-latest
name: e2e-tests-${{ inputs.nodeprovisioner }}
permissions:
contents: read
id-token: write # This is required for requesting the JWT
environment: e2e-test
env:
GO_VERSION: "1.22"

KARPENTER_NAMESPACE: "karpenter"
GPU_PROVISIONER_NAMESPACE: "gpu-provisioner"
steps:
- name: Harden Runner
uses: step-security/harden-runner@5c7944e73c4c2a096b17a9cb74d65b6c2bbafbde # v2.9.1
Expand All @@ -67,15 +68,15 @@ jobs:
fi
echo "VERSION=${rand}" >> $GITHUB_ENV
echo "CLUSTER_NAME=kaito${rand}" >> $GITHUB_ENV
echo "CLUSTER_NAME=${{ inputs.nodeprovisioner }}${rand}" >> $GITHUB_ENV
echo "REGISTRY=${{ inputs.nodeprovisioner }}${rand}.azurecr.io" >> $GITHUB_ENV
echo "RUN_LLAMA_13B=false" >> $GITHUB_ENV
echo "REGISTRY=kaito${rand}.azurecr.io" >> $GITHUB_ENV
- name: Set Registry
if: ${{ inputs.isRelease }}
run: |
echo "REGISTRY=${{ inputs.registry }}" >> $GITHUB_ENV
echo "VERSION=$(echo ${{ inputs.tag }} | tr -d v)" >> $GITHUB_ENV
echo "REGISTRY=${{ inputs.registry }}" >> $GITHUB_ENV
echo "VERSION=$(echo ${{ inputs.tag }} | tr -d v)" >> $GITHUB_ENV
- name: Set up Go ${{ env.GO_VERSION }}
uses: actions/setup-go@v5.0.2
Expand Down Expand Up @@ -113,7 +114,7 @@ jobs:
uses: azure/CLI@v2.0.0
with:
inlineScript: |
az identity create --name gpuIdentity --resource-group ${{ env.CLUSTER_NAME }}
az identity create --name ${{ inputs.nodeprovisioner }}Identity --resource-group ${{ env.CLUSTER_NAME }}
- name: Generate APIs
run: |
Expand All @@ -127,14 +128,14 @@ jobs:
env:
REGISTRY: ${{ env.REGISTRY }}
VERSION: ${{ env.VERSION }}

- name: build adapter image
shell: bash
run: |
make docker-build-adapter
env:
REGISTRY: ${{ env.CLUSTER_NAME }}.azurecr.io

- name: build dataset image
shell: bash
run: |
Expand All @@ -145,46 +146,74 @@ jobs:
- name: create cluster
shell: bash
run: |
make create-aks-cluster
if [ "${{ inputs.nodeprovisioner }}" == "gpuprovisioner" ]; then
make create-aks-cluster
else
make create-aks-cluster-for-karpenter
fi
env:
AZURE_ACR_NAME: ${{ env.CLUSTER_NAME }}
AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }}
AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }}
AZURE_LOCATION: ${{ inputs.region }}
AKS_K8S_VERSION: ${{ inputs.k8s_version }}

- name: Az login
uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # v2.1.1
with:
client-id: ${{ secrets.E2E_CLIENT_ID }}
tenant-id: ${{ secrets.E2E_TENANT_ID }}
subscription-id: ${{ secrets.E2E_SUBSCRIPTION_ID }}

- name: Create Identities and Permissions for ${{ inputs.nodeprovisioner }}
shell: bash
run: |
make generate-identities
env:
AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }}
AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }}
TEST_SUITE: ${{ inputs.nodeprovisioner }}
AZURE_SUBSCRIPTION_ID: ${{ secrets.E2E_SUBSCRIPTION_ID }}

- name: Install gpu-provisioner helm chart
if: ${{ inputs.nodeprovisioner == 'gpuprovisioner' }}
shell: bash
run: |
make gpu-provisioner-helm
kubectl wait --for=condition=available deploy "gpu-provisioner" -n gpu-provisioner --timeout=300s
env:
AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }}
AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }}
AZURE_TENANT_ID: ${{ secrets.E2E_TENANT_ID }}
AZURE_SUBSCRIPTION_ID: ${{ secrets.E2E_SUBSCRIPTION_ID }}
GPU_PROVISIONER_VERSION: ${{ vars.GPU_PROVISIONER_VERSION }}

- name: Install karpenter Azure provider helm chart
if: ${{ inputs.nodeprovisioner == 'azkarpenter' }}
shell: bash
run: |
make azure-karpenter-helm
env:
AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }}
AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }}
AZURE_TENANT_ID: ${{ secrets.E2E_TENANT_ID }}
AZURE_SUBSCRIPTION_ID: ${{ secrets.E2E_SUBSCRIPTION_ID }}
KARPENTER_VERSION: ${{ vars.KARPENTER_VERSION }}
KARPENTER_NAMESPACE: ${{ env.KARPENTER_NAMESPACE }}

- uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # v2.1.1
with:
client-id: ${{ secrets.E2E_CLIENT_ID }}
tenant-id: ${{ secrets.E2E_TENANT_ID }}
subscription-id: ${{ secrets.E2E_SUBSCRIPTION_ID }}

- name: Create Role Assignment
uses: azure/CLI@v2.0.0
with:
inlineScript: |
IDENTITY_PRINCIPAL_ID="$(az identity show --name gpuIdentity --resource-group ${{ env.CLUSTER_NAME }} --query 'principalId' -otsv)"
az role assignment create --assignee ${IDENTITY_PRINCIPAL_ID} --scope "/subscriptions/${{ secrets.E2E_SUBSCRIPTION_ID }}/resourceGroups/${{ env.CLUSTER_NAME }}" --role "Contributor"
- name: Create Azure Federated Identity
uses: azure/CLI@v2.0.0
with:
inlineScript: |
AKS_OIDC_ISSUER="$(az aks show -n "${{ env.CLUSTER_NAME }}" -g "${{ env.CLUSTER_NAME }}" --query 'oidcIssuerProfile.issuerUrl' -otsv)"
az identity federated-credential create --name gpu-fed-credential --identity-name gpuIdentity --resource-group "${{ env.CLUSTER_NAME }}" \
--issuer "${AKS_OIDC_ISSUER}" --subject system:serviceaccount:"gpu-provisioner:gpu-provisioner" --audience api://AzureADTokenExchange
- name: build KAITO image
if: ${{ !inputs.isRelease }}
shell: bash
run: |
make docker-build-kaito
env:
REGISTRY: ${{ env.REGISTRY }}
VERSION: ${{ env.VERSION }}

- name: Install KAITO Workspace helm chart
shell: bash
Expand All @@ -196,6 +225,7 @@ jobs:
AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }}
REGISTRY: ${{ env.REGISTRY }}
VERSION: ${{ env.VERSION }}
TEST_SUITE: ${{ inputs.nodeprovisioner }}

# Retrieve E2E ACR credentials and create Kubernetes secret
- name: Set up E2E ACR Credentials and Secret
Expand Down Expand Up @@ -224,7 +254,15 @@ jobs:
--docker-server=${{ secrets.E2E_ACR_AMRT_USERNAME }}.azurecr.io \
--docker-username=${{ secrets.E2E_ACR_AMRT_USERNAME }} \
--docker-password=${{ secrets.E2E_ACR_AMRT_PASSWORD }}
- name: Log ${{ inputs.nodeprovisioner }}
run: |
if [ "${{ inputs.nodeprovisioner }}" == "gpuprovisioner" ]; then
kubectl logs -n "${{ env.GPU_PROVISIONER_NAMESPACE }}" -l app.kubernetes.io/name=gpu-provisioner -c controller
else
kubectl logs -n "${{ env.KARPENTER_NAMESPACE }}" -l app.kubernetes.io/name=karpenter -c controller
fi
- name: Log kaito-workspace
run: |
kubectl get pods -n kaito-workspace -o name | grep "^pod/kaito-workspace" | sed 's/^pod\///' | xargs -I {} kubectl logs -n kaito-workspace {}
Expand All @@ -238,6 +276,7 @@ jobs:
REGISTRY: ${{ env.REGISTRY }}
AI_MODELS_REGISTRY: ${{ secrets.E2E_ACR_AMRT_USERNAME }}.azurecr.io
AI_MODELS_REGISTRY_SECRET: ${{ secrets.E2E_AMRT_SECRET_NAME }}
TEST_SUITE: ${{ inputs.nodeprovisioner }}
E2E_ACR_REGISTRY: ${{ env.CLUSTER_NAME }}.azurecr.io
E2E_ACR_REGISTRY_SECRET: ${{ env.CLUSTER_NAME }}-acr-secret

Expand Down
10 changes: 10 additions & 0 deletions .github/workflows/kaito-e2e.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,19 @@ permissions:

jobs:
run-e2e:
strategy:
fail-fast: false
matrix:
suite: [ gpuprovisioner ]
permissions:
contents: read
id-token: write
statuses: write
uses: ./.github/workflows/e2e-workflow.yml
with:
git_sha: ${{ github.event.pull_request.head.sha }}
k8s_version: ${{ vars.AKS_K8S_VERSION }}
nodeprovisioner: ${{ matrix.suite }}
secrets:
E2E_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
E2E_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/publish-gh-image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ jobs:
git_sha: ${{ github.sha }}
isRelease: true
registry: ${{ needs.build-scan-publish-gh-images.outputs.registry_repository }}
k8s_version: ${{ vars.AKS_K8S_VERSION }}
tag: ${{ needs.check-tag.outputs.tag }}
secrets:
E2E_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/publish-mcr-image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ jobs:
git_sha: ${{ github.sha }}
isRelease: true
registry: "mcr.microsoft.com/aks/kaito"
k8s_version: ${{ vars.AKS_K8S_VERSION }}
tag: ${{ github.event.client_payload.tag }}
secrets:
E2E_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
Expand Down
Loading

0 comments on commit 3db011e

Please sign in to comment.