Skip to content

Commit

Permalink
test: Add e2e tests for azure karpenter (#573)
Browse files Browse the repository at this point in the history
**Reason for Change**:
- Add e2e tests for Azure Karpenter.
- Add support for Azure Karpenter in the e2e workflow.
- Organize the e2e code base.
**Requirements**

- [ ] added unit tests and e2e tests (if applicable).

**Issue Fixed**:
<!-- If this PR fixes GitHub issue 4321, add "Fixes #4321" to the next
line. -->

**Notes for Reviewers**:

---------

Signed-off-by: Heba Elayoty <hebaelayoty@gmail.com>
  • Loading branch information
helayoty authored Aug 22, 2024
1 parent 78e9e69 commit 6b4b86d
Show file tree
Hide file tree
Showing 10 changed files with 234 additions and 120 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/kaito-e2e.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
strategy:
fail-fast: false
matrix:
node-provisioner: [ gpuprovisioner ]
node-provisioner: [gpuprovisioner, azkarpenter]
permissions:
contents: read
id-token: write
Expand Down
9 changes: 5 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ AKS_K8S_VERSION ?= 1.30.0
AZURE_RESOURCE_GROUP ?= demo
AZURE_CLUSTER_NAME ?= kaito-demo
AZURE_RESOURCE_GROUP_MC=MC_$(AZURE_RESOURCE_GROUP)_$(AZURE_CLUSTER_NAME)_$(AZURE_LOCATION)
GPU_NAMESPACE ?= gpu-provisioner
GPU_PROVISIONER_NAMESPACE ?= gpu-provisioner
KAITO_NAMESPACE ?= kaito-workspace
GPU_PROVISIONER_MSI_NAME ?= gpuprovisionerIdentity

Expand Down Expand Up @@ -118,8 +118,9 @@ $(E2E_TEST):
.PHONY: kaito-workspace-e2e-test
kaito-workspace-e2e-test: $(E2E_TEST) $(GINKGO)
AI_MODELS_REGISTRY_SECRET=$(AI_MODELS_REGISTRY_SECRET) RUN_LLAMA_13B=$(RUN_LLAMA_13B) \
AI_MODELS_REGISTRY=$(AI_MODELS_REGISTRY) GPU_NAMESPACE=$(GPU_NAMESPACE) KAITO_NAMESPACE=$(KAITO_NAMESPACE) \
TEST_SUITE=$(TEST_SUITE) SUPPORTED_MODELS_YAML_PATH=$(SUPPORTED_MODELS_YAML_PATH) \
AI_MODELS_REGISTRY=$(AI_MODELS_REGISTRY) GPU_PROVISIONER_NAMESPACE=$(GPU_PROVISIONER_NAMESPACE) \
KARPENTER_NAMESPACE=$(KARPENTER_NAMESPACE) KAITO_NAMESPACE=$(KAITO_NAMESPACE) TEST_SUITE=$(TEST_SUITE) \
SUPPORTED_MODELS_YAML_PATH=$(SUPPORTED_MODELS_YAML_PATH) \
$(GINKGO) -v -trace $(GINKGO_ARGS) $(E2E_TEST)

## --------------------------------------
Expand Down Expand Up @@ -242,7 +243,7 @@ gpu-provisioner-helm: ## Update Azure client env vars and settings in helm valu
chmod +x ./configure-helm-values.sh && ./configure-helm-values.sh $(AZURE_CLUSTER_NAME) \
$(AZURE_RESOURCE_GROUP) $(GPU_PROVISIONER_MSI_NAME)

helm install gpu-provisioner \
helm install $(GPU_PROVISIONER_NAMESPACE) \
--values gpu-provisioner-values.yaml \
--set settings.azure.clusterName=$(AZURE_CLUSTER_NAME) \
https://github.com/Azure/gpu-provisioner/raw/gh-pages/charts/gpu-provisioner-$(GPU_PROVISIONER_VERSION).tgz
Expand Down
9 changes: 3 additions & 6 deletions hack/deploy/generate-identities.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,10 @@ echo "IDENTITY_JSON: $IDENTITY_JSON"

IDENTITY_PRINCIPAL_ID=$(jq -r '.principalId' <<< "$IDENTITY_JSON")

AZURE_RESOURCE_GROUP_RESOURCE_ID="/subscriptions/$AZURE_SUBSCRIPTION_ID/resourceGroups/$AZURE_RESOURCE_GROUP"
AZURE_RESOURCE_GROUP_RESOURCE_ID=$(az group show --name "${AZURE_RESOURCE_GROUP}" --query "id" -otsv)

AZURE_RESOURCE_GROUP_MC=$(jq -r ".nodeResourceGroup" <<< "$AKS_JSON")
AZURE_RESOURCE_GROUP_MC_RESOURCE_ID="/subscriptions/$AZURE_SUBSCRIPTION_ID/resourceGroups/$AZURE_RESOURCE_GROUP_MC"
AZURE_RESOURCE_GROUP_MC_RESOURCE_ID=$(az group show --name "${AZURE_RESOURCE_GROUP_MC}" --query "id" -otsv)

sleep 40 ## wait for the identity credential to be created

Expand All @@ -53,13 +53,10 @@ az identity federated-credential create --name "${FED_NAME}" \

if [[ "${COMPONENT_NAME}" == "azkarpenter" ]]; then
echo "Creating role assignments for $COMPONENT_NAME ..."
for role in "Virtual Machine Contributor" "Network Contributor" "Managed Identity Operator" "Contributor"; do
for role in "Virtual Machine Contributor" "Network Contributor" "Managed Identity Operator"; do
az role assignment create --assignee "$IDENTITY_PRINCIPAL_ID" \
--scope "$AZURE_RESOURCE_GROUP_MC_RESOURCE_ID" \
--role "$role"
az role assignment create --assignee "$IDENTITY_PRINCIPAL_ID" \
--scope "$AZURE_RESOURCE_GROUP_RESOURCE_ID" \
--role "$role"
done
else
echo "Creating role assignments for $COMPONENT_NAME ..."
Expand Down
61 changes: 42 additions & 19 deletions test/e2e/e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,29 +20,52 @@ import (
)

var (
ctx = context.Background()
namespaceName = fmt.Sprint(E2eNamespace, rand.Intn(100))
ctx = context.Background()
namespaceName = fmt.Sprint(utils.E2eNamespace, rand.Intn(100))
nodeProvisionerName = os.Getenv("TEST_SUITE")
)

var _ = SynchronizedBeforeSuite(func() []byte {
GetClusterClient(TestingCluster)
gpuNamespace := os.Getenv("GPU_NAMESPACE")
utils.GetClusterClient(utils.TestingCluster)
kaitoNamespace := os.Getenv("KAITO_NAMESPACE")

//check gpu-provisioner deployment is up and running
gpuProvisionerDeployment := &v1.Deployment{
ObjectMeta: metav1.ObjectMeta{
Name: "gpu-provisioner",
Namespace: gpuNamespace,
},
if nodeProvisionerName == "azkarpenter" {
karpenterNamespace := os.Getenv("KARPENTER_NAMESPACE")
//check karpenter deployment is up and running
karpenterDeployment := &v1.Deployment{
ObjectMeta: metav1.ObjectMeta{
Name: "karpenter",
Namespace: karpenterNamespace,
},
}

Eventually(func() error {
return utils.TestingCluster.KubeClient.Get(ctx, client.ObjectKey{
Namespace: karpenterDeployment.Namespace,
Name: karpenterDeployment.Name,
}, karpenterDeployment, &client.GetOptions{})
}, utils.PollTimeout, utils.PollInterval).
Should(Succeed(), "Failed to wait for karpenter deployment")
}

Eventually(func() error {
return TestingCluster.KubeClient.Get(ctx, client.ObjectKey{
Namespace: gpuProvisionerDeployment.Namespace,
Name: gpuProvisionerDeployment.Name,
}, gpuProvisionerDeployment, &client.GetOptions{})
}, utils.PollTimeout, utils.PollInterval).Should(Succeed(), "Failed to wait for gpu-provisioner deployment")
if nodeProvisionerName == "gpuprovisioner" {
gpuNamespace := os.Getenv("GPU_PROVISIONER_NAMESPACE")
//check gpu-provisioner deployment is up and running
gpuProvisionerDeployment := &v1.Deployment{
ObjectMeta: metav1.ObjectMeta{
Name: "gpu-provisioner",
Namespace: gpuNamespace,
},
}

Eventually(func() error {
return utils.TestingCluster.KubeClient.Get(ctx, client.ObjectKey{
Namespace: gpuProvisionerDeployment.Namespace,
Name: gpuProvisionerDeployment.Name,
}, gpuProvisionerDeployment, &client.GetOptions{})
}, utils.PollTimeout, utils.PollInterval).
Should(Succeed(), "Failed to wait for gpu-provisioner deployment")
}

//check kaito-workspace deployment is up and running
kaitoWorkspaceDeployment := &v1.Deployment{
Expand All @@ -53,14 +76,14 @@ var _ = SynchronizedBeforeSuite(func() []byte {
}

Eventually(func() error {
return TestingCluster.KubeClient.Get(ctx, client.ObjectKey{
return utils.TestingCluster.KubeClient.Get(ctx, client.ObjectKey{
Namespace: kaitoWorkspaceDeployment.Namespace,
Name: kaitoWorkspaceDeployment.Name,
}, kaitoWorkspaceDeployment, &client.GetOptions{})
}, utils.PollTimeout, utils.PollInterval).Should(Succeed(), "Failed to wait for kaito-workspace deployment")

// create testing namespace
err := TestingCluster.KubeClient.Create(context.TODO(), &corev1.Namespace{
err := utils.TestingCluster.KubeClient.Create(context.TODO(), &corev1.Namespace{
ObjectMeta: metav1.ObjectMeta{
Name: namespaceName,
},
Expand All @@ -73,7 +96,7 @@ var _ = SynchronizedBeforeSuite(func() []byte {
var _ = SynchronizedAfterSuite(func() {
// delete testing namespace
Eventually(func() error {
return TestingCluster.KubeClient.Delete(ctx, &corev1.Namespace{
return utils.TestingCluster.KubeClient.Delete(ctx, &corev1.Namespace{
ObjectMeta: metav1.ObjectMeta{
Name: namespaceName,
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ func validateAdapters(workspaceObj *kaitov1alpha1.Workspace, expectedInitContain
Namespace: workspaceObj.Namespace,
},
}
err = TestingCluster.KubeClient.Get(ctx, client.ObjectKey{
err = utils.TestingCluster.KubeClient.Get(ctx, client.ObjectKey{
Namespace: workspaceObj.Namespace,
Name: workspaceObj.Name,
}, dep)
Expand Down Expand Up @@ -95,7 +95,11 @@ var _ = Describe("Workspace Preset", func() {
defer cleanupResources(workspaceObj)
time.Sleep(30 * time.Second)

validateMachineCreation(workspaceObj, numOfNode)
if nodeProvisionerName == "azkarpenter" {
utils.ValidateNodeClaimCreation(ctx, workspaceObj, numOfNode)
} else {
utils.ValidateMachineCreation(ctx, workspaceObj, numOfNode)
}
validateResourceStatus(workspaceObj)

time.Sleep(30 * time.Second)
Expand Down
Loading

0 comments on commit 6b4b86d

Please sign in to comment.