From 1e242b57a2698eb2c7af2eeda2bc200ad8bc7923 Mon Sep 17 00:00:00 2001 From: Smriti Dahal Date: Wed, 20 Nov 2024 12:41:23 -0800 Subject: [PATCH 1/5] chart changes --- .../workspace/templates/clusterrole.yaml | 3 +++ .../kaito/workspace/templates/deployment.yaml | 2 ++ .../templates/nvidia-device-plugin-ds.yaml | 5 +++++ pkg/utils/nodeclaim/nodeclaim.go | 19 ++++++++++++++++--- 4 files changed, 26 insertions(+), 3 deletions(-) diff --git a/charts/kaito/workspace/templates/clusterrole.yaml b/charts/kaito/workspace/templates/clusterrole.yaml index 5185e3d44..afcdc48ae 100644 --- a/charts/kaito/workspace/templates/clusterrole.yaml +++ b/charts/kaito/workspace/templates/clusterrole.yaml @@ -42,6 +42,9 @@ rules: - apiGroups: [ "karpenter.azure.com" ] resources: [ "aksnodeclasses"] verbs: [ "get","list","watch","create", "delete", "update", "patch" ] + - apiGroups: [ "karpenter.k8s.aws" ] + resources: [ "ec2nodeclasses"] + verbs: [ "get","list","watch","create", "delete", "update", "patch" ] - apiGroups: ["admissionregistration.k8s.io"] resources: ["validatingwebhookconfigurations"] verbs: ["get","list","watch"] diff --git a/charts/kaito/workspace/templates/deployment.yaml b/charts/kaito/workspace/templates/deployment.yaml index 0297530e8..92989f6a9 100644 --- a/charts/kaito/workspace/templates/deployment.yaml +++ b/charts/kaito/workspace/templates/deployment.yaml @@ -47,6 +47,8 @@ spec: value: {{ .Values.presetRegistryName }} - name: CLOUD_PROVIDER value: {{ .Values.cloudProviderName }} + - name: CLUSTER_NAME + value: {{ .Values.clusterName }} ports: - name: http-metrics containerPort: 8080 diff --git a/charts/kaito/workspace/templates/nvidia-device-plugin-ds.yaml b/charts/kaito/workspace/templates/nvidia-device-plugin-ds.yaml index a07ba91ca..312ba651d 100644 --- a/charts/kaito/workspace/templates/nvidia-device-plugin-ds.yaml +++ b/charts/kaito/workspace/templates/nvidia-device-plugin-ds.yaml @@ -21,12 +21,17 @@ spec: requiredDuringSchedulingIgnoredDuringExecution: 
nodeSelectorTerms: - matchExpressions: + {{- if eq .Values.cloudProviderName "azure" }} - key: kubernetes.azure.com/cluster operator: Exists - key: type operator: NotIn values: - virtual-kubelet + {{- else if eq .Values.cloudProviderName "aws" }} + - key: "k8s.io/cloud-provider-aws" + operator: Exists + {{- end }} tolerations: # Allow this pod to be rescheduled while the node is in "critical add-ons only" mode. # This, along with the annotation above marks this pod as a critical add-on. diff --git a/pkg/utils/nodeclaim/nodeclaim.go b/pkg/utils/nodeclaim/nodeclaim.go index 86c797215..cccc15ddc 100644 --- a/pkg/utils/nodeclaim/nodeclaim.go +++ b/pkg/utils/nodeclaim/nodeclaim.go @@ -130,6 +130,18 @@ func GenerateNodeClaimManifest(ctx context.Context, storageRequirement string, o } nodeClaimObj.Spec.Requirements = append(nodeClaimObj.Spec.Requirements, nodeSelector) } + + if cloudName == consts.AWSCloudName { + nodeSelector := v1beta1.NodeSelectorRequirementWithMinValues{ + NodeSelectorRequirement: v1.NodeSelectorRequirement{ + Key: "karpenter.k8s.aws/instance-gpu-count", + Operator: v1.NodeSelectorOpGt, + Values: []string{"0"}, + }, + } + nodeClaimObj.Spec.Requirements = append(nodeClaimObj.Spec.Requirements, nodeSelector) + } + return nodeClaimObj } @@ -170,8 +182,9 @@ func GenerateEC2NodeClassManifest(ctx context.Context) *awsv1beta1.EC2NodeClass Name: consts.NodeClassName, }, Spec: awsv1beta1.EC2NodeClassSpec{ - AMIFamily: lo.ToPtr(awsv1beta1.AMIFamilyAL2), // Amazon Linux 2 - Role: fmt.Sprintf("KarpenterNodeRole-%s", clusterName), + AMIFamily: lo.ToPtr(awsv1beta1.AMIFamilyAL2), // Amazon Linux 2 + Role: fmt.Sprintf("KarpenterNodeRole-%s", clusterName), + InstanceStorePolicy: lo.ToPtr(awsv1beta1.InstanceStorePolicyRAID0), //required to share node's ephemeral storage among pods that request it SubnetSelectorTerms: []awsv1beta1.SubnetSelectorTerm{ { Tags: map[string]string{ @@ -201,7 +214,7 @@ func CreateNodeClaim(ctx context.Context, nodeClaimObj
*v1beta1.NodeClaim, kubeC return err } - err = kubeClient.Create(ctx, nodeClaimObj, &client.CreateOptions{}) + err = kubeClient.Create(ctx, nodeClaimObj.DeepCopy(), &client.CreateOptions{}) if err != nil { return err } From b7c2475748d00d25df69a60d16286c457794c13e Mon Sep 17 00:00:00 2001 From: Smriti Dahal Date: Wed, 20 Nov 2024 13:32:25 -0800 Subject: [PATCH 2/5] fix unit test --- pkg/utils/nodeclaim/nodeclaim_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/utils/nodeclaim/nodeclaim_test.go b/pkg/utils/nodeclaim/nodeclaim_test.go index 48bcd71b3..e992d3e3a 100644 --- a/pkg/utils/nodeclaim/nodeclaim_test.go +++ b/pkg/utils/nodeclaim/nodeclaim_test.go @@ -205,7 +205,7 @@ func TestGenerateNodeClaimManifest(t *testing.T) { assert.Equal(t, nodeClaim.Labels[kaitov1alpha1.LabelWorkspaceNamespace], mockWorkspace.Namespace, "label must have same workspace namespace as workspace") assert.Equal(t, nodeClaim.Labels[consts.LabelNodePool], consts.KaitoNodePoolName, "label must have same labels as workspace label selector") assert.Equal(t, nodeClaim.Annotations[v1beta1.DoNotDisruptAnnotationKey], "true", "label must have do not disrupt annotation") - assert.Equal(t, len(nodeClaim.Spec.Requirements), 3, " NodeClaim must have 3 NodeSelector Requirements") + assert.Equal(t, len(nodeClaim.Spec.Requirements), 4, " NodeClaim must have 3 NodeSelector Requirements") assert.Check(t, nodeClaim.Spec.NodeClassRef != nil, "NodeClaim must have NodeClassRef") assert.Equal(t, nodeClaim.Spec.NodeClassRef.Kind, "EC2NodeClass", "NodeClaim must have 'EC2NodeClass' kind") }) From 1411d112befab340d0fae73f0ddc76dabaf40f1e Mon Sep 17 00:00:00 2001 From: Smriti Dahal Date: Thu, 21 Nov 2024 10:03:29 -0800 Subject: [PATCH 3/5] fix test case --- pkg/utils/nodeclaim/nodeclaim_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/utils/nodeclaim/nodeclaim_test.go b/pkg/utils/nodeclaim/nodeclaim_test.go index e992d3e3a..bf387f866 100644 --- 
a/pkg/utils/nodeclaim/nodeclaim_test.go +++ b/pkg/utils/nodeclaim/nodeclaim_test.go @@ -205,7 +205,7 @@ func TestGenerateNodeClaimManifest(t *testing.T) { assert.Equal(t, nodeClaim.Labels[kaitov1alpha1.LabelWorkspaceNamespace], mockWorkspace.Namespace, "label must have same workspace namespace as workspace") assert.Equal(t, nodeClaim.Labels[consts.LabelNodePool], consts.KaitoNodePoolName, "label must have same labels as workspace label selector") assert.Equal(t, nodeClaim.Annotations[v1beta1.DoNotDisruptAnnotationKey], "true", "label must have do not disrupt annotation") - assert.Equal(t, len(nodeClaim.Spec.Requirements), 4, " NodeClaim must have 3 NodeSelector Requirements") + assert.Equal(t, len(nodeClaim.Spec.Requirements), 4, " NodeClaim must have 4 NodeSelector Requirements") assert.Check(t, nodeClaim.Spec.NodeClassRef != nil, "NodeClaim must have NodeClassRef") assert.Equal(t, nodeClaim.Spec.NodeClassRef.Kind, "EC2NodeClass", "NodeClaim must have 'EC2NodeClass' kind") }) From 50a8a027c58faf17a07d07d9dd4702c83663a625 Mon Sep 17 00:00:00 2001 From: Smriti Dahal Date: Thu, 21 Nov 2024 12:56:37 -0800 Subject: [PATCH 4/5] add conditional to clusterrole --- charts/kaito/workspace/templates/clusterrole.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/charts/kaito/workspace/templates/clusterrole.yaml b/charts/kaito/workspace/templates/clusterrole.yaml index afcdc48ae..43f4f30e8 100644 --- a/charts/kaito/workspace/templates/clusterrole.yaml +++ b/charts/kaito/workspace/templates/clusterrole.yaml @@ -39,12 +39,15 @@ rules: - apiGroups: ["karpenter.sh"] resources: ["machines", "machines/status", "nodeclaims", "nodeclaims/status"] verbs: ["get","list","watch","create", "delete", "update", "patch"] + {{- if eq .Values.cloudProviderName "azure" }} - apiGroups: [ "karpenter.azure.com" ] resources: [ "aksnodeclasses"] verbs: [ "get","list","watch","create", "delete", "update", "patch" ] + {{- else if eq .Values.cloudProviderName "aws" }} - apiGroups: [ 
"karpenter.k8s.aws" ] resources: [ "ec2nodeclasses"] verbs: [ "get","list","watch","create", "delete", "update", "patch" ] + {{- end }} - apiGroups: ["admissionregistration.k8s.io"] resources: ["validatingwebhookconfigurations"] verbs: ["get","list","watch"] From 9281ffc6dc5c066e49c520d5be6779acf95bf60c Mon Sep 17 00:00:00 2001 From: Smriti Dahal Date: Tue, 26 Nov 2024 15:19:34 -0800 Subject: [PATCH 5/5] adding default clusterName --- charts/kaito/workspace/values.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/charts/kaito/workspace/values.yaml b/charts/kaito/workspace/values.yaml index 702ec60ea..5f52cf3de 100644 --- a/charts/kaito/workspace/values.yaml +++ b/charts/kaito/workspace/values.yaml @@ -32,3 +32,4 @@ tolerations: [] affinity: {} # Values can be "azure" or "aws" cloudProviderName: "azure" +clusterName: "kaito"