Skip to content

Commit

Permalink
feat: [SKU modularization] AWS chart changes (#710)
Browse files Browse the repository at this point in the history
**Reason for Change**:
<!-- What does this PR improve or fix in Kaito? Why is it needed? -->
Updating kaito charts for aws integration:

- add clusterrole permission for ec2nodeclasses resource
- add a new env variable, CLUSTER_NAME, to the deployment, as this environment
variable is needed to create the EC2 Node Class
- update nodeSelectorTerm for nvidia-device-plugin daemonset based on
the cloud provider
- add AWS specific nodeSelectorRequirement for nodeClaim obj
- add InstanceStorePolicyRAID0 to the EC2 node class to share the node's
ephemeral storage among pods that request it. Additional information
[here](https://karpenter.sh/v1.0/concepts/nodeclasses/#specinstancestorepolicy)
- fix a bug in nodeClaim creation (kubeClient.Create) - if the
nodeClaim fails to get created, we retry creating it. However, on
kubeClient.Create, certain metadata such as resourceVersion is set, which
will be passed along in the retries if we don't pass the DeepCopy().
With this metadata populated, KubeClient returns the error
`resourceVersion should not be set on objects to be created`
  • Loading branch information
smritidahal653 authored Nov 27, 2024
1 parent 711c858 commit f7e6d66
Show file tree
Hide file tree
Showing 6 changed files with 31 additions and 4 deletions.
6 changes: 6 additions & 0 deletions charts/kaito/workspace/templates/clusterrole.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,15 @@ rules:
- apiGroups: ["karpenter.sh"]
resources: ["machines", "machines/status", "nodeclaims", "nodeclaims/status"]
verbs: ["get","list","watch","create", "delete", "update", "patch"]
{{- if eq .Values.cloudProviderName "azure" }}
- apiGroups: [ "karpenter.azure.com" ]
resources: [ "aksnodeclasses"]
verbs: [ "get","list","watch","create", "delete", "update", "patch" ]
{{- else if eq .Values.cloudProviderName "aws" }}
- apiGroups: [ "karpenter.k8s.aws" ]
resources: [ "ec2nodeclasses"]
verbs: [ "get","list","watch","create", "delete", "update", "patch" ]
{{- end }}
- apiGroups: ["admissionregistration.k8s.io"]
resources: ["validatingwebhookconfigurations"]
verbs: ["get","list","watch"]
Expand Down
2 changes: 2 additions & 0 deletions charts/kaito/workspace/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ spec:
value: {{ .Values.presetRegistryName }}
- name: CLOUD_PROVIDER
value: {{ .Values.cloudProviderName }}
- name: CLUSTER_NAME
value: {{ .Values.clusterName }}
ports:
- name: http-metrics
containerPort: 8080
Expand Down
5 changes: 5 additions & 0 deletions charts/kaito/workspace/templates/nvidia-device-plugin-ds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,17 @@ spec:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
{{- if eq .Values.cloudProviderName "azure" }}
- key: kubernetes.azure.com/cluster
operator: Exists
- key: type
operator: NotIn
values:
- virtual-kubelet
{{- else if eq .Values.cloudProviderName "aws" }}
- key: "k8s.io/cloud-provider-aws"
operator: Exists
{{- end }}
tolerations:
# Allow this pod to be rescheduled while the node is in "critical add-ons only" mode.
# This, along with the annotation above marks this pod as a critical add-on.
Expand Down
1 change: 1 addition & 0 deletions charts/kaito/workspace/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,4 @@ tolerations: []
affinity: {}
# Values can be "azure" or "aws"
cloudProviderName: "azure"
clusterName: "kaito"
19 changes: 16 additions & 3 deletions pkg/utils/nodeclaim/nodeclaim.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,18 @@ func GenerateNodeClaimManifest(ctx context.Context, storageRequirement string, o
}
nodeClaimObj.Spec.Requirements = append(nodeClaimObj.Spec.Requirements, nodeSelector)
}

if cloudName == consts.AWSCloudName {
nodeSelector := v1beta1.NodeSelectorRequirementWithMinValues{
NodeSelectorRequirement: v1.NodeSelectorRequirement{
Key: "karpenter.k8s.aws/instance-gpu-count",
Operator: v1.NodeSelectorOpGt,
Values: []string{"0"},
},
}
nodeClaimObj.Spec.Requirements = append(nodeClaimObj.Spec.Requirements, nodeSelector)
}

return nodeClaimObj
}

Expand Down Expand Up @@ -170,8 +182,9 @@ func GenerateEC2NodeClassManifest(ctx context.Context) *awsv1beta1.EC2NodeClass
Name: consts.NodeClassName,
},
Spec: awsv1beta1.EC2NodeClassSpec{
AMIFamily: lo.ToPtr(awsv1beta1.AMIFamilyAL2), // Amazon Linux 2
Role: fmt.Sprintf("KarpenterNodeRole-%s", clusterName),
AMIFamily: lo.ToPtr(awsv1beta1.AMIFamilyAL2), // Amazon Linux 2
Role: fmt.Sprintf("KarpenterNodeRole-%s", clusterName),
InstanceStorePolicy: lo.ToPtr(awsv1beta1.InstanceStorePolicyRAID0), //required to share node's ephermeral storage among pods that request it
SubnetSelectorTerms: []awsv1beta1.SubnetSelectorTerm{
{
Tags: map[string]string{
Expand Down Expand Up @@ -201,7 +214,7 @@ func CreateNodeClaim(ctx context.Context, nodeClaimObj *v1beta1.NodeClaim, kubeC
return err
}

err = kubeClient.Create(ctx, nodeClaimObj, &client.CreateOptions{})
err = kubeClient.Create(ctx, nodeClaimObj.DeepCopy(), &client.CreateOptions{})
if err != nil {
return err
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/utils/nodeclaim/nodeclaim_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ func TestGenerateNodeClaimManifest(t *testing.T) {
assert.Equal(t, nodeClaim.Labels[kaitov1alpha1.LabelWorkspaceNamespace], mockWorkspace.Namespace, "label must have same workspace namespace as workspace")
assert.Equal(t, nodeClaim.Labels[consts.LabelNodePool], consts.KaitoNodePoolName, "label must have same labels as workspace label selector")
assert.Equal(t, nodeClaim.Annotations[v1beta1.DoNotDisruptAnnotationKey], "true", "label must have do not disrupt annotation")
assert.Equal(t, len(nodeClaim.Spec.Requirements), 3, " NodeClaim must have 3 NodeSelector Requirements")
assert.Equal(t, len(nodeClaim.Spec.Requirements), 4, " NodeClaim must have 4 NodeSelector Requirements")
assert.Check(t, nodeClaim.Spec.NodeClassRef != nil, "NodeClaim must have NodeClassRef")
assert.Equal(t, nodeClaim.Spec.NodeClassRef.Kind, "EC2NodeClass", "NodeClaim must have 'EC2NodeClass' kind")
})
Expand Down

0 comments on commit f7e6d66

Please sign in to comment.