Skip to content

Commit

Permalink
fix: machine and nodeclaim can not supported at the same time (#769)
Browse files Browse the repository at this point in the history
**Reason for Change**:
<!-- What does this PR improve or fix in Kaito? Why is it needed? -->
We need to select Machine or NodeClaim to connect gpu-provisioner
component to provide GPU resource.
this means Machine and NodeClaim can not supported at the same time.

In this pull request, improve some sequence process of Machine and
NodeClaim into `if condition`.

**Requirements**

- [ ] added unit tests and e2e tests (if applicable).

**Issue Fixed**:
<!-- If this PR fixes GitHub issue 4321, add "Fixes #4321" to the next
line. -->

**Notes for Reviewers**:

Signed-off-by: rambohe-ch <rambohe.ch@gmail.com>
  • Loading branch information
rambohe-ch authored Dec 10, 2024
1 parent 9a2f8d6 commit 24eb89b
Show file tree
Hide file tree
Showing 4 changed files with 42 additions and 44 deletions.
17 changes: 8 additions & 9 deletions pkg/ragengine/controllers/ragengine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -293,17 +293,16 @@ func computeHash(ragEngineObj *kaitov1alpha1.RAGEngine) string {

// applyRAGEngineResource applies RAGEngine resource spec.
func (c *RAGEngineReconciler) applyRAGEngineResource(ctx context.Context, ragEngineObj *kaitov1alpha1.RAGEngine) error {

// Wait for pending machines if any before we decide whether to create new machine or not.
if err := machine.WaitForPendingMachines(ctx, ragEngineObj, c.Client); err != nil {
return err
}

if featuregates.FeatureGates[consts.FeatureFlagKarpenter] {
// Wait for pending nodeClaims if any before we decide whether to create new node or not.
if err := nodeclaim.WaitForPendingNodeClaims(ctx, ragEngineObj, c.Client); err != nil {
return err
}
} else {
// Wait for pending machines if any before we decide whether to create new machine or not.
if err := machine.WaitForPendingMachines(ctx, ragEngineObj, c.Client); err != nil {
return err
}
}

// Find all nodes that match the labelSelector and instanceType, they are not necessarily created by machines/nodeClaims.
Expand Down Expand Up @@ -578,11 +577,11 @@ func (c *RAGEngineReconciler) SetupWithManager(mgr ctrl.Manager) error {
}
builder := ctrl.NewControllerManagedBy(mgr).
For(&kaitov1alpha1.RAGEngine{}).
Watches(&v1alpha5.Machine{}, c.watchMachines()).
WithOptions(controller.Options{MaxConcurrentReconciles: 5})
if featuregates.FeatureGates[consts.FeatureFlagKarpenter] {
builder.
Watches(&v1beta1.NodeClaim{}, c.watchNodeClaims()) // watches for nodeClaim with labels indicating ragengine name.
builder.Watches(&v1beta1.NodeClaim{}, c.watchNodeClaims()) // watches for nodeClaim with labels indicating ragengine name.
} else {
builder.Watches(&v1alpha5.Machine{}, c.watchMachines())
}
return builder.Complete(c)
}
Expand Down
26 changes: 13 additions & 13 deletions pkg/ragengine/controllers/ragengine_gc_finalizer.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,19 +21,6 @@ import (
func (c *RAGEngineReconciler) garbageCollectRAGEngine(ctx context.Context, ragEngineObj *kaitov1alpha1.RAGEngine) (ctrl.Result, error) {
klog.InfoS("garbageCollectRAGEngine", "ragengine", klog.KObj(ragEngineObj))

// Check if there are any machines associated with this ragengine.
mList, err := machine.ListMachines(ctx, ragEngineObj, c.Client)
if err != nil {
return ctrl.Result{}, err
}
// We should delete all the machines that are created by this ragengine
for i := range mList.Items {
if deleteErr := c.Delete(ctx, &mList.Items[i], &client.DeleteOptions{}); deleteErr != nil {
klog.ErrorS(deleteErr, "failed to delete the machine", "machine", klog.KObj(&mList.Items[i]))
return ctrl.Result{}, deleteErr
}
}

if featuregates.FeatureGates[consts.FeatureFlagKarpenter] {
// Check if there are any nodeClaims associated with this ragengine.
ncList, err := nodeclaim.ListNodeClaim(ctx, ragEngineObj, c.Client)
Expand All @@ -48,6 +35,19 @@ func (c *RAGEngineReconciler) garbageCollectRAGEngine(ctx context.Context, ragEn
return ctrl.Result{}, deleteErr
}
}
} else {
// Check if there are any machines associated with this ragengine.
mList, err := machine.ListMachines(ctx, ragEngineObj, c.Client)
if err != nil {
return ctrl.Result{}, err
}
// We should delete all the machines that are created by this ragengine
for i := range mList.Items {
if deleteErr := c.Delete(ctx, &mList.Items[i], &client.DeleteOptions{}); deleteErr != nil {
klog.ErrorS(deleteErr, "failed to delete the machine", "machine", klog.KObj(&mList.Items[i]))
return ctrl.Result{}, deleteErr
}
}
}

if controllerutil.RemoveFinalizer(ragEngineObj, consts.RAGEngineFinalizer) {
Expand Down
17 changes: 8 additions & 9 deletions pkg/workspace/controllers/workspace_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -323,17 +323,16 @@ func computeHash(w *kaitov1alpha1.Workspace) string {

// applyWorkspaceResource applies workspace resource spec.
func (c *WorkspaceReconciler) applyWorkspaceResource(ctx context.Context, wObj *kaitov1alpha1.Workspace) error {

// Wait for pending machines if any before we decide whether to create new machine or not.
if err := machine.WaitForPendingMachines(ctx, wObj, c.Client); err != nil {
return err
}

if featuregates.FeatureGates[consts.FeatureFlagKarpenter] {
// Wait for pending nodeClaims if any before we decide whether to create new node or not.
if err := nodeclaim.WaitForPendingNodeClaims(ctx, wObj, c.Client); err != nil {
return err
}
} else {
// Wait for pending machines if any before we decide whether to create new machine or not.
if err := machine.WaitForPendingMachines(ctx, wObj, c.Client); err != nil {
return err
}
}

// Find all nodes that meet the requirements, they are not necessarily created by machines/nodeClaims.
Expand Down Expand Up @@ -834,12 +833,12 @@ func (c *WorkspaceReconciler) SetupWithManager(mgr ctrl.Manager) error {
Owns(&appsv1.Deployment{}).
Owns(&appsv1.StatefulSet{}).
Owns(&batchv1.Job{}).
Watches(&v1alpha5.Machine{}, c.watchMachines()).
WithOptions(controller.Options{MaxConcurrentReconciles: 5})

if featuregates.FeatureGates[consts.FeatureFlagKarpenter] {
builder.
Watches(&v1beta1.NodeClaim{}, c.watchNodeClaims()) // watches for nodeClaim with labels indicating workspace name.
builder.Watches(&v1beta1.NodeClaim{}, c.watchNodeClaims()) // watches for nodeClaim with labels indicating workspace name.
} else {
builder.Watches(&v1alpha5.Machine{}, c.watchMachines())
}
return builder.Complete(c)
}
Expand Down
26 changes: 13 additions & 13 deletions pkg/workspace/controllers/workspace_gc_finalizer.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,19 +21,6 @@ import (
func (c *WorkspaceReconciler) garbageCollectWorkspace(ctx context.Context, wObj *kaitov1alpha1.Workspace) (ctrl.Result, error) {
klog.InfoS("garbageCollectWorkspace", "workspace", klog.KObj(wObj))

// Check if there are any machines associated with this workspace.
mList, err := machine.ListMachines(ctx, wObj, c.Client)
if err != nil {
return ctrl.Result{}, err
}
// We should delete all the machines that are created by this workspace
for i := range mList.Items {
if deleteErr := c.Delete(ctx, &mList.Items[i], &client.DeleteOptions{}); deleteErr != nil {
klog.ErrorS(deleteErr, "failed to delete the machine", "machine", klog.KObj(&mList.Items[i]))
return ctrl.Result{}, deleteErr
}
}

if featuregates.FeatureGates[consts.FeatureFlagKarpenter] {
// Check if there are any nodeClaims associated with this workspace.
ncList, err := nodeclaim.ListNodeClaim(ctx, wObj, c.Client)
Expand All @@ -48,6 +35,19 @@ func (c *WorkspaceReconciler) garbageCollectWorkspace(ctx context.Context, wObj
return ctrl.Result{}, deleteErr
}
}
} else {
// Check if there are any machines associated with this workspace.
mList, err := machine.ListMachines(ctx, wObj, c.Client)
if err != nil {
return ctrl.Result{}, err
}
// We should delete all the machines that are created by this workspace
for i := range mList.Items {
if deleteErr := c.Delete(ctx, &mList.Items[i], &client.DeleteOptions{}); deleteErr != nil {
klog.ErrorS(deleteErr, "failed to delete the machine", "machine", klog.KObj(&mList.Items[i]))
return ctrl.Result{}, deleteErr
}
}
}

if controllerutil.RemoveFinalizer(wObj, consts.WorkspaceFinalizer) {
Expand Down

0 comments on commit 24eb89b

Please sign in to comment.