Skip to content

Commit

Permalink
addreseed review comment, refactored added test for computeScaleDownData
Browse files Browse the repository at this point in the history
  • Loading branch information
elankath committed Feb 3, 2025
1 parent 194b65a commit 136ea27
Show file tree
Hide file tree
Showing 3 changed files with 135 additions and 25 deletions.
6 changes: 3 additions & 3 deletions cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -341,8 +341,8 @@ func (ngImpl *nodeGroup) Refresh() error {
}
machinesOfNodeGroup, err := ngImpl.mcmManager.getMachinesForMachineDeployment(ngImpl.Name)
if err != nil {
klog.Errorf("NodeGroup.Refresh() of %q failed to get machines for MachineDeployment due to: %v", ngImpl.Name, err)
return fmt.Errorf("failed refresh of NodeGroup %q due to: %v", ngImpl.Name, err)
klog.Warningf("NodeGroup.Refresh() of %q failed to get machines for MachineDeployment due to: %v", ngImpl.Name, err)
return nil
}
toBeDeletedMachines := filterMachinesMatchingNames(machinesOfNodeGroup, sets.New(toBeDeletedMachineNames...))
if len(toBeDeletedMachines) == 0 {
Expand Down Expand Up @@ -479,7 +479,7 @@ func getNodeNamesFromMachines(machines []*v1alpha1.Machine) []string {
return nodeNames
}

// Id returns MachineDeployment id.
// Id returns NodeGroup name
func (ngImpl *nodeGroup) Id() string {
return ngImpl.Name
}
Expand Down
71 changes: 49 additions & 22 deletions cluster-autoscaler/cloudprovider/mcm/mcm_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,12 @@ type machineInfo struct {
FailedOrTerminating bool
}

type scaleDownData struct {
RevisedToBeDeletedNames sets.Set[string]
RevisedScaledownAmount int
RevisedMachineDeployment *v1alpha1.MachineDeployment
}

func init() {
controlBurst = flag.Int("control-apiserver-burst", rest.DefaultBurst, "Throttling burst configuration for the client to control cluster's apiserver.")
controlQPS = flag.Float64("control-apiserver-qps", float64(rest.DefaultQPS), "Throttling QPS configuration for the client to control cluster's apiserver.")
Expand Down Expand Up @@ -491,42 +497,31 @@ func (m *McmManager) updateAnnotationOnMachine(ctx context.Context, mcName strin
return true, err
}

// scaleDownMachineDeployment scales down the MachineDeployment for given name by the length of toDeleteMachineNames
// It also updates the machines-marked-by-ca-for-deletion annotation on the machine deployment with the list of toDeleteMachineNames
// scaleDownMachineDeployment scales down the MachineDeployment for given name by the length of toDeleteMachineNames after removing machine names that
// are already marked for deletion in the machineutils.TriggerDeletionByMCM of the MachineDeployment.
// It then updates the machineutils.TriggerDeletionByMCM annotation with revised toBeDeletedMachineNames along with the replica count as a atomic operation.
// NOTE: Callers MUST take the NodeGroup scalingMutex before invoking this method.
func (m *McmManager) scaleDownMachineDeployment(ctx context.Context, mdName string, toBeDeletedMachineNames []string) (bool, error) {
md, err := m.GetMachineDeploymentObject(mdName)
if err != nil {
return true, err
}

scaleDownAmount := len(toBeDeletedMachineNames)
expectedReplicas := md.Spec.Replicas - int32(scaleDownAmount)
if expectedReplicas == md.Spec.Replicas {
klog.Infof("MachineDeployment %q is already set to %d, skipping the update", md.Name, expectedReplicas)
data := computeScaledownData(md, toBeDeletedMachineNames)
if data.RevisedScaledownAmount == 0 {
klog.V(3).Infof("Skipping scaledown since MachineDeployment %q has already marked %v for deletion by MCM, skipping the scale-down", md.Name, toBeDeletedMachineNames)
return false, nil
} else if expectedReplicas < 0 {
klog.Errorf("Cannot delete machines in MachineDeployment %s, expected decrease in replicas %d is more than current replicas %d", mdName, scaleDownAmount, md.Spec.Replicas)
return false, fmt.Errorf("cannot delete machines in MachineDeployment %s, expected decrease in replicas %d is more than current replicas %d", mdName, scaleDownAmount, md.Spec.Replicas)
}

alreadyMarkedMachineNames := getMachineNamesTriggeredForDeletion(md)
toBeMarkedMachineNamesSet := sets.NewString(toBeDeletedMachineNames...).Insert(alreadyMarkedMachineNames...)
triggerDeletionAnnotValue := createMachinesTriggeredForDeletionAnnotValue(toBeMarkedMachineNamesSet.List())

mdCopy := md.DeepCopy()
mdCopy.Spec.Replicas = expectedReplicas
if mdCopy.Annotations == nil {
mdCopy.Annotations = make(map[string]string)
}
if mdCopy.Annotations[machineutils.TriggerDeletionByMCM] != triggerDeletionAnnotValue {
mdCopy.Annotations[machineutils.TriggerDeletionByMCM] = triggerDeletionAnnotValue
if data.RevisedMachineDeployment == nil {
klog.V(3).Infof("Skipping scaledown for MachineDeployment %q for toBeDeletedMachineNames: %v", md.Name, toBeDeletedMachineNames)
return false, nil
}
_, err = m.machineClient.MachineDeployments(mdCopy.Namespace).Update(ctx, mdCopy, metav1.UpdateOptions{})
updatedMd, err := m.machineClient.MachineDeployments(data.RevisedMachineDeployment.Namespace).Update(ctx, data.RevisedMachineDeployment, metav1.UpdateOptions{})
if err != nil {
return true, err
}
klog.V(2).Infof("MachineDeployment %s size decreased to %d, triggerDeletionAnnotValue: %q", mdCopy.Name, mdCopy.Spec.Replicas, triggerDeletionAnnotValue)
klog.V(2).Infof("MachineDeployment %s size decreased from %d to %d, TriggerDeletionByMCM Annotation Value: %q", md.Name, md.Spec.Replicas, updatedMd.Spec.Replicas, updatedMd.Annotations[machineutils.TriggerDeletionByMCM])
return false, nil
}

Expand Down Expand Up @@ -1080,3 +1075,35 @@ func filterExtendedResources(allResources v1.ResourceList) (extendedResources v1
})
return
}

// computeScaledownData computes fresh scaleDownData for the given MachineDeployment given the machineNamesForDeletion
func computeScaledownData(md *v1alpha1.MachineDeployment, machineNamesForDeletion []string) (data scaleDownData) {
forDeletionSet := sets.New(machineNamesForDeletion...)
alreadyMarkedSet := sets.New(getMachineNamesTriggeredForDeletion(md)...)

uniqueForDeletionSet := forDeletionSet.Difference(alreadyMarkedSet)
toBeMarkedSet := alreadyMarkedSet.Union(forDeletionSet)

data.RevisedToBeDeletedNames = uniqueForDeletionSet
data.RevisedScaledownAmount = uniqueForDeletionSet.Len()
data.RevisedMachineDeployment = nil

expectedReplicas := md.Spec.Replicas - int32(data.RevisedScaledownAmount)
if expectedReplicas == md.Spec.Replicas {
klog.Infof("MachineDeployment %q is already set to %d, no need to scale-down", md.Name, expectedReplicas)
} else if expectedReplicas < 0 {
klog.Errorf("Cannot delete machines in MachineDeployment %q, expected decrease in replicas: %d is more than current replicas: %d", md.Name, data.RevisedScaledownAmount, md.Spec.Replicas)
} else {
mdCopy := md.DeepCopy()
if mdCopy.Annotations == nil {
mdCopy.Annotations = make(map[string]string)
}
triggerDeletionAnnotValue := createMachinesTriggeredForDeletionAnnotValue(toBeMarkedSet.UnsortedList())
if mdCopy.Annotations[machineutils.TriggerDeletionByMCM] != triggerDeletionAnnotValue {
mdCopy.Annotations[machineutils.TriggerDeletionByMCM] = triggerDeletionAnnotValue
}
mdCopy.Spec.Replicas = expectedReplicas
data.RevisedMachineDeployment = mdCopy
}
return
}
83 changes: 83 additions & 0 deletions cluster-autoscaler/cloudprovider/mcm/mcm_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ package mcm
import (
"errors"
"github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
"github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
"k8s.io/utils/ptr"
"maps"
Expand Down Expand Up @@ -222,6 +224,87 @@ func TestFilterExtendedResources(t *testing.T) {
assert.Equal(t, customResources, extendedResources)
}

func TestComputeScaledownData(t *testing.T) {
t.Run("simple", func(t *testing.T) {
initialReplicas := int32(2)
md := newMachineDeployments(1, initialReplicas, nil, nil, nil)[0]
md.Annotations = map[string]string{}

machineNamesForDeletion := []string{"n1"}
data := computeScaledownData(md, machineNamesForDeletion)
assert.Equal(t, createMachinesTriggeredForDeletionAnnotValue(machineNamesForDeletion), data.RevisedMachineDeployment.Annotations[machineutils.TriggerDeletionByMCM])
assert.Equal(t, len(machineNamesForDeletion), data.RevisedScaledownAmount)
assert.Equal(t, int32(2-len(machineNamesForDeletion)), data.RevisedMachineDeployment.Spec.Replicas)
})

t.Run("single-duplicate", func(t *testing.T) {
initialReplicas := 2
md := newMachineDeployments(1, int32(initialReplicas), nil, nil, nil)[0]
md.Annotations = map[string]string{}

machineNamesForDeletion := []string{"n1"}
data := computeScaledownData(md, machineNamesForDeletion)
assert.Equal(t, createMachinesTriggeredForDeletionAnnotValue(machineNamesForDeletion), data.RevisedMachineDeployment.Annotations[machineutils.TriggerDeletionByMCM])
assert.Equal(t, len(machineNamesForDeletion), data.RevisedScaledownAmount)

expectedReplicas := int32(initialReplicas - len(machineNamesForDeletion))
assert.Equal(t, expectedReplicas, data.RevisedMachineDeployment.Spec.Replicas)

md = data.RevisedMachineDeployment
// repeating computeScaledownData for same machineNamesForDeletion should have 0 RevisedScaledownAmount, empty RevisedToBeDeletedNames, and nil RevisedMachineDeployment
data = computeScaledownData(md, machineNamesForDeletion)
assert.Equal(t, 0, data.RevisedScaledownAmount)
assert.Empty(t, data.RevisedToBeDeletedNames)
assert.Nil(t, data.RevisedMachineDeployment)

})

t.Run("multi-duplicates", func(t *testing.T) {
initialReplicas := 3
md := newMachineDeployments(1, int32(initialReplicas), nil, nil, nil)[0]
md.Annotations = map[string]string{}

machineNamesForDeletion := []string{"n1", "n2"}
data := computeScaledownData(md, machineNamesForDeletion)
assert.Equal(t, createMachinesTriggeredForDeletionAnnotValue(machineNamesForDeletion), data.RevisedMachineDeployment.Annotations[machineutils.TriggerDeletionByMCM])
assert.Equal(t, len(machineNamesForDeletion), data.RevisedScaledownAmount)
expectedReplicas := int32(initialReplicas - len(machineNamesForDeletion))
assert.Equal(t, expectedReplicas, data.RevisedMachineDeployment.Spec.Replicas)

md = data.RevisedMachineDeployment
// repeating computeScaledownData for same machineNamesForDeletion should have 0 RevisedScaledownAmount, empty RevisedToBeDeletedNames, and nil RevisedMachineDeployment
data = computeScaledownData(md, machineNamesForDeletion)
assert.Equal(t, 0, data.RevisedScaledownAmount)
assert.Empty(t, data.RevisedToBeDeletedNames)
assert.Nil(t, data.RevisedMachineDeployment)

})

t.Run("overlapping", func(t *testing.T) {
initialReplicas := 5
md := newMachineDeployments(1, int32(initialReplicas), nil, nil, nil)[0]
md.Annotations = map[string]string{}

machineNamesForDeletion := sets.New("n1", "n2")
data := computeScaledownData(md, machineNamesForDeletion.UnsortedList())
assert.Equal(t, createMachinesTriggeredForDeletionAnnotValue(machineNamesForDeletion.UnsortedList()), data.RevisedMachineDeployment.Annotations[machineutils.TriggerDeletionByMCM])
assert.Equal(t, len(machineNamesForDeletion), data.RevisedScaledownAmount)
expectedReplicas := int32(initialReplicas - len(machineNamesForDeletion))
assert.Equal(t, expectedReplicas, data.RevisedMachineDeployment.Spec.Replicas)

newMachineNamesForDeletion := sets.New("n2", "n3", "n4")
md = data.RevisedMachineDeployment
data = computeScaledownData(md, newMachineNamesForDeletion.UnsortedList())
assert.NotNil(t, data.RevisedMachineDeployment)
uniqueMachinesNamesForDeletion := newMachineNamesForDeletion.Difference(machineNamesForDeletion)
assert.Equal(t, uniqueMachinesNamesForDeletion.Len(), data.RevisedScaledownAmount)
assert.Equal(t, uniqueMachinesNamesForDeletion, data.RevisedToBeDeletedNames)
expectedReplicas = int32(initialReplicas - machineNamesForDeletion.Union(newMachineNamesForDeletion).Len())
assert.Equal(t, expectedReplicas, data.RevisedMachineDeployment.Spec.Replicas)

})
}

func createSampleInstanceType(instanceTypeName string, customResourceName apiv1.ResourceName, customResourceQuantity resource.Quantity) *instanceType {
awsM5Large := AWSInstanceTypes[instanceTypeName]
extendedResources := make(apiv1.ResourceList)
Expand Down

0 comments on commit 136ea27

Please sign in to comment.