diff --git a/cluster-autoscaler/config/autoscaling_options.go b/cluster-autoscaler/config/autoscaling_options.go index 97e5a2289870..541ea1fb1cd0 100644 --- a/cluster-autoscaler/config/autoscaling_options.go +++ b/cluster-autoscaler/config/autoscaling_options.go @@ -19,6 +19,7 @@ package config import ( "time" + kubelet_config "k8s.io/kubernetes/pkg/kubelet/apis/config" scheduler_config "k8s.io/kubernetes/pkg/scheduler/apis/config" ) @@ -125,7 +126,12 @@ type AutoscalingOptions struct { IgnoreMirrorPodsUtilization bool // MaxGracefulTerminationSec is maximum number of seconds scale down waits for pods to terminate before // removing the node from cloud provider. + // DrainPriorityConfig takes higher precedence and MaxGracefulTerminationSec will not be applicable when the DrainPriorityConfig is set. MaxGracefulTerminationSec int + // DrainPriorityConfig is a list of ShutdownGracePeriodByPodPriority. + // This field is optional and could be nil. + // DrainPriorityConfig takes higher precedence and MaxGracefulTerminationSec will not be applicable when the DrainPriorityConfig is set. + DrainPriorityConfig []kubelet_config.ShutdownGracePeriodByPodPriority // MaxTotalUnreadyPercentage is the maximum percentage of unready nodes after which CA halts operations MaxTotalUnreadyPercentage float64 // OkTotalUnreadyCount is the number of allowed unready nodes, irrespective of max-total-unready-percentage diff --git a/cluster-autoscaler/core/scaledown/actuation/actuator.go b/cluster-autoscaler/core/scaledown/actuation/actuator.go index 8fa30d8ed232..b80a0a7856e3 100644 --- a/cluster-autoscaler/core/scaledown/actuation/actuator.go +++ b/cluster-autoscaler/core/scaledown/actuation/actuator.go @@ -21,8 +21,6 @@ import ( "time" apiv1 "k8s.io/api/core/v1" - "k8s.io/klog/v2" - "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" "k8s.io/autoscaler/cluster-autoscaler/clusterstate" "k8s.io/autoscaler/cluster-autoscaler/context" @@ -41,6 +39,7 @@ import ( "k8s.io/autoscaler/cluster-autoscaler/utils/errors" kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes" "k8s.io/autoscaler/cluster-autoscaler/utils/taints" + "k8s.io/klog/v2" ) // Actuator is responsible for draining and deleting nodes. @@ -69,11 +68,18 @@ type actuatorNodeGroupConfigGetter interface { // NewActuator returns a new instance of Actuator. func NewActuator(ctx *context.AutoscalingContext, csr *clusterstate.ClusterStateRegistry, ndt *deletiontracker.NodeDeletionTracker, deleteOptions options.NodeDeleteOptions, drainabilityRules rules.Rules, configGetter actuatorNodeGroupConfigGetter) *Actuator { ndb := NewNodeDeletionBatcher(ctx, csr, ndt, ctx.NodeDeletionBatcherInterval) + legacyFlagDrainConfig := SingleRuleDrainConfig(ctx.MaxGracefulTerminationSec) + var evictor Evictor + if len(ctx.DrainPriorityConfig) > 0 { + evictor = NewEvictor(ndt, ctx.DrainPriorityConfig, true) + } else { + evictor = NewEvictor(ndt, legacyFlagDrainConfig, false) + } return &Actuator{ ctx: ctx, clusterState: csr, nodeDeletionTracker: ndt, - nodeDeletionScheduler: NewGroupDeletionScheduler(ctx, ndt, ndb, NewDefaultEvictor(deleteOptions, drainabilityRules, ndt)), + nodeDeletionScheduler: NewGroupDeletionScheduler(ctx, ndt, ndb, evictor), budgetProcessor: budgets.NewScaleDownBudgetProcessor(ctx), deleteOptions: deleteOptions, drainabilityRules: drainabilityRules, @@ -318,7 +324,7 @@ func (a *Actuator) scaleDownNodeToReport(node *apiv1.Node, drain bool) (*status. 
} var evictedPods []*apiv1.Pod if drain { - _, nonDsPodsToEvict := podsToEvict(a.ctx, nodeInfo) + _, nonDsPodsToEvict := podsToEvict(nodeInfo, a.ctx.DaemonSetEvictionForOccupiedNodes) evictedPods = nonDsPodsToEvict } return &status.ScaleDownNode{ diff --git a/cluster-autoscaler/core/scaledown/actuation/actuator_test.go b/cluster-autoscaler/core/scaledown/actuation/actuator_test.go index cc77afce9f1d..5c35c0369841 100644 --- a/cluster-autoscaler/core/scaledown/actuation/actuator_test.go +++ b/cluster-autoscaler/core/scaledown/actuation/actuator_test.go @@ -32,9 +32,6 @@ import ( "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" - "k8s.io/client-go/kubernetes/fake" - core "k8s.io/client-go/testing" - "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" testprovider "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/test" "k8s.io/autoscaler/cluster-autoscaler/clusterstate" @@ -48,6 +45,8 @@ import ( kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes" "k8s.io/autoscaler/cluster-autoscaler/utils/taints" . "k8s.io/autoscaler/cluster-autoscaler/utils/test" + "k8s.io/client-go/kubernetes/fake" + core "k8s.io/client-go/testing" ) type nodeGroupViewInfo struct { @@ -1190,7 +1189,8 @@ func TestStartDeletion(t *testing.T) { // Create Actuator, run StartDeletion, and verify the error. ndt := deletiontracker.NewNodeDeletionTracker(0) ndb := NewNodeDeletionBatcher(&ctx, csr, ndt, 0*time.Second) - evictor := Evictor{EvictionRetryTime: 0, DsEvictionRetryTime: 0, DsEvictionEmptyNodeTimeout: 0, PodEvictionHeadroom: DefaultPodEvictionHeadroom} + legacyFlagDrainConfig := SingleRuleDrainConfig(ctx.MaxGracefulTerminationSec) + evictor := Evictor{EvictionRetryTime: 0, PodEvictionHeadroom: DefaultPodEvictionHeadroom, shutdownGracePeriodByPodPriority: legacyFlagDrainConfig, fullDsEviction: false} actuator := Actuator{ ctx: &ctx, clusterState: csr, nodeDeletionTracker: ndt, nodeDeletionScheduler: NewGroupDeletionScheduler(&ctx, ndt, ndb, evictor), @@ -1426,7 +1426,8 @@ func TestStartDeletionInBatchBasic(t *testing.T) { csr := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, ctx.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute})) ndt := deletiontracker.NewNodeDeletionTracker(0) ndb := NewNodeDeletionBatcher(&ctx, csr, ndt, deleteInterval) - evictor := Evictor{EvictionRetryTime: 0, DsEvictionRetryTime: 0, DsEvictionEmptyNodeTimeout: 0, PodEvictionHeadroom: DefaultPodEvictionHeadroom} + legacyFlagDrainConfig := SingleRuleDrainConfig(ctx.MaxGracefulTerminationSec) + evictor := Evictor{EvictionRetryTime: 0, PodEvictionHeadroom: DefaultPodEvictionHeadroom, shutdownGracePeriodByPodPriority: legacyFlagDrainConfig} actuator := Actuator{ ctx: &ctx, clusterState: csr, nodeDeletionTracker: ndt, nodeDeletionScheduler: NewGroupDeletionScheduler(&ctx, ndt, ndb, evictor), diff --git a/cluster-autoscaler/core/scaledown/actuation/drain.go b/cluster-autoscaler/core/scaledown/actuation/drain.go index 456d29045d2e..94d245d71d9b 100644 --- a/cluster-autoscaler/core/scaledown/actuation/drain.go +++ b/cluster-autoscaler/core/scaledown/actuation/drain.go @@ -19,21 +19,19 @@ package actuation import ( "context" "fmt" - "strings" + "sort" "time" apiv1 "k8s.io/api/core/v1" policyv1beta1 "k8s.io/api/policy/v1beta1" kube_errors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + 
"k8s.io/autoscaler/cluster-autoscaler/metrics" "k8s.io/klog/v2" + kubelet_config "k8s.io/kubernetes/pkg/kubelet/apis/config" acontext "k8s.io/autoscaler/cluster-autoscaler/context" "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/status" - "k8s.io/autoscaler/cluster-autoscaler/metrics" - "k8s.io/autoscaler/cluster-autoscaler/simulator" - "k8s.io/autoscaler/cluster-autoscaler/simulator/drainability/rules" - "k8s.io/autoscaler/cluster-autoscaler/simulator/options" "k8s.io/autoscaler/cluster-autoscaler/utils/daemonset" "k8s.io/autoscaler/cluster-autoscaler/utils/errors" pod_util "k8s.io/autoscaler/cluster-autoscaler/utils/pod" @@ -44,108 +42,104 @@ const ( // DefaultEvictionRetryTime is the time after CA retries failed pod eviction. DefaultEvictionRetryTime = 10 * time.Second // DefaultPodEvictionHeadroom is the extra time we wait to catch situations when the pod is ignoring SIGTERM and - // is killed with SIGKILL after MaxGracefulTerminationTime + // is killed with SIGKILL after GracePeriodSeconds elapses DefaultPodEvictionHeadroom = 30 * time.Second - // DefaultDsEvictionEmptyNodeTimeout is the time to evict all DaemonSet pods on empty node - DefaultDsEvictionEmptyNodeTimeout = 10 * time.Second - // DefaultDsEvictionRetryTime is a time between retries to create eviction that uses for DaemonSet eviction for empty nodes - DefaultDsEvictionRetryTime = 3 * time.Second ) type evictionRegister interface { RegisterEviction(*apiv1.Pod) } -// Evictor can be used to evict pods from nodes. +// Evictor keeps configurations of pod eviction type Evictor struct { - EvictionRetryTime time.Duration - DsEvictionRetryTime time.Duration - DsEvictionEmptyNodeTimeout time.Duration - PodEvictionHeadroom time.Duration - evictionRegister evictionRegister - deleteOptions options.NodeDeleteOptions - drainabilityRules rules.Rules + EvictionRetryTime time.Duration + PodEvictionHeadroom time.Duration + evictionRegister evictionRegister + shutdownGracePeriodByPodPriority []kubelet_config.ShutdownGracePeriodByPodPriority + fullDsEviction bool } -// NewDefaultEvictor returns an instance of Evictor using the default parameters. -func NewDefaultEvictor(deleteOptions options.NodeDeleteOptions, drainabilityRules rules.Rules, evictionRegister evictionRegister) Evictor { +// NewEvictor returns an instance of Evictor. +func NewEvictor(evictionRegister evictionRegister, shutdownGracePeriodByPodPriority []kubelet_config.ShutdownGracePeriodByPodPriority, fullDsEviction bool) Evictor { + sort.Slice(shutdownGracePeriodByPodPriority, func(i, j int) bool { + return shutdownGracePeriodByPodPriority[i].Priority < shutdownGracePeriodByPodPriority[j].Priority + }) + return Evictor{ - EvictionRetryTime: DefaultEvictionRetryTime, - DsEvictionRetryTime: DefaultDsEvictionRetryTime, - DsEvictionEmptyNodeTimeout: DefaultDsEvictionEmptyNodeTimeout, - PodEvictionHeadroom: DefaultPodEvictionHeadroom, - evictionRegister: evictionRegister, - deleteOptions: deleteOptions, - drainabilityRules: drainabilityRules, + EvictionRetryTime: DefaultEvictionRetryTime, + PodEvictionHeadroom: DefaultPodEvictionHeadroom, + evictionRegister: evictionRegister, + shutdownGracePeriodByPodPriority: shutdownGracePeriodByPodPriority, + fullDsEviction: fullDsEviction, } } -// DrainNode works like DrainNodeWithPods, but lists of pods to evict don't have to be provided. All non-mirror, non-DS pods on the -// node are evicted. Mirror pods are not evicted. 
DaemonSet pods are evicted if DaemonSetEvictionForOccupiedNodes is enabled, or -// if they have the EnableDsEvictionKey annotation. +// DrainNode groups the pods on the node into priority groups and evicts them in ascending order of priority. +// If the priority evictor is not enabled, eviction of DaemonSet pods is best-effort. func (e Evictor) DrainNode(ctx *acontext.AutoscalingContext, nodeInfo *framework.NodeInfo) (map[string]status.PodEvictionResult, error) { - dsPodsToEvict, nonDsPodsToEvict := podsToEvict(ctx, nodeInfo) - return e.DrainNodeWithPods(ctx, nodeInfo.Node(), nonDsPodsToEvict, dsPodsToEvict) + node := nodeInfo.Node() + dsPods, pods := podsToEvict(nodeInfo, ctx.DaemonSetEvictionForOccupiedNodes) + if e.fullDsEviction { + return e.drainNodeWithPodsBasedOnPodPriority(ctx, node, append(pods, dsPods...), nil) + } + return e.drainNodeWithPodsBasedOnPodPriority(ctx, node, pods, dsPods) } -// DrainNodeWithPods performs drain logic on the node. Marks the node as unschedulable and later removes all pods, giving -// them up to MaxGracefulTerminationTime to finish. The list of pods to evict has to be provided. -func (e Evictor) DrainNodeWithPods(ctx *acontext.AutoscalingContext, node *apiv1.Node, pods []*apiv1.Pod, daemonSetPods []*apiv1.Pod) (map[string]status.PodEvictionResult, error) { - evictionResults := make(map[string]status.PodEvictionResult) - retryUntil := time.Now().Add(ctx.MaxPodEvictionTime) - confirmations := make(chan status.PodEvictionResult, len(pods)) - daemonSetConfirmations := make(chan status.PodEvictionResult, len(daemonSetPods)) - for _, pod := range pods { - evictionResults[pod.Name] = status.PodEvictionResult{Pod: pod, TimedOut: true, Err: nil} - go func(podToEvict *apiv1.Pod) { - confirmations <- evictPod(ctx, podToEvict, false, retryUntil, e.EvictionRetryTime, e.evictionRegister) - }(pod) +// EvictDaemonSetPods groups the DaemonSet pods on the node into priority groups and evicts them in ascending order of priority. +// If the priority evictor is not enabled, eviction of DaemonSet pods is best-effort. +func (e Evictor) EvictDaemonSetPods(ctx *acontext.AutoscalingContext, nodeInfo *framework.NodeInfo) (map[string]status.PodEvictionResult, error) { + node := nodeInfo.Node() + dsPods, _ := podsToEvict(nodeInfo, ctx.DaemonSetEvictionForEmptyNodes) + if e.fullDsEviction { + return e.drainNodeWithPodsBasedOnPodPriority(ctx, node, dsPods, nil) } + return e.drainNodeWithPodsBasedOnPodPriority(ctx, node, nil, dsPods) +} - // Perform eviction of daemonset. We don't want to raise an error if daemonsetPod wasn't evict properly - for _, daemonSetPod := range daemonSetPods { - go func(podToEvict *apiv1.Pod) { - daemonSetConfirmations <- evictPod(ctx, podToEvict, true, retryUntil, e.EvictionRetryTime, e.evictionRegister) - }(daemonSetPod) +// drainNodeWithPodsBasedOnPodPriority performs drain logic on the node based on pod priorities. +// Removes all pods, giving each pod group up to ShutdownGracePeriodSeconds to finish. The list of pods to evict has to be provided.
+func (e Evictor) drainNodeWithPodsBasedOnPodPriority(ctx *acontext.AutoscalingContext, node *apiv1.Node, fullEvictionPods, bestEffortEvictionPods []*apiv1.Pod) (map[string]status.PodEvictionResult, error) { + evictionResults := make(map[string]status.PodEvictionResult) + groups := groupByPriority(e.shutdownGracePeriodByPodPriority, fullEvictionPods, bestEffortEvictionPods) + for _, group := range groups { + for _, pod := range group.FullEvictionPods { + evictionResults[pod.Name] = status.PodEvictionResult{Pod: pod, TimedOut: false, + Err: errors.NewAutoscalerError(errors.UnexpectedScaleDownStateError, "Eviction was not attempted for pod %s because some of the previous evictions failed", pod.Name)} + } } - podsEvictionCounter := 0 - for i := 0; i < len(pods)+len(daemonSetPods); i++ { - select { - case evictionResult := <-confirmations: - podsEvictionCounter++ - evictionResults[evictionResult.Pod.Name] = evictionResult - if evictionResult.WasEvictionSuccessful() { - metrics.RegisterEvictions(1) - } - case <-daemonSetConfirmations: - case <-time.After(retryUntil.Sub(time.Now()) + 5*time.Second): - if podsEvictionCounter < len(pods) { - // All pods initially had results with TimedOut set to true, so the ones that didn't receive an actual result are correctly marked as timed out. - return evictionResults, errors.NewAutoscalerError(errors.ApiCallError, "Failed to drain node %s/%s: timeout when waiting for creating evictions", node.Namespace, node.Name) - } - klog.Infof("Timeout when waiting for creating daemonSetPods eviction") + for _, group := range groups { + // If there are no pods in a particular range, + // then do not wait for pods in that priority range. + if len(group.FullEvictionPods) == 0 && len(group.BestEffortEvictionPods) == 0 { + continue } - } - evictionErrs := make([]error, 0) - for _, result := range evictionResults { - if !result.WasEvictionSuccessful() { - evictionErrs = append(evictionErrs, result.Err) + var err error + evictionResults, err = e.initiateEviction(ctx, node, group.FullEvictionPods, group.BestEffortEvictionPods, evictionResults, group.ShutdownGracePeriodSeconds) + if err != nil { + return evictionResults, err + } + + // Evictions created successfully, wait ShutdownGracePeriodSeconds + podEvictionHeadroom to see if fullEviction pods really disappeared. + evictionResults, err = e.waitPodsToDisappear(ctx, node, group.FullEvictionPods, evictionResults, group.ShutdownGracePeriodSeconds) + if err != nil { + return evictionResults, err } } - if len(evictionErrs) != 0 { - return evictionResults, errors.NewAutoscalerError(errors.ApiCallError, "Failed to drain node %s/%s, due to following errors: %v", node.Namespace, node.Name, evictionErrs) - } + klog.V(1).Infof("All pods removed from %s", node.Name) + return evictionResults, nil +} - // Evictions created successfully, wait maxGracefulTerminationSec + podEvictionHeadroom to see if pods really disappeared.
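For illustration only (not part of the patch): a minimal sketch of the per-group polling budget that waitPodsToDisappear below applies, assuming a hypothetical two-rule config (60s and 20s grace periods) and the package's default 30s PodEvictionHeadroom.

package main

import (
	"fmt"
	"time"
)

func main() {
	// Assumed example config: per-group grace periods of 60s and 20s; the
	// 30s value mirrors DefaultPodEvictionHeadroom in this package.
	gracePeriods := []int64{60, 20}
	headroom := 30 * time.Second

	// Groups are drained one after another; each group is polled for up to
	// its own ShutdownGracePeriodSeconds plus the headroom.
	var totalPollBudget time.Duration
	for _, grace := range gracePeriods {
		totalPollBudget += time.Duration(grace)*time.Second + headroom
	}
	fmt.Println(totalPollBudget) // prints 2m20s
}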
+func (e Evictor) waitPodsToDisappear(ctx *acontext.AutoscalingContext, node *apiv1.Node, pods []*apiv1.Pod, evictionResults map[string]status.PodEvictionResult, + maxTermination int64) (map[string]status.PodEvictionResult, error) { var allGone bool - for start := time.Now(); time.Now().Sub(start) < time.Duration(ctx.MaxGracefulTerminationSec)*time.Second+e.PodEvictionHeadroom; time.Sleep(5 * time.Second) { + for start := time.Now(); time.Now().Sub(start) < time.Duration(maxTermination)*time.Second+e.PodEvictionHeadroom; time.Sleep(5 * time.Second) { allGone = true for _, pod := range pods { - podreturned, err := ctx.ClientSet.CoreV1().Pods(pod.Namespace).Get(context.TODO(), pod.Name, metav1.GetOptions{}) - if err == nil && (podreturned == nil || podreturned.Spec.NodeName == node.Name) { + podReturned, err := ctx.ClientSet.CoreV1().Pods(pod.Namespace).Get(context.TODO(), pod.Name, metav1.GetOptions{}) + if err == nil && (podReturned == nil || podReturned.Spec.NodeName == node.Name) { klog.V(1).Infof("Not deleted yet %s/%s", pod.Namespace, pod.Name) allGone = false break @@ -157,8 +151,6 @@ func (e Evictor) DrainNodeWithPods(ctx *acontext.AutoscalingContext, node *apiv1 } } if allGone { - klog.V(1).Infof("All pods removed from %s", node.Name) - // Let the deferred function know there is no need for cleanup return evictionResults, nil } } @@ -177,58 +169,65 @@ func (e Evictor) DrainNodeWithPods(ctx *acontext.AutoscalingContext, node *apiv1 return evictionResults, errors.NewAutoscalerError(errors.TransientError, "Failed to drain node %s/%s: pods remaining after timeout", node.Namespace, node.Name) } -// EvictDaemonSetPods creates eviction objects for all DaemonSet pods on the node. -func (e Evictor) EvictDaemonSetPods(ctx *acontext.AutoscalingContext, nodeInfo *framework.NodeInfo, timeNow time.Time) error { - nodeToDelete := nodeInfo.Node() - _, daemonSetPods, _, err := simulator.GetPodsToMove(nodeInfo, e.deleteOptions, e.drainabilityRules, nil, nil, timeNow) - if err != nil { - return fmt.Errorf("failed to get DaemonSet pods for %s (error: %v)", nodeToDelete.Name, err) - } +func (e Evictor) initiateEviction(ctx *acontext.AutoscalingContext, node *apiv1.Node, fullEvictionPods, bestEffortEvictionPods []*apiv1.Pod, evictionResults map[string]status.PodEvictionResult, + maxTermination int64) (map[string]status.PodEvictionResult, error) { - daemonSetPods = daemonset.PodsToEvict(daemonSetPods, ctx.DaemonSetEvictionForEmptyNodes) + retryUntil := time.Now().Add(ctx.MaxPodEvictionTime) + fullEvictionConfirmations := make(chan status.PodEvictionResult, len(fullEvictionPods)) + bestEffortEvictionConfirmations := make(chan status.PodEvictionResult, len(bestEffortEvictionPods)) - dsEviction := make(chan status.PodEvictionResult, len(daemonSetPods)) + for _, pod := range fullEvictionPods { + evictionResults[pod.Name] = status.PodEvictionResult{Pod: pod, TimedOut: true, Err: nil} + go func(pod *apiv1.Pod) { + fullEvictionConfirmations <- e.evictPod(ctx, pod, retryUntil, maxTermination, true) + }(pod) + } - // Perform eviction of DaemonSet pods - for _, daemonSetPod := range daemonSetPods { - go func(podToEvict *apiv1.Pod) { - dsEviction <- evictPod(ctx, podToEvict, true, timeNow.Add(e.DsEvictionEmptyNodeTimeout), e.DsEvictionRetryTime, e.evictionRegister) - }(daemonSetPod) + for _, pod := range bestEffortEvictionPods { + go func(pod *apiv1.Pod) { + bestEffortEvictionConfirmations <- e.evictPod(ctx, pod, retryUntil, maxTermination, false) + }(pod) } - // Wait for creating eviction of DaemonSet pods - var 
failedPodErrors []string - for range daemonSetPods { + + for i := 0; i < len(fullEvictionPods)+len(bestEffortEvictionPods); i++ { select { - case status := <-dsEviction: - if status.Err != nil { - failedPodErrors = append(failedPodErrors, status.Err.Error()) + case evictionResult := <-fullEvictionConfirmations: + evictionResults[evictionResult.Pod.Name] = evictionResult + if evictionResult.WasEvictionSuccessful() { + metrics.RegisterEvictions(1, metrics.PodEvictionSucceed) + } else { + metrics.RegisterEvictions(1, metrics.PodEvictionFailed) } - // adding waitBetweenRetries in order to have a bigger time interval than evictPod() - case <-time.After(e.DsEvictionEmptyNodeTimeout): - return fmt.Errorf("failed to create DaemonSet eviction for %v seconds on the %s", e.DsEvictionEmptyNodeTimeout, nodeToDelete.Name) + case <-bestEffortEvictionConfirmations: } } - if len(failedPodErrors) > 0 { - return fmt.Errorf("following DaemonSet pod failed to evict on the %s:\n%s", nodeToDelete.Name, fmt.Errorf(strings.Join(failedPodErrors, "\n"))) + evictionErrs := make([]error, 0) + for _, pod := range fullEvictionPods { + result := evictionResults[pod.Name] + if !result.WasEvictionSuccessful() { + evictionErrs = append(evictionErrs, result.Err) + } + } + if len(evictionErrs) != 0 { + return evictionResults, errors.NewAutoscalerError(errors.ApiCallError, "Failed to drain node %s/%s, due to following errors: %v", node.Namespace, node.Name, evictionErrs) } - return nil + return evictionResults, nil } -func evictPod(ctx *acontext.AutoscalingContext, podToEvict *apiv1.Pod, isDaemonSetPod bool, retryUntil time.Time, waitBetweenRetries time.Duration, evictionRegister evictionRegister) status.PodEvictionResult { +func (e Evictor) evictPod(ctx *acontext.AutoscalingContext, podToEvict *apiv1.Pod, retryUntil time.Time, maxTermination int64, fullEvictionPod bool) status.PodEvictionResult { ctx.Recorder.Eventf(podToEvict, apiv1.EventTypeNormal, "ScaleDown", "deleting pod for node scale down") - maxTermination := int64(apiv1.DefaultTerminationGracePeriodSeconds) + termination := int64(apiv1.DefaultTerminationGracePeriodSeconds) if podToEvict.Spec.TerminationGracePeriodSeconds != nil { - if *podToEvict.Spec.TerminationGracePeriodSeconds < int64(ctx.MaxGracefulTerminationSec) { - maxTermination = *podToEvict.Spec.TerminationGracePeriodSeconds - } else { - maxTermination = int64(ctx.MaxGracefulTerminationSec) - } + termination = *podToEvict.Spec.TerminationGracePeriodSeconds + } + if maxTermination > 0 && termination > maxTermination { + termination = maxTermination } var lastError error - for first := true; first || time.Now().Before(retryUntil); time.Sleep(waitBetweenRetries) { + for first := true; first || time.Now().Before(retryUntil); time.Sleep(e.EvictionRetryTime) { first = false eviction := &policyv1beta1.Eviction{ ObjectMeta: metav1.ObjectMeta{ @@ -236,25 +235,25 @@ func evictPod(ctx *acontext.AutoscalingContext, podToEvict *apiv1.Pod, isDaemonS Name: podToEvict.Name, }, DeleteOptions: &metav1.DeleteOptions{ - GracePeriodSeconds: &maxTermination, + GracePeriodSeconds: &termination, }, } lastError = ctx.ClientSet.CoreV1().Pods(podToEvict.Namespace).Evict(context.TODO(), eviction) if lastError == nil || kube_errors.IsNotFound(lastError) { - if evictionRegister != nil { - evictionRegister.RegisterEviction(podToEvict) + if e.evictionRegister != nil { + e.evictionRegister.RegisterEviction(podToEvict) } return status.PodEvictionResult{Pod: podToEvict, TimedOut: false, Err: nil} } } - if !isDaemonSetPod { + if 
fullEvictionPod { klog.Errorf("Failed to evict pod %s, error: %v", podToEvict.Name, lastError) ctx.Recorder.Eventf(podToEvict, apiv1.EventTypeWarning, "ScaleDownFailed", "failed to delete pod for ScaleDown") } return status.PodEvictionResult{Pod: podToEvict, TimedOut: true, Err: fmt.Errorf("failed to evict pod %s/%s within allowed timeout (last error: %v)", podToEvict.Namespace, podToEvict.Name, lastError)} } -func podsToEvict(ctx *acontext.AutoscalingContext, nodeInfo *framework.NodeInfo) (dsPods, nonDsPods []*apiv1.Pod) { +func podsToEvict(nodeInfo *framework.NodeInfo, evictDsByDefault bool) (dsPods, nonDsPods []*apiv1.Pod) { for _, podInfo := range nodeInfo.Pods { if pod_util.IsMirrorPod(podInfo.Pod) { continue @@ -264,6 +263,12 @@ func podsToEvict(ctx *acontext.AutoscalingContext, nodeInfo *framework.NodeInfo) nonDsPods = append(nonDsPods, podInfo.Pod) } } - dsPodsToEvict := daemonset.PodsToEvict(dsPods, ctx.DaemonSetEvictionForOccupiedNodes) + dsPodsToEvict := daemonset.PodsToEvict(dsPods, evictDsByDefault) return dsPodsToEvict, nonDsPods } + +type podEvictionGroup struct { + kubelet_config.ShutdownGracePeriodByPodPriority + FullEvictionPods []*apiv1.Pod + BestEffortEvictionPods []*apiv1.Pod +} diff --git a/cluster-autoscaler/core/scaledown/actuation/drain_test.go b/cluster-autoscaler/core/scaledown/actuation/drain_test.go index ed2ae3922a58..3baf252d1dd3 100644 --- a/cluster-autoscaler/core/scaledown/actuation/drain_test.go +++ b/cluster-autoscaler/core/scaledown/actuation/drain_test.go @@ -31,10 +31,6 @@ import ( "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" - "k8s.io/client-go/kubernetes/fake" - core "k8s.io/client-go/testing" - "k8s.io/kubernetes/pkg/kubelet/types" - testprovider "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/test" "k8s.io/autoscaler/cluster-autoscaler/config" acontext "k8s.io/autoscaler/cluster-autoscaler/context" @@ -44,10 +40,13 @@ import ( "k8s.io/autoscaler/cluster-autoscaler/utils/daemonset" kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes" . 
"k8s.io/autoscaler/cluster-autoscaler/utils/test" + "k8s.io/client-go/kubernetes/fake" + core "k8s.io/client-go/testing" + kubelet_config "k8s.io/kubernetes/pkg/kubelet/apis/config" + "k8s.io/kubernetes/pkg/kubelet/types" ) func TestDaemonSetEvictionForEmptyNodes(t *testing.T) { - timeNow := time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC) testScenarios := []struct { name string dsPods []string @@ -58,6 +57,8 @@ func TestDaemonSetEvictionForEmptyNodes(t *testing.T) { evictByDefault bool extraAnnotationValue map[string]string expectNotEvicted map[string]struct{} + fullDsEviction bool + podPriorities []int32 }{ { name: "Successful attempt to evict DaemonSet pods", @@ -66,22 +67,52 @@ func TestDaemonSetEvictionForEmptyNodes(t *testing.T) { evictionSuccess: true, evictByDefault: true, }, + { + name: "Evict single pod due to annotation", + dsPods: []string{"d1", "d2"}, + dsEvictionTimeout: 5000 * time.Millisecond, + evictionSuccess: true, + extraAnnotationValue: map[string]string{"d1": "true"}, + expectNotEvicted: map[string]struct{}{"d2": {}}, + }, + { + name: "Don't evict single pod due to annotation", + dsPods: []string{"d1", "d2"}, + dsEvictionTimeout: 5000 * time.Millisecond, + evictionSuccess: true, + evictByDefault: true, + extraAnnotationValue: map[string]string{"d1": "false"}, + expectNotEvicted: map[string]struct{}{"d1": {}}, + }, { name: "Failed to create DaemonSet eviction", dsPods: []string{"d1", "d2"}, dsEvictionTimeout: 5000 * time.Millisecond, evictionSuccess: false, - err: fmt.Errorf("following DaemonSet pod failed to evict on the"), + err: fmt.Errorf("Failed to drain node /n1, due to following errors"), evictByDefault: true, + fullDsEviction: true, + podPriorities: []int32{0, 1000}, }, { name: "Eviction timeout exceed", dsPods: []string{"d1", "d2", "d3"}, evictionTimeoutExceed: true, dsEvictionTimeout: 100 * time.Millisecond, - evictionSuccess: true, - err: fmt.Errorf("failed to create DaemonSet eviction for"), + evictionSuccess: false, + err: fmt.Errorf("Failed to drain node /n1, due to following errors"), evictByDefault: true, + fullDsEviction: true, + podPriorities: []int32{0, 1000, 2000}, + }, + { + name: "Successful attempt to evict DaemonSet pods", + dsPods: []string{"d1", "d2"}, + dsEvictionTimeout: 5000 * time.Millisecond, + evictionSuccess: true, + evictByDefault: true, + fullDsEviction: true, + podPriorities: []int32{0, 1000}, }, { name: "Evict single pod due to annotation", @@ -90,6 +121,8 @@ func TestDaemonSetEvictionForEmptyNodes(t *testing.T) { evictionSuccess: true, extraAnnotationValue: map[string]string{"d1": "true"}, expectNotEvicted: map[string]struct{}{"d2": {}}, + fullDsEviction: true, + podPriorities: []int32{0, 1000}, }, { name: "Don't evict single pod due to annotation", @@ -99,6 +132,8 @@ func TestDaemonSetEvictionForEmptyNodes(t *testing.T) { evictByDefault: true, extraAnnotationValue: map[string]string{"d1": "false"}, expectNotEvicted: map[string]struct{}{"d1": {}}, + fullDsEviction: true, + podPriorities: []int32{0, 1000}, }, } @@ -113,6 +148,7 @@ func TestDaemonSetEvictionForEmptyNodes(t *testing.T) { }, MaxGracefulTerminationSec: 1, DaemonSetEvictionForEmptyNodes: scenario.evictByDefault, + MaxPodEvictionTime: scenario.dsEvictionTimeout, } deletedPods := make(chan string, len(scenario.dsPods)+2) waitBetweenRetries := 10 * time.Millisecond @@ -122,9 +158,11 @@ func TestDaemonSetEvictionForEmptyNodes(t *testing.T) { SetNodeReadyState(n1, true, time.Time{}) dsPods := make([]*apiv1.Pod, len(scenario.dsPods)) for i, dsName := range 
scenario.dsPods { - ds := BuildTestPod(dsName, 100, 0) + ds := BuildTestPod(dsName, 100, 0, WithDSController()) ds.Spec.NodeName = "n1" - ds.OwnerReferences = GenerateOwnerReferences("", "DaemonSet", "", "") + if scenario.fullDsEviction { + ds.Spec.Priority = &scenario.podPriorities[i] + } if v, ok := scenario.extraAnnotationValue[dsName]; ok { ds.Annotations[daemonset.EnableDsEvictionKey] = v } @@ -159,13 +197,24 @@ func TestDaemonSetEvictionForEmptyNodes(t *testing.T) { clustersnapshot.InitializeClusterSnapshotOrDie(t, context.ClusterSnapshot, []*apiv1.Node{n1}, dsPods) + drainConfig := SingleRuleDrainConfig(context.MaxGracefulTerminationSec) + if scenario.fullDsEviction { + drainConfig = []kubelet_config.ShutdownGracePeriodByPodPriority{} + for _, priority := range scenario.podPriorities { + drainConfig = append(drainConfig, kubelet_config.ShutdownGracePeriodByPodPriority{ + Priority: priority, + ShutdownGracePeriodSeconds: int64(context.MaxGracefulTerminationSec), + }) + } + } evictor := Evictor{ - DsEvictionEmptyNodeTimeout: scenario.dsEvictionTimeout, - DsEvictionRetryTime: waitBetweenRetries, + EvictionRetryTime: waitBetweenRetries, + shutdownGracePeriodByPodPriority: drainConfig, + fullDsEviction: scenario.fullDsEviction, } nodeInfo, err := context.ClusterSnapshot.NodeInfos().Get(n1.Name) assert.NoError(t, err) - err = evictor.EvictDaemonSetPods(&context, nodeInfo, timeNow) + _, err = evictor.EvictDaemonSetPods(&context, nodeInfo) if scenario.err != nil { assert.NotNil(t, err) assert.Contains(t, err.Error(), scenario.err.Error()) @@ -183,7 +232,11 @@ func TestDaemonSetEvictionForEmptyNodes(t *testing.T) { for i := 0; i < len(expectEvicted); i++ { deleted[i] = utils.GetStringFromChan(deletedPods) } - assert.ElementsMatch(t, deleted, expectEvicted) + if scenario.fullDsEviction { + assert.Equal(t, expectEvicted, deleted) + } else { + assert.ElementsMatch(t, deleted, expectEvicted) + } }) } } @@ -192,10 +245,10 @@ func TestDrainNodeWithPods(t *testing.T) { deletedPods := make(chan string, 10) fakeClient := &fake.Clientset{} - p1 := BuildTestPod("p1", 100, 0) - p2 := BuildTestPod("p2", 300, 0) - d1 := BuildTestPod("d1", 150, 0) n1 := BuildTestNode("n1", 1000, 1000) + p1 := BuildTestPod("p1", 100, 0, WithNodeName(n1.Name)) + p2 := BuildTestPod("p2", 300, 0, WithNodeName(n1.Name)) + d1 := BuildTestPod("d1", 150, 0, WithNodeName(n1.Name), WithDSController()) SetNodeReadyState(n1, true, time.Time{}) @@ -216,14 +269,23 @@ func TestDrainNodeWithPods(t *testing.T) { }) options := config.AutoscalingOptions{ - MaxGracefulTerminationSec: 20, - MaxPodEvictionTime: 5 * time.Second, + MaxGracefulTerminationSec: 20, + MaxPodEvictionTime: 5 * time.Second, + DaemonSetEvictionForOccupiedNodes: true, } ctx, err := NewScaleTestAutoscalingContext(options, fakeClient, nil, nil, nil, nil) assert.NoError(t, err) - evictor := Evictor{EvictionRetryTime: 0, PodEvictionHeadroom: DefaultPodEvictionHeadroom} - _, err = evictor.DrainNodeWithPods(&ctx, n1, []*apiv1.Pod{p1, p2}, []*apiv1.Pod{d1}) + legacyFlagDrainConfig := SingleRuleDrainConfig(ctx.MaxGracefulTerminationSec) + evictor := Evictor{ + EvictionRetryTime: 0, + PodEvictionHeadroom: DefaultPodEvictionHeadroom, + shutdownGracePeriodByPodPriority: legacyFlagDrainConfig, + } + clustersnapshot.InitializeClusterSnapshotOrDie(t, ctx.ClusterSnapshot, []*apiv1.Node{n1}, []*apiv1.Pod{p1, p2, d1}) + nodeInfo, err := ctx.ClusterSnapshot.NodeInfos().Get(n1.Name) + assert.NoError(t, err) + _, err = evictor.DrainNode(&ctx, nodeInfo) assert.NoError(t, err) deleted := 
make([]string, 0) deleted = append(deleted, utils.GetStringFromChan(deletedPods)) @@ -240,11 +302,12 @@ func TestDrainNodeWithPodsWithRescheduled(t *testing.T) { deletedPods := make(chan string, 10) fakeClient := &fake.Clientset{} - p1 := BuildTestPod("p1", 100, 0) - p2 := BuildTestPod("p2", 300, 0) + n1 := BuildTestNode("n1", 1000, 1000) + p1 := BuildTestPod("p1", 100, 0, WithNodeName(n1.Name)) + p2 := BuildTestPod("p2", 300, 0, WithNodeName(n1.Name)) p2Rescheduled := BuildTestPod("p2", 300, 0) p2Rescheduled.Spec.NodeName = "n2" - n1 := BuildTestNode("n1", 1000, 1000) + SetNodeReadyState(n1, true, time.Time{}) fakeClient.Fake.AddReactor("get", "pods", func(action core.Action) (bool, runtime.Object, error) { @@ -277,8 +340,16 @@ func TestDrainNodeWithPodsWithRescheduled(t *testing.T) { ctx, err := NewScaleTestAutoscalingContext(options, fakeClient, nil, nil, nil, nil) assert.NoError(t, err) - evictor := Evictor{EvictionRetryTime: 0, PodEvictionHeadroom: DefaultPodEvictionHeadroom} - _, err = evictor.DrainNodeWithPods(&ctx, n1, []*apiv1.Pod{p1, p2}, []*apiv1.Pod{}) + legacyFlagDrainConfig := SingleRuleDrainConfig(ctx.MaxGracefulTerminationSec) + evictor := Evictor{ + EvictionRetryTime: 0, + PodEvictionHeadroom: DefaultPodEvictionHeadroom, + shutdownGracePeriodByPodPriority: legacyFlagDrainConfig, + } + clustersnapshot.InitializeClusterSnapshotOrDie(t, ctx.ClusterSnapshot, []*apiv1.Node{n1}, []*apiv1.Pod{p1, p2}) + nodeInfo, err := ctx.ClusterSnapshot.NodeInfos().Get(n1.Name) + assert.NoError(t, err) + _, err = evictor.DrainNode(&ctx, nodeInfo) assert.NoError(t, err) deleted := make([]string, 0) deleted = append(deleted, utils.GetStringFromChan(deletedPods)) @@ -297,11 +368,12 @@ func TestDrainNodeWithPodsWithRetries(t *testing.T) { ticket := make(chan bool, 1) fakeClient := &fake.Clientset{} - p1 := BuildTestPod("p1", 100, 0) - p2 := BuildTestPod("p2", 300, 0) - p3 := BuildTestPod("p3", 300, 0) - d1 := BuildTestPod("d1", 150, 0) n1 := BuildTestNode("n1", 1000, 1000) + p1 := BuildTestPod("p1", 100, 0, WithNodeName(n1.Name)) + p2 := BuildTestPod("p2", 300, 0, WithNodeName(n1.Name)) + p3 := BuildTestPod("p3", 300, 0, WithNodeName(n1.Name)) + d1 := BuildTestPod("d1", 150, 0, WithDSController(), WithNodeName(n1.Name)) + SetNodeReadyState(n1, true, time.Time{}) fakeClient.Fake.AddReactor("get", "pods", func(action core.Action) (bool, runtime.Object, error) { @@ -330,14 +402,23 @@ func TestDrainNodeWithPodsWithRetries(t *testing.T) { }) options := config.AutoscalingOptions{ - MaxGracefulTerminationSec: 20, - MaxPodEvictionTime: 5 * time.Second, + MaxGracefulTerminationSec: 20, + MaxPodEvictionTime: 5 * time.Second, + DaemonSetEvictionForOccupiedNodes: true, } ctx, err := NewScaleTestAutoscalingContext(options, fakeClient, nil, nil, nil, nil) assert.NoError(t, err) - evictor := Evictor{EvictionRetryTime: 0, PodEvictionHeadroom: DefaultPodEvictionHeadroom} - _, err = evictor.DrainNodeWithPods(&ctx, n1, []*apiv1.Pod{p1, p2, p3}, []*apiv1.Pod{d1}) + legacyFlagDrainConfig := SingleRuleDrainConfig(ctx.MaxGracefulTerminationSec) + evictor := Evictor{ + EvictionRetryTime: 0, + PodEvictionHeadroom: DefaultPodEvictionHeadroom, + shutdownGracePeriodByPodPriority: legacyFlagDrainConfig, + } + clustersnapshot.InitializeClusterSnapshotOrDie(t, ctx.ClusterSnapshot, []*apiv1.Node{n1}, []*apiv1.Pod{p1, p2, p3, d1}) + nodeInfo, err := ctx.ClusterSnapshot.NodeInfos().Get(n1.Name) + assert.NoError(t, err) + _, err = evictor.DrainNode(&ctx, nodeInfo) assert.NoError(t, err) deleted := make([]string, 0) deleted = 
append(deleted, utils.GetStringFromChan(deletedPods)) @@ -354,11 +435,12 @@ func TestDrainNodeWithPodsWithRetries(t *testing.T) { func TestDrainNodeWithPodsDaemonSetEvictionFailure(t *testing.T) { fakeClient := &fake.Clientset{} - p1 := BuildTestPod("p1", 100, 0) - p2 := BuildTestPod("p2", 300, 0) - d1 := BuildTestPod("d1", 150, 0) - d2 := BuildTestPod("d2", 250, 0) n1 := BuildTestNode("n1", 1000, 1000) + p1 := BuildTestPod("p1", 100, 0, WithNodeName(n1.Name)) + p2 := BuildTestPod("p2", 300, 0, WithNodeName(n1.Name)) + d1 := BuildTestPod("d1", 150, 0, WithDSController(), WithNodeName(n1.Name)) + d2 := BuildTestPod("d2", 250, 0, WithDSController(), WithNodeName(n1.Name)) + e1 := fmt.Errorf("eviction_error: d1") e2 := fmt.Errorf("eviction_error: d2") @@ -390,8 +472,16 @@ func TestDrainNodeWithPodsDaemonSetEvictionFailure(t *testing.T) { ctx, err := NewScaleTestAutoscalingContext(options, fakeClient, nil, nil, nil, nil) assert.NoError(t, err) - evictor := Evictor{EvictionRetryTime: 0, PodEvictionHeadroom: DefaultPodEvictionHeadroom} - evictionResults, err := evictor.DrainNodeWithPods(&ctx, n1, []*apiv1.Pod{p1, p2}, []*apiv1.Pod{d1, d2}) + legacyFlagDrainConfig := SingleRuleDrainConfig(ctx.MaxGracefulTerminationSec) + evictor := Evictor{ + EvictionRetryTime: 0, + PodEvictionHeadroom: DefaultPodEvictionHeadroom, + shutdownGracePeriodByPodPriority: legacyFlagDrainConfig, + } + clustersnapshot.InitializeClusterSnapshotOrDie(t, ctx.ClusterSnapshot, []*apiv1.Node{n1}, []*apiv1.Pod{p1, p2, d1, d2}) + nodeInfo, err := ctx.ClusterSnapshot.NodeInfos().Get(n1.Name) + assert.NoError(t, err) + evictionResults, err := evictor.DrainNode(&ctx, nodeInfo) assert.NoError(t, err) assert.Equal(t, 2, len(evictionResults)) assert.Equal(t, p1, evictionResults["p1"].Pod) @@ -407,11 +497,11 @@ func TestDrainNodeWithPodsDaemonSetEvictionFailure(t *testing.T) { func TestDrainNodeWithPodsEvictionFailure(t *testing.T) { fakeClient := &fake.Clientset{} - p1 := BuildTestPod("p1", 100, 0) - p2 := BuildTestPod("p2", 100, 0) - p3 := BuildTestPod("p3", 100, 0) - p4 := BuildTestPod("p4", 100, 0) n1 := BuildTestNode("n1", 1000, 1000) + p1 := BuildTestPod("p1", 100, 0, WithNodeName(n1.Name)) + p2 := BuildTestPod("p2", 100, 0, WithNodeName(n1.Name)) + p3 := BuildTestPod("p3", 100, 0, WithNodeName(n1.Name)) + p4 := BuildTestPod("p4", 100, 0, WithNodeName(n1.Name)) e2 := fmt.Errorf("eviction_error: p2") e4 := fmt.Errorf("eviction_error: p4") SetNodeReadyState(n1, true, time.Time{}) @@ -442,8 +532,17 @@ func TestDrainNodeWithPodsEvictionFailure(t *testing.T) { ctx, err := NewScaleTestAutoscalingContext(options, fakeClient, nil, nil, nil, nil) assert.NoError(t, err) r := evRegister{} - evictor := Evictor{EvictionRetryTime: 0, PodEvictionHeadroom: DefaultPodEvictionHeadroom, evictionRegister: &r} - evictionResults, err := evictor.DrainNodeWithPods(&ctx, n1, []*apiv1.Pod{p1, p2, p3, p4}, []*apiv1.Pod{}) + legacyFlagDrainConfig := SingleRuleDrainConfig(ctx.MaxGracefulTerminationSec) + evictor := Evictor{ + EvictionRetryTime: 0, + PodEvictionHeadroom: DefaultPodEvictionHeadroom, + evictionRegister: &r, + shutdownGracePeriodByPodPriority: legacyFlagDrainConfig, + } + clustersnapshot.InitializeClusterSnapshotOrDie(t, ctx.ClusterSnapshot, []*apiv1.Node{n1}, []*apiv1.Pod{p1, p2, p3, p4}) + nodeInfo, err := ctx.ClusterSnapshot.NodeInfos().Get(n1.Name) + assert.NoError(t, err) + evictionResults, err := evictor.DrainNode(&ctx, nodeInfo) assert.Error(t, err) assert.Equal(t, 4, len(evictionResults)) assert.Equal(t, *p1, *evictionResults["p1"].Pod) 
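For illustration only (not part of the patch): the tests above wrap the legacy --max-graceful-termination-sec value via SingleRuleDrainConfig, which is defined in priority.go later in this diff. A minimal usage sketch, assuming the exported helper is called from outside the actuation package:

package main

import (
	"fmt"

	"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/actuation"
)

func main() {
	// SingleRuleDrainConfig wraps the legacy timeout into one catch-all rule
	// (Priority: math.MaxInt32), so every pod falls into a single eviction
	// group and ordering is effectively unchanged from the pre-existing drain.
	fmt.Printf("%+v\n", actuation.SingleRuleDrainConfig(20))
	// Output: [{Priority:2147483647 ShutdownGracePeriodSeconds:20}]
}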
@@ -468,12 +567,12 @@ func TestDrainNodeWithPodsEvictionFailure(t *testing.T) { func TestDrainWithPodsNodeDisappearanceFailure(t *testing.T) { fakeClient := &fake.Clientset{} - p1 := BuildTestPod("p1", 100, 0) - p2 := BuildTestPod("p2", 100, 0) - p3 := BuildTestPod("p3", 100, 0) - p4 := BuildTestPod("p4", 100, 0) - e2 := fmt.Errorf("disappearance_error: p2") n1 := BuildTestNode("n1", 1000, 1000) + p1 := BuildTestPod("p1", 100, 0, WithNodeName(n1.Name)) + p2 := BuildTestPod("p2", 100, 0, WithNodeName(n1.Name)) + p3 := BuildTestPod("p3", 100, 0, WithNodeName(n1.Name)) + p4 := BuildTestPod("p4", 100, 0, WithNodeName(n1.Name)) + e2 := fmt.Errorf("disappearance_error: p2") SetNodeReadyState(n1, true, time.Time{}) fakeClient.Fake.AddReactor("get", "pods", func(action core.Action) (bool, runtime.Object, error) { @@ -500,8 +599,16 @@ func TestDrainWithPodsNodeDisappearanceFailure(t *testing.T) { ctx, err := NewScaleTestAutoscalingContext(options, fakeClient, nil, nil, nil, nil) assert.NoError(t, err) - evictor := Evictor{EvictionRetryTime: 0, PodEvictionHeadroom: 0} - evictionResults, err := evictor.DrainNodeWithPods(&ctx, n1, []*apiv1.Pod{p1, p2, p3, p4}, []*apiv1.Pod{}) + legacyFlagDrainConfig := SingleRuleDrainConfig(ctx.MaxGracefulTerminationSec) + evictor := Evictor{ + EvictionRetryTime: 0, + PodEvictionHeadroom: 0, + shutdownGracePeriodByPodPriority: legacyFlagDrainConfig, + } + clustersnapshot.InitializeClusterSnapshotOrDie(t, ctx.ClusterSnapshot, []*apiv1.Node{n1}, []*apiv1.Pod{p1, p2, p3, p4}) + nodeInfo, err := ctx.ClusterSnapshot.NodeInfos().Get(n1.Name) + assert.NoError(t, err) + evictionResults, err := evictor.DrainNode(&ctx, nodeInfo) assert.Error(t, err) assert.Equal(t, 4, len(evictionResults)) assert.Equal(t, *p1, *evictionResults["p1"].Pod) @@ -593,7 +700,7 @@ func TestPodsToEvict(t *testing.T) { if err != nil { t.Fatalf("NodeInfos().Get() unexpected error: %v", err) } - gotDsPods, gotNonDsPods := podsToEvict(ctx, nodeInfo) + gotDsPods, gotNonDsPods := podsToEvict(nodeInfo, ctx.DaemonSetEvictionForOccupiedNodes) if diff := cmp.Diff(tc.wantDsPods, gotDsPods, cmpopts.EquateEmpty()); diff != "" { t.Errorf("podsToEvict dsPods diff (-want +got):\n%s", diff) } diff --git a/cluster-autoscaler/core/scaledown/actuation/group_deletion_scheduler.go b/cluster-autoscaler/core/scaledown/actuation/group_deletion_scheduler.go index e8aa9676932e..41d3e0a7b8b5 100644 --- a/cluster-autoscaler/core/scaledown/actuation/group_deletion_scheduler.go +++ b/cluster-autoscaler/core/scaledown/actuation/group_deletion_scheduler.go @@ -18,7 +18,6 @@ package actuation import ( "sync" - "time" apiv1 "k8s.io/api/core/v1" "k8s.io/klog/v2" @@ -104,7 +103,7 @@ func (ds *GroupDeletionScheduler) prepareNodeForDeletion(nodeInfo *framework.Nod return status.NodeDeleteResult{ResultType: status.NodeDeleteErrorFailedToEvictPods, Err: err, PodEvictionResults: evictionResults} } } else { - if err := ds.evictor.EvictDaemonSetPods(ds.ctx, nodeInfo, time.Now()); err != nil { + if _, err := ds.evictor.EvictDaemonSetPods(ds.ctx, nodeInfo); err != nil { // Evicting DS pods is best-effort, so proceed with the deletion even if there are errors. 
klog.Warningf("Error while evicting DS pods from an empty node %q: %v", node.Name, err) } diff --git a/cluster-autoscaler/core/scaledown/actuation/group_deletion_scheduler_test.go b/cluster-autoscaler/core/scaledown/actuation/group_deletion_scheduler_test.go index 8149bb33e234..86d584d89c0c 100644 --- a/cluster-autoscaler/core/scaledown/actuation/group_deletion_scheduler_test.go +++ b/cluster-autoscaler/core/scaledown/actuation/group_deletion_scheduler_test.go @@ -126,7 +126,7 @@ func TestScheduleDeletion(t *testing.T) { if err != nil { t.Fatalf("Couldn't set up autoscaling context: %v", err) } - scheduler := NewGroupDeletionScheduler(&ctx, tracker, batcher, Evictor{EvictionRetryTime: 0, DsEvictionRetryTime: 0, DsEvictionEmptyNodeTimeout: 0, PodEvictionHeadroom: DefaultPodEvictionHeadroom}) + scheduler := NewGroupDeletionScheduler(&ctx, tracker, batcher, Evictor{EvictionRetryTime: 0, PodEvictionHeadroom: DefaultPodEvictionHeadroom}) if err := scheduleAll(tc.toSchedule, scheduler); err != nil { t.Fatal(err) diff --git a/cluster-autoscaler/core/scaledown/actuation/priority.go b/cluster-autoscaler/core/scaledown/actuation/priority.go new file mode 100644 index 000000000000..662a783eda1c --- /dev/null +++ b/cluster-autoscaler/core/scaledown/actuation/priority.go @@ -0,0 +1,117 @@ +/* +Copyright 2022 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package actuation + +import ( + "math" + "sort" + "strconv" + "strings" + + apiv1 "k8s.io/api/core/v1" + "k8s.io/klog/v2" + kubelet_config "k8s.io/kubernetes/pkg/kubelet/apis/config" +) + +func groupByPriority(shutdownGracePeriodByPodPriority []kubelet_config.ShutdownGracePeriodByPodPriority, fullEvictionPods, bestEffortEvictionPods []*apiv1.Pod) []podEvictionGroup { + groups := make([]podEvictionGroup, 0, len(shutdownGracePeriodByPodPriority)) + for _, period := range shutdownGracePeriodByPodPriority { + groups = append(groups, podEvictionGroup{ + ShutdownGracePeriodByPodPriority: period, + }) + } + + for _, pod := range fullEvictionPods { + index := groupIndex(pod, groups) + groups[index].FullEvictionPods = append(groups[index].FullEvictionPods, pod) + } + for _, pod := range bestEffortEvictionPods { + index := groupIndex(pod, groups) + groups[index].BestEffortEvictionPods = append(groups[index].BestEffortEvictionPods, pod) + } + return groups +} + +func groupIndex(pod *apiv1.Pod, groups []podEvictionGroup) int { + var priority int32 + if pod.Spec.Priority != nil { + priority = *pod.Spec.Priority + } + + // Find the group index according to the priority. + index := sort.Search(len(groups), func(i int) bool { + return (groups)[i].Priority >= priority + }) + + // 1. Those higher than the highest priority default to the highest priority + // 2. Those lower than the lowest priority default to the lowest priority + // 3. Those boundary priority default to the lower priority + // if priority of pod is: + // groups[index-1].Priority <= pod priority < groups[index].Priority + // in which case we want to pick lower one (i.e. 
index-1) + if index == len(groups) { + index = len(groups) - 1 + } else if index < 0 { + index = 0 + } else if index > 0 && (groups)[index].Priority > priority { + index-- + } + return index +} + +// ParseShutdownGracePeriodsAndPriorities parse priorityGracePeriodStr and returns an array of ShutdownGracePeriodByPodPriority if succeeded. +// Otherwise, returns an empty list +func ParseShutdownGracePeriodsAndPriorities(priorityGracePeriodStr string) []kubelet_config.ShutdownGracePeriodByPodPriority { + var priorityGracePeriodMap, emptyMap []kubelet_config.ShutdownGracePeriodByPodPriority + + if priorityGracePeriodStr == "" { + return emptyMap + } + priorityGracePeriodStrArr := strings.Split(priorityGracePeriodStr, ",") + for _, item := range priorityGracePeriodStrArr { + priorityAndPeriod := strings.Split(item, ":") + if len(priorityAndPeriod) != 2 { + klog.Errorf("Parsing shutdown grace periods failed because '%s' is not a priority and grace period couple separated by ':'", item) + return emptyMap + } + priority, err := strconv.Atoi(priorityAndPeriod[0]) + if err != nil { + klog.Errorf("Parsing shutdown grace periods and priorities failed: %v", err) + return emptyMap + } + shutDownGracePeriod, err := strconv.Atoi(priorityAndPeriod[1]) + if err != nil { + klog.Errorf("Parsing shutdown grace periods and priorities failed: %v", err) + return emptyMap + } + priorityGracePeriodMap = append(priorityGracePeriodMap, kubelet_config.ShutdownGracePeriodByPodPriority{ + Priority: int32(priority), + ShutdownGracePeriodSeconds: int64(shutDownGracePeriod), + }) + } + return priorityGracePeriodMap +} + +// SingleRuleDrainConfig returns an array of ShutdownGracePeriodByPodPriority with a single ShutdownGracePeriodByPodPriority +func SingleRuleDrainConfig(shutdownGracePeriodSeconds int) []kubelet_config.ShutdownGracePeriodByPodPriority { + return []kubelet_config.ShutdownGracePeriodByPodPriority{ + { + Priority: math.MaxInt32, + ShutdownGracePeriodSeconds: int64(shutdownGracePeriodSeconds), + }, + } +} diff --git a/cluster-autoscaler/core/scaledown/actuation/priority_test.go b/cluster-autoscaler/core/scaledown/actuation/priority_test.go new file mode 100644 index 000000000000..4b50be682bf5 --- /dev/null +++ b/cluster-autoscaler/core/scaledown/actuation/priority_test.go @@ -0,0 +1,239 @@ +/* +Copyright 2022 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package actuation + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" + apiv1 "k8s.io/api/core/v1" + policyv1beta1 "k8s.io/api/policy/v1beta1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/autoscaler/cluster-autoscaler/config" + . "k8s.io/autoscaler/cluster-autoscaler/core/test" + "k8s.io/autoscaler/cluster-autoscaler/core/utils" + "k8s.io/autoscaler/cluster-autoscaler/simulator/clustersnapshot" + . 
"k8s.io/autoscaler/cluster-autoscaler/utils/test" + "k8s.io/client-go/kubernetes/fake" + core "k8s.io/client-go/testing" + kubelet_config "k8s.io/kubernetes/pkg/kubelet/apis/config" +) + +func TestPriorityEvictor(t *testing.T) { + deletedPods := make(chan string, 10) + fakeClient := &fake.Clientset{} + + n1 := BuildTestNode("n1", 1000, 1000) + p1 := BuildTestPod("p1", 100, 0, WithNodeName(n1.Name)) + p2 := BuildTestPod("p2", 300, 0, WithNodeName(n1.Name)) + p3 := BuildTestPod("p3", 150, 0, WithNodeName(n1.Name)) + + priority100 := int32(100) + priority2000 := int32(2000) + priority2000000005 := int32(2000000005) + p1.Spec.Priority = &priority2000000005 + p2.Spec.Priority = &priority2000 + p3.Spec.Priority = &priority100 + + SetNodeReadyState(n1, true, time.Time{}) + + fakeClient.Fake.AddReactor("get", "pods", func(action core.Action) (bool, runtime.Object, error) { + return true, nil, errors.NewNotFound(apiv1.Resource("pod"), "whatever") + }) + fakeClient.Fake.AddReactor("create", "pods", func(action core.Action) (bool, runtime.Object, error) { + createAction := action.(core.CreateAction) + if createAction == nil { + return false, nil, nil + } + eviction := createAction.GetObject().(*policyv1beta1.Eviction) + if eviction == nil { + return false, nil, nil + } + deletedPods <- eviction.Name + return true, nil, nil + }) + + options := config.AutoscalingOptions{ + MaxGracefulTerminationSec: 20, + MaxPodEvictionTime: 5 * time.Second, + } + ctx, err := NewScaleTestAutoscalingContext(options, fakeClient, nil, nil, nil, nil) + assert.NoError(t, err) + + evictor := Evictor{ + EvictionRetryTime: 0, + PodEvictionHeadroom: DefaultPodEvictionHeadroom, + shutdownGracePeriodByPodPriority: []kubelet_config.ShutdownGracePeriodByPodPriority{ + { + Priority: 0, + ShutdownGracePeriodSeconds: 3, + }, + { + Priority: 1000, + ShutdownGracePeriodSeconds: 2, + }, + { + Priority: 2000000000, + ShutdownGracePeriodSeconds: 1, + }, + }, + fullDsEviction: true, + } + clustersnapshot.InitializeClusterSnapshotOrDie(t, ctx.ClusterSnapshot, []*apiv1.Node{n1}, []*apiv1.Pod{p1, p2, p3}) + nodeInfo, err := ctx.ClusterSnapshot.NodeInfos().Get(n1.Name) + assert.NoError(t, err) + _, err = evictor.DrainNode(&ctx, nodeInfo) + assert.NoError(t, err) + deleted := make([]string, 0) + deleted = append(deleted, utils.GetStringFromChan(deletedPods)) + deleted = append(deleted, utils.GetStringFromChan(deletedPods)) + deleted = append(deleted, utils.GetStringFromChan(deletedPods)) + + assert.Equal(t, p3.Name, deleted[0]) + assert.Equal(t, p2.Name, deleted[1]) + assert.Equal(t, p1.Name, deleted[2]) +} + +func TestGroupByPriority(t *testing.T) { + p1 := BuildTestPod("p1", 100, 0) + p2 := BuildTestPod("p2", 300, 0) + p3 := BuildTestPod("p3", 150, 0) + p4 := BuildTestPod("p4", 100, 0) + p5 := BuildTestPod("p5", 300, 0) + + p6 := BuildTestPod("p6", 100, 0) + p7 := BuildTestPod("p7", 300, 0) + p8 := BuildTestPod("p8", 150, 0) + p9 := BuildTestPod("p9", 100, 0) + p10 := BuildTestPod("p10", 300, 0) + + priority0 := int32(0) + priority100 := int32(100) + priority500 := int32(500) + priority1000 := int32(1000) + priority2000000005 := int32(2000000005) + p1.Spec.Priority = &priority2000000005 + p2.Spec.Priority = &priority500 + p3.Spec.Priority = &priority100 + p4.Spec.Priority = &priority0 + p5.Spec.Priority = &priority1000 + + p6.Spec.Priority = &priority2000000005 + p7.Spec.Priority = &priority500 + p8.Spec.Priority = &priority100 + p9.Spec.Priority = &priority0 + p10.Spec.Priority = &priority1000 + + shutdownGracePeriodByPodPriority := 
[]kubelet_config.ShutdownGracePeriodByPodPriority{ + { + Priority: 10, + ShutdownGracePeriodSeconds: 4, + }, + { + Priority: 1000, + ShutdownGracePeriodSeconds: 3, + }, + { + Priority: 2000, + ShutdownGracePeriodSeconds: 2, + }, + { + Priority: 2000000000, + ShutdownGracePeriodSeconds: 1, + }, + } + + wantGroups := []podEvictionGroup{ + { + ShutdownGracePeriodByPodPriority: shutdownGracePeriodByPodPriority[0], + FullEvictionPods: []*apiv1.Pod{p2, p3, p4}, + BestEffortEvictionPods: []*apiv1.Pod{p7, p8, p9}, + }, + { + ShutdownGracePeriodByPodPriority: shutdownGracePeriodByPodPriority[1], + FullEvictionPods: []*apiv1.Pod{p5}, + BestEffortEvictionPods: []*apiv1.Pod{p10}, + }, + { + ShutdownGracePeriodByPodPriority: shutdownGracePeriodByPodPriority[2], + }, + { + ShutdownGracePeriodByPodPriority: shutdownGracePeriodByPodPriority[3], + FullEvictionPods: []*apiv1.Pod{p1}, + BestEffortEvictionPods: []*apiv1.Pod{p6}, + }, + } + + groups := groupByPriority(shutdownGracePeriodByPodPriority, []*apiv1.Pod{p1, p2, p3, p4, p5}, []*apiv1.Pod{p6, p7, p8, p9, p10}) + assert.Equal(t, wantGroups, groups) +} + +func TestParseShutdownGracePeriodsAndPriorities(t *testing.T) { + testCases := []struct { + name string + input string + want []kubelet_config.ShutdownGracePeriodByPodPriority + }{ + { + name: "empty input", + input: "", + want: nil, + }, + { + name: "Incorrect string - incorrect priority grace period pairs", + input: "1:2,34", + want: nil, + }, + { + name: "Incorrect string - trailing ,", + input: "1:2, 3:4,", + want: nil, + }, + { + name: "Incorrect string - trailing space", + input: "1:2,3:4 ", + want: nil, + }, + { + name: "Non integers - 1", + input: "1:2,3:a", + want: nil, + }, + { + name: "Non integers - 2", + input: "1:2,3:23.2", + want: nil, + }, + { + name: "parsable input", + input: "1:2,3:4", + want: []kubelet_config.ShutdownGracePeriodByPodPriority{ + {1, 2}, + {3, 4}, + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + shutdownGracePeriodByPodPriority := ParseShutdownGracePeriodsAndPriorities(tc.input) + assert.Equal(t, tc.want, shutdownGracePeriodByPodPriority) + }) + } +} diff --git a/cluster-autoscaler/core/scaledown/eligibility/eligibility_test.go b/cluster-autoscaler/core/scaledown/eligibility/eligibility_test.go index a40c88e49104..984504bbe93c 100644 --- a/cluster-autoscaler/core/scaledown/eligibility/eligibility_test.go +++ b/cluster-autoscaler/core/scaledown/eligibility/eligibility_test.go @@ -66,7 +66,7 @@ func getTestCases(ignoreDaemonSetsUtilization bool, suffix string, now time.Time smallPod := BuildTestPod("smallPod", 100, 0) smallPod.Spec.NodeName = "regular" - dsPod := BuildDSTestPod("dsPod", 500, 0) + dsPod := BuildTestPod("dsPod", 500, 0, WithDSController()) dsPod.Spec.NodeName = "regular" testCases := []testCase{ diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go index 715e699424e9..318554d99cbe 100644 --- a/cluster-autoscaler/main.go +++ b/cluster-autoscaler/main.go @@ -29,6 +29,11 @@ import ( "syscall" "time" + "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/actuation" + "k8s.io/autoscaler/cluster-autoscaler/debuggingsnapshot" + "k8s.io/autoscaler/cluster-autoscaler/simulator/predicatechecker" + kubelet_config "k8s.io/kubernetes/pkg/kubelet/apis/config" + "github.com/spf13/pflag" "k8s.io/apimachinery/pkg/api/meta" @@ -41,7 +46,6 @@ import ( "k8s.io/autoscaler/cluster-autoscaler/config" "k8s.io/autoscaler/cluster-autoscaler/core" "k8s.io/autoscaler/cluster-autoscaler/core/podlistprocessor" - 
"k8s.io/autoscaler/cluster-autoscaler/debuggingsnapshot" "k8s.io/autoscaler/cluster-autoscaler/estimator" "k8s.io/autoscaler/cluster-autoscaler/expander" "k8s.io/autoscaler/cluster-autoscaler/metrics" @@ -54,7 +58,6 @@ import ( "k8s.io/autoscaler/cluster-autoscaler/simulator/clustersnapshot" "k8s.io/autoscaler/cluster-autoscaler/simulator/drainability/rules" "k8s.io/autoscaler/cluster-autoscaler/simulator/options" - "k8s.io/autoscaler/cluster-autoscaler/simulator/predicatechecker" "k8s.io/autoscaler/cluster-autoscaler/utils/errors" kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes" scheduler_util "k8s.io/autoscaler/cluster-autoscaler/utils/scheduler" @@ -154,14 +157,15 @@ var ( maxBulkSoftTaintCount = flag.Int("max-bulk-soft-taint-count", 10, "Maximum number of nodes that can be tainted/untainted PreferNoSchedule at the same time. Set to 0 to turn off such tainting.") maxBulkSoftTaintTime = flag.Duration("max-bulk-soft-taint-time", 3*time.Second, "Maximum duration of tainting/untainting nodes as PreferNoSchedule at the same time.") maxEmptyBulkDeleteFlag = flag.Int("max-empty-bulk-delete", 10, "Maximum number of empty nodes that can be deleted at the same time.") - maxGracefulTerminationFlag = flag.Int("max-graceful-termination-sec", 10*60, "Maximum number of seconds CA waits for pod termination when trying to scale down a node.") - maxTotalUnreadyPercentage = flag.Float64("max-total-unready-percentage", 45, "Maximum percentage of unready nodes in the cluster. After this is exceeded, CA halts operations") - okTotalUnreadyCount = flag.Int("ok-total-unready-count", 3, "Number of allowed unready nodes, irrespective of max-total-unready-percentage") - scaleUpFromZero = flag.Bool("scale-up-from-zero", true, "Should CA scale up when there are 0 ready nodes.") - parallelScaleUp = flag.Bool("parallel-scale-up", false, "Whether to allow parallel node groups scale up. Experimental: may not work on some cloud providers, enable at your own risk.") - maxNodeProvisionTime = flag.Duration("max-node-provision-time", 15*time.Minute, "The default maximum time CA waits for node to be provisioned - the value can be overridden per node group") - maxPodEvictionTime = flag.Duration("max-pod-eviction-time", 2*time.Minute, "Maximum time CA tries to evict a pod before giving up") - nodeGroupsFlag = multiStringFlag( + maxGracefulTerminationFlag = flag.Int("max-graceful-termination-sec", 10*60, "Maximum number of seconds CA waits for pod termination when trying to scale down a node. "+ + "This flag is mutually exclusion with drain-priority-config flag which allows more configuration options.") + maxTotalUnreadyPercentage = flag.Float64("max-total-unready-percentage", 45, "Maximum percentage of unready nodes in the cluster. After this is exceeded, CA halts operations") + okTotalUnreadyCount = flag.Int("ok-total-unready-count", 3, "Number of allowed unready nodes, irrespective of max-total-unready-percentage") + scaleUpFromZero = flag.Bool("scale-up-from-zero", true, "Should CA scale up when there are 0 ready nodes.") + parallelScaleUp = flag.Bool("parallel-scale-up", false, "Whether to allow parallel node groups scale up. 
+	parallelScaleUp           = flag.Bool("parallel-scale-up", false, "Whether to allow parallel node groups scale up. Experimental: may not work on some cloud providers, enable at your own risk.")
+	maxNodeProvisionTime      = flag.Duration("max-node-provision-time", 15*time.Minute, "The default maximum time CA waits for node to be provisioned - the value can be overridden per node group")
+	maxPodEvictionTime        = flag.Duration("max-pod-eviction-time", 2*time.Minute, "Maximum time CA tries to evict a pod before giving up")
+	nodeGroupsFlag            = multiStringFlag(
 		"nodes",
 		"sets min,max size and other configuration data for a node group in a format accepted by cloud provider. Can be used multiple times. Format: <min>:<max>:<other...>")
 	nodeGroupAutoDiscoveryFlag = multiStringFlag(
@@ -244,6 +248,11 @@
 	forceDaemonSets                         = flag.Bool("force-ds", false, "Blocks scale-up of node groups too small for all suitable Daemon Sets pods.")
 	dynamicNodeDeleteDelayAfterTaintEnabled = flag.Bool("dynamic-node-delete-delay-after-taint-enabled", false, "Enables dynamic adjustment of NodeDeleteDelayAfterTaint based of the latency between CA and api-server")
 	bypassedSchedulers                      = pflag.StringSlice("bypassed-scheduler-names", []string{}, fmt.Sprintf("Names of schedulers to bypass. If set to non-empty value, CA will not wait for pods to reach a certain age before triggering a scale-up."))
+	drainPriorityConfig                     = flag.String("drain-priority-config", "",
+		"List of ',' separated pairs (priority:terminationGracePeriodSeconds) of integers separated by ':' which enables the priority evictor. The priority evictor groups pods into priority groups based on pod priority and evicts pods in ascending order of group priority. "+
+			"The --max-graceful-termination-sec flag should not be set when this flag is set. Not setting this flag will use the unordered evictor by default. "+
+			"The priority evictor reuses the concepts of drain logic in kubelet (https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2712-pod-priority-based-graceful-node-shutdown#migration-from-the-node-graceful-shutdown-feature). "+
+			"E.g. flag usage: '10000:20,1000:100,0:60'")
 )
 
 func isFlagPassed(name string) bool {
@@ -296,6 +305,18 @@ func createAutoscalingOptions() config.AutoscalingOptions {
 		klog.Fatalf("Failed to get scheduler config: %v", err)
 	}
 
+	if isFlagPassed("drain-priority-config") && isFlagPassed("max-graceful-termination-sec") {
+		klog.Fatalf("Invalid configuration, could not use --drain-priority-config together with --max-graceful-termination-sec")
+	}
+
+	var drainPriorityConfigMap []kubelet_config.ShutdownGracePeriodByPodPriority
+	if isFlagPassed("drain-priority-config") {
+		drainPriorityConfigMap = actuation.ParseShutdownGracePeriodsAndPriorities(*drainPriorityConfig)
+		if len(drainPriorityConfigMap) == 0 {
+			klog.Fatalf("Invalid configuration, parsing --drain-priority-config")
+		}
+	}
+
 	return config.AutoscalingOptions{
 		NodeGroupDefaults: config.NodeGroupAutoscalingOptions{
 			ScaleDownUtilizationThreshold: *scaleDownUtilizationThreshold,
@@ -338,6 +359,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
 		ScaleDownNonEmptyCandidatesCount: *scaleDownNonEmptyCandidatesCount,
 		ScaleDownCandidatesPoolRatio:     *scaleDownCandidatesPoolRatio,
 		ScaleDownCandidatesPoolMinCount:  *scaleDownCandidatesPoolMinCount,
+		DrainPriorityConfig:              drainPriorityConfigMap,
 		SchedulerConfig:                  parsedSchedConfig,
 		WriteStatusConfigMap:             *writeStatusConfigMapFlag,
 		StatusConfigMapName:              *statusConfigMapName,
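Putting the two hunks above together, a configuration sketch (illustrative only; the exact ordering of the parsed slice is an assumption here): passing --drain-priority-config="10000:20,1000:100,0:60" while leaving --max-graceful-termination-sec unset yields options roughly equivalent to constructing them directly.

opts := config.AutoscalingOptions{
	// Assumed parse result for "10000:20,1000:100,0:60"; other fields omitted for brevity.
	DrainPriorityConfig: []kubelet_config.ShutdownGracePeriodByPodPriority{
		{Priority: 10000, ShutdownGracePeriodSeconds: 20},
		{Priority: 1000, ShutdownGracePeriodSeconds: 100},
		{Priority: 0, ShutdownGracePeriodSeconds: 60},
	},
}
_ = opts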
diff --git a/cluster-autoscaler/metrics/metrics.go b/cluster-autoscaler/metrics/metrics.go
index dfa1d4f7daa7..30cee3e01329 100644
--- a/cluster-autoscaler/metrics/metrics.go
+++ b/cluster-autoscaler/metrics/metrics.go
@@ -44,6 +44,9 @@ type FunctionLabel string
 // NodeGroupType describes node group relation to CA
 type NodeGroupType string
 
+// PodEvictionResult describes the result of a pod eviction attempt
+type PodEvictionResult string
+
 const (
 	caNamespace = "cluster_autoscaler"
 	readyLabel  = "ready"
@@ -87,6 +90,10 @@ const (
 	// This is meant to help find unexpectedly long function execution times for
 	// debugging purposes.
 	LogLongDurationThreshold = 5 * time.Second
+	// PodEvictionSucceed means creation of the pod eviction object succeeded
+	PodEvictionSucceed PodEvictionResult = "succeed"
+	// PodEvictionFailed means creation of the pod eviction object failed
+	PodEvictionFailed PodEvictionResult = "failed"
 )
 
 // Names of Cluster Autoscaler operations
@@ -292,12 +299,12 @@
 		}, []string{"reason", "gpu_resource_name", "gpu_name"},
 	)
 
-	evictionsCount = k8smetrics.NewCounter(
+	evictionsCount = k8smetrics.NewCounterVec(
 		&k8smetrics.CounterOpts{
 			Namespace: caNamespace,
 			Name:      "evicted_pods_total",
 			Help:      "Number of pods evicted by CA",
-		},
+		}, []string{"eviction_result"},
 	)
 
 	unneededNodesCount = k8smetrics.NewGauge(
@@ -550,9 +557,9 @@ func RegisterScaleDown(nodesCount int, gpuResourceName, gpuType string, reason N
 	}
 }
 
-// RegisterEvictions records number of evicted pods
-func RegisterEvictions(podsCount int) {
-	evictionsCount.Add(float64(podsCount))
+// RegisterEvictions records the number of evicted pods, labeled by whether the eviction succeeded or failed
+func RegisterEvictions(podsCount int, result PodEvictionResult) {
+	evictionsCount.WithLabelValues(string(result)).Add(float64(podsCount))
 }
 
 // UpdateUnneededNodesCount records number of currently unneeded nodes
diff --git a/cluster-autoscaler/utils/test/test_utils.go b/cluster-autoscaler/utils/test/test_utils.go
index 1b76b24b8bfb..8d0fbf90d9ff 100644
--- a/cluster-autoscaler/utils/test/test_utils.go
+++ b/cluster-autoscaler/utils/test/test_utils.go
@@ -89,13 +89,18 @@ func AddSchedulerName(schedulerName string) func(*apiv1.Pod) {
 	}
 }
 
-// BuildDSTestPod creates a DaemonSet pod with cpu and memory.
-func BuildDSTestPod(name string, cpu int64, mem int64) *apiv1.Pod {
-
-	pod := BuildTestPod(name, cpu, mem)
-	pod.OwnerReferences = GenerateOwnerReferences("ds", "DaemonSet", "apps/v1", "some-uid")
+// WithDSController sets a DaemonSet owner reference on the pod.
+func WithDSController() func(*apiv1.Pod) {
+	return func(pod *apiv1.Pod) {
+		pod.OwnerReferences = GenerateOwnerReferences("ds", "DaemonSet", "apps/v1", "some-uid")
+	}
+}
 
-	return pod
+// WithNodeName sets the node name on the pod.
+func WithNodeName(nodeName string) func(*apiv1.Pod) {
+	return func(pod *apiv1.Pod) {
+		pod.Spec.NodeName = nodeName
+	}
 }
 
 // BuildTestPodWithEphemeralStorage creates a pod with cpu, memory and ephemeral storage resources.
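A usage sketch for the new pod options (assuming the variadic options compose, which the signatures above imply): the eligibility test's dsPod could also set its node name through an option instead of assigning Spec.NodeName afterwards.

dsPod := BuildTestPod("dsPod", 500, 0, WithDSController(), WithNodeName("regular"))
_ = dsPod // owned by a DaemonSet and placed on node "regular"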