diff --git a/cluster-autoscaler/clusterstate/clusterstate.go b/cluster-autoscaler/clusterstate/clusterstate.go index 57444a6536e6..abd1d05d6003 100644 --- a/cluster-autoscaler/clusterstate/clusterstate.go +++ b/cluster-autoscaler/clusterstate/clusterstate.go @@ -143,6 +143,18 @@ type ClusterStateRegistry struct { scaleUpFailures map[string][]ScaleUpFailure } +// NodeGroupHealth contains information about the health of the node group. +type NodeGroupHealth struct { + IsHealthy bool + ErrorInfo *cloudprovider.InstanceErrorInfo +} + +// NodeGroupScalingSafety contains information about the safety of the node group to scale up/down. +type NodeGroupScalingSafety struct { + SafeToScale bool + ErrorInfo *cloudprovider.InstanceErrorInfo +} + // NewClusterStateRegistry creates new ClusterStateRegistry. func NewClusterStateRegistry(cloudProvider cloudprovider.CloudProvider, config ClusterStateRegistryConfig, logRecorder *utils.LogEventRecorder, backoff backoff.Backoff, nodeGroupConfigProcessor nodegroupconfig.NodeGroupConfigProcessor) *ClusterStateRegistry { emptyStatus := &api.ClusterAutoscalerStatus{ @@ -276,7 +288,11 @@ func (csr *ClusterStateRegistry) updateScaleRequests(currentTime time.Time) { } else { gpuResource, gpuType = gpu.GetGpuInfoForMetrics(csr.cloudProvider.GetNodeGpuConfig(nodeInfo.Node()), availableGPUTypes, nodeInfo.Node(), scaleUpRequest.NodeGroup) } - csr.registerFailedScaleUpNoLock(scaleUpRequest.NodeGroup, metrics.Timeout, cloudprovider.OtherErrorClass, "timeout", gpuResource, gpuType, currentTime) + csr.registerFailedScaleUpNoLock(scaleUpRequest.NodeGroup, metrics.Timeout, cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OtherErrorClass, + ErrorCode: "timeout", + ErrorMessage: fmt.Sprintf("Scale-up timed out for node group %v after %v", nodeGroupName, currentTime.Sub(scaleUpRequest.Time)), + }, gpuResource, gpuType, currentTime) delete(csr.scaleUpRequests, nodeGroupName) } } @@ -291,25 +307,29 @@ func (csr *ClusterStateRegistry) updateScaleRequests(currentTime time.Time) { } // To be executed under a lock. -func (csr *ClusterStateRegistry) backoffNodeGroup(nodeGroup cloudprovider.NodeGroup, errorClass cloudprovider.InstanceErrorClass, errorCode string, currentTime time.Time) { +func (csr *ClusterStateRegistry) backoffNodeGroup(nodeGroup cloudprovider.NodeGroup, errorInfo cloudprovider.InstanceErrorInfo, currentTime time.Time) { nodeGroupInfo := csr.nodeInfosForGroups[nodeGroup.Id()] - backoffUntil := csr.backoff.Backoff(nodeGroup, nodeGroupInfo, errorClass, errorCode, currentTime) - klog.Warningf("Disabling scale-up for node group %v until %v; errorClass=%v; errorCode=%v", nodeGroup.Id(), backoffUntil, errorClass, errorCode) + backoffUntil := csr.backoff.Backoff(nodeGroup, nodeGroupInfo, errorInfo, currentTime) + klog.Warningf("Disabling scale-up for node group %v until %v; errorClass=%v; errorCode=%v", nodeGroup.Id(), backoffUntil, errorInfo.ErrorClass, errorInfo.ErrorCode) } // RegisterFailedScaleUp should be called after getting error from cloudprovider // when trying to scale-up node group. It will mark this group as not safe to autoscale // for some time. 
-func (csr *ClusterStateRegistry) RegisterFailedScaleUp(nodeGroup cloudprovider.NodeGroup, reason metrics.FailedScaleUpReason, gpuResourceName, gpuType string, currentTime time.Time) { +func (csr *ClusterStateRegistry) RegisterFailedScaleUp(nodeGroup cloudprovider.NodeGroup, reason metrics.FailedScaleUpReason, errorMessage, gpuResourceName, gpuType string, currentTime time.Time) { csr.Lock() defer csr.Unlock() - csr.registerFailedScaleUpNoLock(nodeGroup, reason, cloudprovider.OtherErrorClass, string(reason), gpuResourceName, gpuType, currentTime) + csr.registerFailedScaleUpNoLock(nodeGroup, reason, cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OtherErrorClass, + ErrorCode: string(reason), + ErrorMessage: errorMessage, + }, gpuResourceName, gpuType, currentTime) } -func (csr *ClusterStateRegistry) registerFailedScaleUpNoLock(nodeGroup cloudprovider.NodeGroup, reason metrics.FailedScaleUpReason, errorClass cloudprovider.InstanceErrorClass, errorCode string, gpuResourceName, gpuType string, currentTime time.Time) { +func (csr *ClusterStateRegistry) registerFailedScaleUpNoLock(nodeGroup cloudprovider.NodeGroup, reason metrics.FailedScaleUpReason, errorInfo cloudprovider.InstanceErrorInfo, gpuResourceName, gpuType string, currentTime time.Time) { csr.scaleUpFailures[nodeGroup.Id()] = append(csr.scaleUpFailures[nodeGroup.Id()], ScaleUpFailure{NodeGroup: nodeGroup, Reason: reason, Time: currentTime}) metrics.RegisterFailedScaleUp(reason, gpuResourceName, gpuType) - csr.backoffNodeGroup(nodeGroup, errorClass, errorCode, currentTime) + csr.backoffNodeGroup(nodeGroup, errorInfo, currentTime) } // UpdateNodes updates the state of the nodes in the ClusterStateRegistry and recalculates the stats @@ -390,22 +410,36 @@ func (csr *ClusterStateRegistry) IsClusterHealthy() bool { return true } -// IsNodeGroupHealthy returns true if the node group health is within the acceptable limits -func (csr *ClusterStateRegistry) IsNodeGroupHealthy(nodeGroupName string) bool { +// GetNodeGroupHealth returns information regarding node group health being within the acceptable limits +func (csr *ClusterStateRegistry) GetNodeGroupHealth(nodeGroupName string) NodeGroupHealth { acceptable, found := csr.acceptableRanges[nodeGroupName] if !found { klog.Warningf("Failed to find acceptable ranges for %v", nodeGroupName) - return false + return NodeGroupHealth{ + IsHealthy: false, + ErrorInfo: &cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OtherErrorClass, + ErrorCode: "NodeGroupUnhealthy", + ErrorMessage: "Failed to find acceptable ranges", + }, + } } readiness, found := csr.perNodeGroupReadiness[nodeGroupName] if !found { // No nodes but target == 0 or just scaling up. 
if acceptable.CurrentTarget == 0 || (acceptable.MinNodes == 0 && acceptable.CurrentTarget > 0) { - return true + return NodeGroupHealth{IsHealthy: true} } klog.Warningf("Failed to find readiness information for %v", nodeGroupName) - return false + return NodeGroupHealth{ + IsHealthy: false, + ErrorInfo: &cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OtherErrorClass, + ErrorCode: "NodeGroupUnhealthy", + ErrorMessage: "Failed to find readiness information", + }, + } } unjustifiedUnready := 0 @@ -418,10 +452,17 @@ func (csr *ClusterStateRegistry) IsNodeGroupHealthy(nodeGroupName string) bool { if unjustifiedUnready > csr.config.OkTotalUnreadyCount && float64(unjustifiedUnready) > csr.config.MaxTotalUnreadyPercentage/100.0* float64(len(readiness.Ready)+len(readiness.Unready)+len(readiness.NotStarted)) { - return false + return NodeGroupHealth{ + IsHealthy: false, + ErrorInfo: &cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OtherErrorClass, + ErrorCode: "NodeGroupUnhealthy", + ErrorMessage: "Too many unready nodes", + }, + } } - return true + return NodeGroupHealth{IsHealthy: true} } // updateNodeGroupMetrics looks at NodeGroups provided by cloudprovider and updates corresponding metrics @@ -441,12 +482,14 @@ func (csr *ClusterStateRegistry) updateNodeGroupMetrics() { metrics.UpdateNodeGroupsCount(autoscaled, autoprovisioned) } -// IsNodeGroupSafeToScaleUp returns true if node group can be scaled up now. -func (csr *ClusterStateRegistry) IsNodeGroupSafeToScaleUp(nodeGroup cloudprovider.NodeGroup, now time.Time) bool { - if !csr.IsNodeGroupHealthy(nodeGroup.Id()) { - return false +// IsNodeGroupSafeToScaleUp returns information about node group safety to be scaled up now. +func (csr *ClusterStateRegistry) IsNodeGroupSafeToScaleUp(nodeGroup cloudprovider.NodeGroup, now time.Time) NodeGroupScalingSafety { + nodeGroupStatus := csr.GetNodeGroupHealth(nodeGroup.Id()) + if !nodeGroupStatus.IsHealthy { + return NodeGroupScalingSafety{false, nodeGroupStatus.ErrorInfo} } - return !csr.backoff.IsBackedOff(nodeGroup, csr.nodeInfosForGroups[nodeGroup.Id()], now) + backoffStatus := csr.backoff.GetBackoffStatus(nodeGroup, csr.nodeInfosForGroups[nodeGroup.Id()], now) + return NodeGroupScalingSafety{!backoffStatus.IsBackoff, backoffStatus.ErrorInfo} } func (csr *ClusterStateRegistry) getProvisionedAndTargetSizesForNodeGroup(nodeGroupName string) (provisioned, target int, ok bool) { @@ -756,7 +799,7 @@ func (csr *ClusterStateRegistry) GetStatus(now time.Time) *api.ClusterAutoscaler // Health. nodeGroupStatus.Conditions = append(nodeGroupStatus.Conditions, buildHealthStatusNodeGroup( - csr.IsNodeGroupHealthy(nodeGroup.Id()), readiness, acceptable, nodeGroup.MinSize(), nodeGroup.MaxSize())) + csr.GetNodeGroupHealth(nodeGroup.Id()).IsHealthy, readiness, acceptable, nodeGroup.MinSize(), nodeGroup.MaxSize())) // Scale up. 
nodeGroupStatus.Conditions = append(nodeGroupStatus.Conditions, buildScaleUpStatusNodeGroup( @@ -811,7 +854,7 @@ func buildHealthStatusNodeGroup(isReady bool, readiness Readiness, acceptable Ac return condition } -func buildScaleUpStatusNodeGroup(isScaleUpInProgress bool, isSafeToScaleUp bool, readiness Readiness, acceptable AcceptableRange) api.ClusterAutoscalerCondition { +func buildScaleUpStatusNodeGroup(isScaleUpInProgress bool, scaleUpSafety NodeGroupScalingSafety, readiness Readiness, acceptable AcceptableRange) api.ClusterAutoscalerCondition { condition := api.ClusterAutoscalerCondition{ Type: api.ClusterAutoscalerScaleUp, Message: fmt.Sprintf("ready=%d cloudProviderTarget=%d", @@ -821,7 +864,7 @@ func buildScaleUpStatusNodeGroup(isScaleUpInProgress bool, isSafeToScaleUp bool, } if isScaleUpInProgress { condition.Status = api.ClusterAutoscalerInProgress - } else if !isSafeToScaleUp { + } else if !scaleUpSafety.SafeToScale { condition.Status = api.ClusterAutoscalerBackoff } else { condition.Status = api.ClusterAutoscalerNoActivity @@ -1123,7 +1166,12 @@ func (csr *ClusterStateRegistry) handleInstanceCreationErrorsForNodeGroup( } // Decrease the scale up request by the number of deleted nodes csr.registerOrUpdateScaleUpNoLock(nodeGroup, -len(unseenInstanceIds), currentTime) - csr.registerFailedScaleUpNoLock(nodeGroup, metrics.FailedScaleUpReason(errorCode.code), errorCode.class, errorCode.code, gpuResource, gpuType, currentTime) + + csr.registerFailedScaleUpNoLock(nodeGroup, metrics.FailedScaleUpReason(errorCode.code), cloudprovider.InstanceErrorInfo{ + ErrorClass: errorCode.class, + ErrorCode: errorCode.code, + ErrorMessage: csr.buildErrorMessageEventString(currentUniqueErrorMessagesForErrorCode[errorCode]), + }, gpuResource, gpuType, currentTime) } } } diff --git a/cluster-autoscaler/clusterstate/clusterstate_test.go b/cluster-autoscaler/clusterstate/clusterstate_test.go index 6a74f69617fd..d6a48961d90e 100644 --- a/cluster-autoscaler/clusterstate/clusterstate_test.go +++ b/cluster-autoscaler/clusterstate/clusterstate_test.go @@ -120,7 +120,7 @@ func TestEmptyOK(t *testing.T) { assert.NoError(t, err) assert.True(t, clusterstate.IsClusterHealthy()) assert.Empty(t, clusterstate.GetScaleUpFailures()) - assert.True(t, clusterstate.IsNodeGroupHealthy("ng1")) + assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{IsHealthy: true}) assert.False(t, clusterstate.IsNodeGroupScalingUp("ng1")) assert.False(t, clusterstate.HasNodeGroupStartedScaleUp("ng1")) @@ -132,7 +132,7 @@ func TestEmptyOK(t *testing.T) { assert.NoError(t, err) assert.True(t, clusterstate.IsClusterHealthy()) - assert.True(t, clusterstate.IsNodeGroupHealthy("ng1")) + assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{IsHealthy: true}) assert.True(t, clusterstate.IsNodeGroupScalingUp("ng1")) assert.True(t, clusterstate.HasNodeGroupStartedScaleUp("ng1")) } @@ -203,7 +203,7 @@ func TestOKOneUnreadyNode(t *testing.T) { assert.NoError(t, err) assert.True(t, clusterstate.IsClusterHealthy()) assert.Empty(t, clusterstate.GetScaleUpFailures()) - assert.True(t, clusterstate.IsNodeGroupHealthy("ng1")) + assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{IsHealthy: true}) status := clusterstate.GetStatus(now) assert.Equal(t, api.ClusterAutoscalerHealthy, @@ -270,7 +270,7 @@ func TestOKOneUnreadyNodeWithScaleDownCandidate(t *testing.T) { assert.NoError(t, err) assert.True(t, clusterstate.IsClusterHealthy()) assert.Empty(t, clusterstate.GetScaleUpFailures()) - assert.True(t, 
clusterstate.IsNodeGroupHealthy("ng1")) + assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{IsHealthy: true}) status := clusterstate.GetStatus(now) assert.Equal(t, api.ClusterAutoscalerHealthy, @@ -333,7 +333,13 @@ func TestMissingNodes(t *testing.T) { assert.NoError(t, err) assert.True(t, clusterstate.IsClusterHealthy()) assert.Empty(t, clusterstate.GetScaleUpFailures()) - assert.False(t, clusterstate.IsNodeGroupHealthy("ng1")) + assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{ + IsHealthy: false, + ErrorInfo: &cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OtherErrorClass, + ErrorCode: "NodeGroupUnhealthy", + ErrorMessage: "Too many unready nodes", + }}) status := clusterstate.GetStatus(now) assert.Equal(t, api.ClusterAutoscalerHealthy, @@ -375,7 +381,7 @@ func TestTooManyUnready(t *testing.T) { assert.NoError(t, err) assert.False(t, clusterstate.IsClusterHealthy()) assert.Empty(t, clusterstate.GetScaleUpFailures()) - assert.True(t, clusterstate.IsNodeGroupHealthy("ng1")) + assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{IsHealthy: true}) } func TestUnreadyLongAfterCreation(t *testing.T) { @@ -473,7 +479,13 @@ func TestExpiredScaleUp(t *testing.T) { err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now) assert.NoError(t, err) assert.True(t, clusterstate.IsClusterHealthy()) - assert.False(t, clusterstate.IsNodeGroupHealthy("ng1")) + assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{ + IsHealthy: false, + ErrorInfo: &cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OtherErrorClass, + ErrorCode: "NodeGroupUnhealthy", + ErrorMessage: "Too many unready nodes", + }}) assert.Equal(t, clusterstate.GetScaleUpFailures(), map[string][]ScaleUpFailure{ "ng1": { {NodeGroup: provider.GetNodeGroup("ng1"), Time: now, Reason: metrics.Timeout}, @@ -922,14 +934,27 @@ func TestScaleUpBackoff(t *testing.T) { err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng1_2, ng1_3}, nil, now) assert.NoError(t, err) assert.True(t, clusterstate.IsClusterHealthy()) - assert.True(t, clusterstate.IsNodeGroupHealthy("ng1")) - assert.False(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now)) + assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{IsHealthy: true}) + assert.Equal(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now), NodeGroupScalingSafety{ + SafeToScale: false, + ErrorInfo: &cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OtherErrorClass, + ErrorCode: "timeout", + ErrorMessage: "Scale-up timed out for node group ng1 after 3m0s", + }}) + assert.Equal(t, clusterstate.backoff.GetBackoffStatus(ng1, nil, now), backoff.BackoffStatus{ + IsBackoff: true, + ErrorInfo: &cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OtherErrorClass, + ErrorCode: "timeout", + ErrorMessage: "Scale-up timed out for node group ng1 after 3m0s", + }}) // Backoff should expire after timeout now = now.Add(5 * time.Minute /*InitialNodeGroupBackoffDuration*/).Add(time.Second) assert.True(t, clusterstate.IsClusterHealthy()) - assert.True(t, clusterstate.IsNodeGroupHealthy("ng1")) - assert.True(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now)) + assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{IsHealthy: true}) + assert.Equal(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now), NodeGroupScalingSafety{SafeToScale: true}) // Another failed scale up should cause longer backoff clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 1, 
now.Add(-121*time.Second)) @@ -937,11 +962,23 @@ func TestScaleUpBackoff(t *testing.T) { err = clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng1_2, ng1_3}, nil, now) assert.NoError(t, err) assert.True(t, clusterstate.IsClusterHealthy()) - assert.True(t, clusterstate.IsNodeGroupHealthy("ng1")) - assert.False(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now)) + assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{IsHealthy: true}) + assert.Equal(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now), NodeGroupScalingSafety{ + SafeToScale: false, + ErrorInfo: &cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OtherErrorClass, + ErrorCode: "timeout", + ErrorMessage: "Scale-up timed out for node group ng1 after 2m1s", + }}) now = now.Add(5 * time.Minute /*InitialNodeGroupBackoffDuration*/).Add(time.Second) - assert.False(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now)) + assert.Equal(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now), NodeGroupScalingSafety{ + SafeToScale: false, + ErrorInfo: &cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OtherErrorClass, + ErrorCode: "timeout", + ErrorMessage: "Scale-up timed out for node group ng1 after 2m1s", + }}) // The backoff should be cleared after a successful scale-up clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 1, now) @@ -951,9 +988,9 @@ func TestScaleUpBackoff(t *testing.T) { err = clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng1_2, ng1_3, ng1_4}, nil, now) assert.NoError(t, err) assert.True(t, clusterstate.IsClusterHealthy()) - assert.True(t, clusterstate.IsNodeGroupHealthy("ng1")) - assert.True(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now)) - assert.False(t, clusterstate.backoff.IsBackedOff(ng1, nil, now)) + assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{IsHealthy: true}) + assert.Equal(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now), NodeGroupScalingSafety{SafeToScale: true}) + assert.Equal(t, clusterstate.backoff.GetBackoffStatus(ng1, nil, now), backoff.BackoffStatus{IsBackoff: false}) } func TestGetClusterSize(t *testing.T) { @@ -1070,9 +1107,9 @@ func TestScaleUpFailures(t *testing.T) { fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "my-cool-configmap") clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{}, fakeLogRecorder, newBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute})) - clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng1"), metrics.Timeout, "", "", now) - clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng2"), metrics.Timeout, "", "", now) - clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng1"), metrics.APIError, "", "", now.Add(time.Minute)) + clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng1"), metrics.Timeout, "", "", "", now) + clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng2"), metrics.Timeout, "", "", "", now) + clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng1"), metrics.APIError, "", "", "", now.Add(time.Minute)) failures := clusterstate.GetScaleUpFailures() assert.Equal(t, map[string][]ScaleUpFailure{ diff --git a/cluster-autoscaler/core/scaleup/orchestrator/executor.go b/cluster-autoscaler/core/scaleup/orchestrator/executor.go index 8f787e6cecce..ea433b2a5d56 100644 --- a/cluster-autoscaler/core/scaleup/orchestrator/executor.go +++ 
b/cluster-autoscaler/core/scaleup/orchestrator/executor.go
@@ -151,7 +151,7 @@ func (e *scaleUpExecutor) executeScaleUp(
 	if err := info.Group.IncreaseSize(increase); err != nil {
 		e.autoscalingContext.LogRecorder.Eventf(apiv1.EventTypeWarning, "FailedToScaleUpGroup", "Scale-up failed for group %s: %v", info.Group.Id(), err)
 		aerr := errors.ToAutoscalerError(errors.CloudProviderError, err).AddPrefix("failed to increase node group size: ")
-		e.clusterStateRegistry.RegisterFailedScaleUp(info.Group, metrics.FailedScaleUpReason(string(aerr.Type())), gpuResourceName, gpuType, now)
+		e.clusterStateRegistry.RegisterFailedScaleUp(info.Group, metrics.FailedScaleUpReason(string(aerr.Type())), aerr.Error(), gpuResourceName, gpuType, now)
 		return aerr
 	}
 	e.clusterStateRegistry.RegisterOrUpdateScaleUp(
diff --git a/cluster-autoscaler/core/scaleup/orchestrator/orchestrator.go b/cluster-autoscaler/core/scaleup/orchestrator/orchestrator.go
index 7b07e7652a55..1c775e9d4120 100644
--- a/cluster-autoscaler/core/scaleup/orchestrator/orchestrator.go
+++ b/cluster-autoscaler/core/scaleup/orchestrator/orchestrator.go
@@ -569,9 +569,9 @@ func (o *ScaleUpOrchestrator) UpcomingNodes(nodeInfos map[string]*schedulerframe
 // IsNodeGroupReadyToScaleUp returns nil if node group is ready to be scaled up, otherwise a reason is provided.
 func (o *ScaleUpOrchestrator) IsNodeGroupReadyToScaleUp(nodeGroup cloudprovider.NodeGroup, now time.Time) *SkippedReasons {
 	// Non-existing node groups are created later so skip check for them.
-	if nodeGroup.Exist() && !o.clusterStateRegistry.IsNodeGroupSafeToScaleUp(nodeGroup, now) {
+	if nodeGroup.Exist() && !o.clusterStateRegistry.IsNodeGroupSafeToScaleUp(nodeGroup, now).SafeToScale {
 		// Hack that depends on internals of IsNodeGroupSafeToScaleUp.
-		if !o.clusterStateRegistry.IsNodeGroupHealthy(nodeGroup.Id()) {
+		if !o.clusterStateRegistry.GetNodeGroupHealth(nodeGroup.Id()).IsHealthy {
 			klog.Warningf("Node group %s is not ready for scaleup - unhealthy", nodeGroup.Id())
 			return NotReadyReason
 		}
@@ -658,7 +658,7 @@ func (o *ScaleUpOrchestrator) ComputeSimilarNodeGroups(
 	var validSimilarNodeGroups []cloudprovider.NodeGroup
 	for _, ng := range similarNodeGroups {
 		// Non-existing node groups are created later so skip check for them.
-		if ng.Exist() && !o.clusterStateRegistry.IsNodeGroupSafeToScaleUp(ng, now) {
+		if ng.Exist() && !o.clusterStateRegistry.IsNodeGroupSafeToScaleUp(ng, now).SafeToScale {
 			klog.V(2).Infof("Ignoring node group %s when balancing: group is not ready for scaleup", ng.Id())
 		} else if similarSchedulablePods, found := schedulablePods[ng.Id()]; found && matchingSchedulablePods(groupSchedulablePods, similarSchedulablePods) {
 			validSimilarNodeGroups = append(validSimilarNodeGroups, ng)
diff --git a/cluster-autoscaler/utils/backoff/backoff.go b/cluster-autoscaler/utils/backoff/backoff.go
index d42df155eabf..62fdb09a4d6a 100644
--- a/cluster-autoscaler/utils/backoff/backoff.go
+++ b/cluster-autoscaler/utils/backoff/backoff.go
@@ -23,12 +23,18 @@ import (
 	schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
 )
 
+// BackoffStatus contains information about the backoff status of a node group.
+type BackoffStatus struct {
+	IsBackoff bool
+	ErrorInfo *cloudprovider.InstanceErrorInfo
+}
+
 // Backoff allows time-based backing off of node groups considered in scale up algorithm
 type Backoff interface {
 	// Backoff execution for the given node group. Returns time till execution is backed off.
- Backoff(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo, errorClass cloudprovider.InstanceErrorClass, errorCode string, currentTime time.Time) time.Time - // IsBackedOff returns true if execution is backed off for the given node group. - IsBackedOff(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo, currentTime time.Time) bool + Backoff(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo, errorInfo cloudprovider.InstanceErrorInfo, currentTime time.Time) time.Time + // GetBackoffStatus returns whether the execution is backed off for the given node group and error info when the node group is backed off. + GetBackoffStatus(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo, currentTime time.Time) BackoffStatus // RemoveBackoff removes backoff data for the given node group. RemoveBackoff(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo) // RemoveStaleBackoffData removes stale backoff data. diff --git a/cluster-autoscaler/utils/backoff/exponential_backoff.go b/cluster-autoscaler/utils/backoff/exponential_backoff.go index 013273bc0587..cdcdef0eb716 100644 --- a/cluster-autoscaler/utils/backoff/exponential_backoff.go +++ b/cluster-autoscaler/utils/backoff/exponential_backoff.go @@ -37,6 +37,7 @@ type exponentialBackoffInfo struct { duration time.Duration backoffUntil time.Time lastFailedExecution time.Time + errorInfo cloudprovider.InstanceErrorInfo } // NewExponentialBackoff creates an instance of exponential backoff. @@ -66,7 +67,7 @@ func NewIdBasedExponentialBackoff(initialBackoffDuration time.Duration, maxBacko } // Backoff execution for the given node group. Returns time till execution is backed off. -func (b *exponentialBackoff) Backoff(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo, errorClass cloudprovider.InstanceErrorClass, errorCode string, currentTime time.Time) time.Time { +func (b *exponentialBackoff) Backoff(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo, errorInfo cloudprovider.InstanceErrorInfo, currentTime time.Time) time.Time { duration := b.initialBackoffDuration key := b.nodeGroupKey(nodeGroup) if backoffInfo, found := b.backoffInfo[key]; found { @@ -87,14 +88,21 @@ func (b *exponentialBackoff) Backoff(nodeGroup cloudprovider.NodeGroup, nodeInfo duration: duration, backoffUntil: backoffUntil, lastFailedExecution: currentTime, + errorInfo: errorInfo, } return backoffUntil } -// IsBackedOff returns true if execution is backed off for the given node group. -func (b *exponentialBackoff) IsBackedOff(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo, currentTime time.Time) bool { +// GetBackoffStatus returns whether the execution is backed off for the given node group and error info when the node group is backed off. +func (b *exponentialBackoff) GetBackoffStatus(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo, currentTime time.Time) BackoffStatus { backoffInfo, found := b.backoffInfo[b.nodeGroupKey(nodeGroup)] - return found && backoffInfo.backoffUntil.After(currentTime) + if !found || !backoffInfo.backoffUntil.After(currentTime) { + return BackoffStatus{IsBackoff: false} + } + return BackoffStatus{ + IsBackoff: true, + ErrorInfo: &backoffInfo.errorInfo, + } } // RemoveBackoff removes backoff data for the given node group. 
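Taken together, the backoff.go and exponential_backoff.go changes replace the boolean IsBackedOff check with a status object that also carries the error that caused the backoff. The sketch below is a minimal, self-contained illustration of that pattern; the type and field names are simplified stand-ins, not the autoscaler's real cloudprovider or backoff packages.

```go
package main

import (
	"fmt"
	"time"
)

// Simplified stand-ins for the real cloudprovider/backoff types.
type InstanceErrorInfo struct {
	ErrorClass   string
	ErrorCode    string
	ErrorMessage string
}

type BackoffStatus struct {
	IsBackoff bool
	ErrorInfo *InstanceErrorInfo
}

type backoffEntry struct {
	backoffUntil time.Time
	errorInfo    InstanceErrorInfo
}

type backoffStore struct {
	entries map[string]backoffEntry
}

func newBackoffStore() *backoffStore {
	return &backoffStore{entries: map[string]backoffEntry{}}
}

// Backoff records the error that triggered the backoff alongside the deadline,
// mirroring how exponentialBackoffInfo now keeps errorInfo.
func (b *backoffStore) Backoff(nodeGroupID string, errorInfo InstanceErrorInfo, until time.Time) {
	b.entries[nodeGroupID] = backoffEntry{backoffUntil: until, errorInfo: errorInfo}
}

// GetBackoffStatus replaces a plain boolean check: callers learn both whether
// the group is backed off and why.
func (b *backoffStore) GetBackoffStatus(nodeGroupID string, now time.Time) BackoffStatus {
	entry, found := b.entries[nodeGroupID]
	if !found || !entry.backoffUntil.After(now) {
		return BackoffStatus{IsBackoff: false}
	}
	return BackoffStatus{IsBackoff: true, ErrorInfo: &entry.errorInfo}
}

func main() {
	store := newBackoffStore()
	now := time.Now()
	store.Backoff("ng1", InstanceErrorInfo{
		ErrorClass:   "OutOfResourcesErrorClass",
		ErrorCode:    "QUOTA_EXCEEDED",
		ErrorMessage: "Not enough CPU",
	}, now.Add(5*time.Minute))

	if status := store.GetBackoffStatus("ng1", now); status.IsBackoff {
		fmt.Printf("ng1 backed off: %s (%s)\n", status.ErrorInfo.ErrorCode, status.ErrorInfo.ErrorMessage)
	}
}
```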
diff --git a/cluster-autoscaler/utils/backoff/exponential_backoff_test.go b/cluster-autoscaler/utils/backoff/exponential_backoff_test.go
index 2fd9d8319685..e948718dffeb 100644
--- a/cluster-autoscaler/utils/backoff/exponential_backoff_test.go
+++ b/cluster-autoscaler/utils/backoff/exponential_backoff_test.go
@@ -35,45 +35,58 @@ func nodeGroup(id string) cloudprovider.NodeGroup {
 var nodeGroup1 = nodeGroup("id1")
 var nodeGroup2 = nodeGroup("id2")
 
+var quotaError = cloudprovider.InstanceErrorInfo{ErrorClass: cloudprovider.OutOfResourcesErrorClass, ErrorCode: "QUOTA_EXCEEDED", ErrorMessage: "Not enough CPU"}
+var ipSpaceExhaustedError = cloudprovider.InstanceErrorInfo{ErrorClass: cloudprovider.OtherErrorClass, ErrorCode: "IP_SPACE_EXHAUSTED", ErrorMessage: "IP space has been exhausted"}
+
+var noBackOff = BackoffStatus{IsBackoff: false}
+var backoffWithQuotaError = BackoffStatus{
+	IsBackoff: true,
+	ErrorInfo: &quotaError,
+}
+var backoffWithIpSpaceExhaustedError = BackoffStatus{
+	IsBackoff: true,
+	ErrorInfo: &ipSpaceExhaustedError,
+}
+
 func TestBackoffTwoKeys(t *testing.T) {
 	backoff := NewIdBasedExponentialBackoff(10*time.Minute, time.Hour, 3*time.Hour)
 	startTime := time.Now()
-	assert.False(t, backoff.IsBackedOff(nodeGroup1, nil, startTime))
-	assert.False(t, backoff.IsBackedOff(nodeGroup2, nil, startTime))
-	backoff.Backoff(nodeGroup1, nil, cloudprovider.OtherErrorClass, "", startTime.Add(time.Minute))
-	assert.True(t, backoff.IsBackedOff(nodeGroup1, nil, startTime.Add(2*time.Minute)))
-	assert.False(t, backoff.IsBackedOff(nodeGroup2, nil, startTime))
-	assert.False(t, backoff.IsBackedOff(nodeGroup1, nil, startTime.Add(11*time.Minute)))
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, startTime), noBackOff)
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup2, nil, startTime), noBackOff)
+	backoff.Backoff(nodeGroup1, nil, quotaError, startTime.Add(time.Minute))
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, startTime.Add(2*time.Minute)), backoffWithQuotaError)
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup2, nil, startTime), noBackOff)
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, startTime.Add(11*time.Minute)), noBackOff)
 }
 
 func TestMaxBackoff(t *testing.T) {
 	backoff := NewIdBasedExponentialBackoff(1*time.Minute, 3*time.Minute, 3*time.Hour)
 	startTime := time.Now()
-	backoff.Backoff(nodeGroup1, nil, cloudprovider.OtherErrorClass, "", startTime)
-	assert.True(t, backoff.IsBackedOff(nodeGroup1, nil, startTime))
-	assert.False(t, backoff.IsBackedOff(nodeGroup1, nil, startTime.Add(1*time.Minute)))
-	backoff.Backoff(nodeGroup1, nil, cloudprovider.OtherErrorClass, "", startTime.Add(1*time.Minute))
-	assert.True(t, backoff.IsBackedOff(nodeGroup1, nil, startTime.Add(1*time.Minute)))
-	assert.False(t, backoff.IsBackedOff(nodeGroup1, nil, startTime.Add(3*time.Minute)))
-	backoff.Backoff(nodeGroup1, nil, cloudprovider.OtherErrorClass, "", startTime.Add(3*time.Minute))
-	assert.True(t, backoff.IsBackedOff(nodeGroup1, nil, startTime.Add(3*time.Minute)))
-	assert.False(t, backoff.IsBackedOff(nodeGroup1, nil, startTime.Add(6*time.Minute)))
+	backoff.Backoff(nodeGroup1, nil, ipSpaceExhaustedError, startTime)
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, startTime), backoffWithIpSpaceExhaustedError)
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, startTime.Add(1*time.Minute)), noBackOff)
+	backoff.Backoff(nodeGroup1, nil, ipSpaceExhaustedError, startTime.Add(1*time.Minute))
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, startTime.Add(1*time.Minute)), backoffWithIpSpaceExhaustedError)
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, startTime.Add(3*time.Minute)), noBackOff)
+	backoff.Backoff(nodeGroup1, nil, ipSpaceExhaustedError, startTime.Add(3*time.Minute))
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, startTime.Add(3*time.Minute)), backoffWithIpSpaceExhaustedError)
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, startTime.Add(6*time.Minute)), noBackOff)
 }
 
 func TestRemoveBackoff(t *testing.T) {
 	backoff := NewIdBasedExponentialBackoff(1*time.Minute, 3*time.Minute, 3*time.Hour)
 	startTime := time.Now()
-	backoff.Backoff(nodeGroup1, nil, cloudprovider.OtherErrorClass, "", startTime)
-	assert.True(t, backoff.IsBackedOff(nodeGroup1, nil, startTime))
+	backoff.Backoff(nodeGroup1, nil, quotaError, startTime)
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, startTime), backoffWithQuotaError)
 	backoff.RemoveBackoff(nodeGroup1, nil)
-	assert.False(t, backoff.IsBackedOff(nodeGroup1, nil, startTime))
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, startTime), noBackOff)
 }
 
 func TestResetStaleBackoffData(t *testing.T) {
 	backoff := NewIdBasedExponentialBackoff(1*time.Minute, 3*time.Minute, 3*time.Hour)
 	startTime := time.Now()
-	backoff.Backoff(nodeGroup1, nil, cloudprovider.OtherErrorClass, "", startTime)
-	backoff.Backoff(nodeGroup2, nil, cloudprovider.OtherErrorClass, "", startTime.Add(time.Hour))
+	backoff.Backoff(nodeGroup1, nil, quotaError, startTime)
+	backoff.Backoff(nodeGroup2, nil, quotaError, startTime.Add(time.Hour))
 	backoff.RemoveStaleBackoffData(startTime.Add(time.Hour))
 	assert.Equal(t, 2, len(backoff.(*exponentialBackoff).backoffInfo))
 	backoff.RemoveStaleBackoffData(startTime.Add(4 * time.Hour))
@@ -84,20 +97,22 @@ func TestResetStaleBackoffData(t *testing.T) {
 
 func TestIncreaseExistingBackoff(t *testing.T) {
 	backoff := NewIdBasedExponentialBackoff(1*time.Second, 10*time.Minute, 3*time.Hour)
-	startTime := time.Now()
-	backoff.Backoff(nodeGroup1, nil, cloudprovider.OtherErrorClass, "", startTime)
+	currentTime := time.Now()
+	backoff.Backoff(nodeGroup1, nil, quotaError, currentTime)
 	// NG in backoff for one second here
-	assert.True(t, backoff.IsBackedOff(nodeGroup1, nil, startTime))
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, currentTime), backoffWithQuotaError)
 	// Come out of backoff
-	time.Sleep(1 * time.Second)
-	assert.False(t, backoff.IsBackedOff(nodeGroup1, nil, time.Now()))
-	// Confirm existing backoff duration has been increased by backing off again
-	backoff.Backoff(nodeGroup1, nil, cloudprovider.OtherErrorClass, "", time.Now())
+	currentTime = currentTime.Add(1 * time.Second)
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, currentTime), noBackOff)
+	// Confirm that backing off again increases the existing backoff duration and updates the error info
+	backoff.Backoff(nodeGroup1, nil, ipSpaceExhaustedError, currentTime)
 	// Backoff should be for 2 seconds now
-	assert.True(t, backoff.IsBackedOff(nodeGroup1, nil, time.Now()))
-	time.Sleep(1 * time.Second)
-	assert.True(t, backoff.IsBackedOff(nodeGroup1, nil, time.Now()))
-	time.Sleep(1 * time.Second)
-	assert.False(t, backoff.IsBackedOff(nodeGroup1, nil, time.Now()))
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, currentTime), backoffWithIpSpaceExhaustedError)
+	currentTime = currentTime.Add(1 * time.Second)
+	// Backing off during an existing backoff should update the error info but not change the duration.
+	backoff.Backoff(nodeGroup1, nil, quotaError, currentTime)
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, currentTime), backoffWithQuotaError)
+	currentTime = currentTime.Add(1 * time.Second)
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, currentTime), noBackOff)
 	// Result: existing backoff duration was scaled up beyond initial duration
 }
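For context, here is a hedged sketch of how a caller such as the scale-up orchestrator can consume the new struct-valued results instead of bare booleans. The helper function and the simplified local types below are illustrative stand-ins only, not the actual ClusterStateRegistry API.

```go
package main

import "fmt"

// Simplified stand-ins mirroring the shapes introduced in this change.
type InstanceErrorInfo struct {
	ErrorClass   string
	ErrorCode    string
	ErrorMessage string
}

type NodeGroupHealth struct {
	IsHealthy bool
	ErrorInfo *InstanceErrorInfo
}

type NodeGroupScalingSafety struct {
	SafeToScale bool
	ErrorInfo   *InstanceErrorInfo
}

// checkScaleUp mirrors the shape of IsNodeGroupReadyToScaleUp after the change:
// instead of two booleans, the caller gets structured reasons it can log or emit.
func checkScaleUp(health NodeGroupHealth, safety NodeGroupScalingSafety, nodeGroupID string) {
	switch {
	case !health.IsHealthy:
		fmt.Printf("node group %s not ready for scale-up: unhealthy (%s)\n", nodeGroupID, health.ErrorInfo.ErrorMessage)
	case !safety.SafeToScale:
		fmt.Printf("node group %s backed off: %s: %s\n", nodeGroupID, safety.ErrorInfo.ErrorCode, safety.ErrorInfo.ErrorMessage)
	default:
		fmt.Printf("node group %s is ready for scale-up\n", nodeGroupID)
	}
}

func main() {
	backedOff := NodeGroupScalingSafety{
		SafeToScale: false,
		ErrorInfo: &InstanceErrorInfo{
			ErrorClass:   "OtherErrorClass",
			ErrorCode:    "timeout",
			ErrorMessage: "Scale-up timed out for node group ng1 after 3m0s",
		},
	}
	checkScaleUp(NodeGroupHealth{IsHealthy: true}, backedOff, "ng1")
}
```

Returning structured reasons this way lets the registry surface the original cloud-provider error (quota exceeded, IP space exhausted, timeout) in status conditions and events instead of only a bare backoff flag.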