Add error details to autoscaling backoff.
walidghallab committed Nov 30, 2023
1 parent 85b6058 commit ec92182
Showing 7 changed files with 201 additions and 87 deletions.
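At a high level, this commit changes the health and backoff checks in clusterstate.go so that they return structured results carrying a cloudprovider.InstanceErrorInfo (error class, code, and message) instead of bare booleans, and threads the original error text through to the backoff machinery. The self-contained Go sketch below only illustrates that shape: the struct and field names mirror the ones added in the diff, but the string-typed ErrorClass, the checkScaleUp helper, and main are hypothetical and not part of the commit.

```go
package main

import "fmt"

// Simplified mirror of cloudprovider.InstanceErrorInfo; in the real code
// ErrorClass is a cloudprovider.InstanceErrorClass, not a string.
type InstanceErrorInfo struct {
	ErrorClass   string
	ErrorCode    string
	ErrorMessage string
}

// Simplified mirror of the NodeGroupScalingSafety type added by this commit.
type NodeGroupScalingSafety struct {
	SafeToScale bool
	ErrorInfo   *InstanceErrorInfo
}

// checkScaleUp is a hypothetical caller showing how the structured result can
// be surfaced in logs or events instead of a bare "not safe" boolean.
func checkScaleUp(nodeGroup string, safety NodeGroupScalingSafety) {
	if safety.SafeToScale {
		fmt.Printf("scale-up of %s is allowed\n", nodeGroup)
		return
	}
	fmt.Printf("scale-up of %s blocked: class=%s code=%s msg=%q\n",
		nodeGroup, safety.ErrorInfo.ErrorClass, safety.ErrorInfo.ErrorCode, safety.ErrorInfo.ErrorMessage)
}

func main() {
	checkScaleUp("ng1", NodeGroupScalingSafety{
		SafeToScale: false,
		ErrorInfo: &InstanceErrorInfo{
			ErrorClass:   "OtherErrorClass",
			ErrorCode:    "timeout",
			ErrorMessage: "Scale-up timed out for node group ng1 after 3m0s",
		},
	})
}
```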
96 changes: 72 additions & 24 deletions cluster-autoscaler/clusterstate/clusterstate.go
@@ -143,6 +143,18 @@ type ClusterStateRegistry struct {
scaleUpFailures map[string][]ScaleUpFailure
}

// NodeGroupHealth contains information about the health of the node group.
type NodeGroupHealth struct {
IsHealthy bool
ErrorInfo *cloudprovider.InstanceErrorInfo
}

// NodeGroupScalingSafety contains information about whether it is safe to scale the node group up or down.
type NodeGroupScalingSafety struct {
SafeToScale bool
ErrorInfo *cloudprovider.InstanceErrorInfo
}

// NewClusterStateRegistry creates new ClusterStateRegistry.
func NewClusterStateRegistry(cloudProvider cloudprovider.CloudProvider, config ClusterStateRegistryConfig, logRecorder *utils.LogEventRecorder, backoff backoff.Backoff, nodeGroupConfigProcessor nodegroupconfig.NodeGroupConfigProcessor) *ClusterStateRegistry {
emptyStatus := &api.ClusterAutoscalerStatus{
@@ -276,7 +288,11 @@ func (csr *ClusterStateRegistry) updateScaleRequests(currentTime time.Time) {
} else {
gpuResource, gpuType = gpu.GetGpuInfoForMetrics(csr.cloudProvider.GetNodeGpuConfig(nodeInfo.Node()), availableGPUTypes, nodeInfo.Node(), scaleUpRequest.NodeGroup)
}
csr.registerFailedScaleUpNoLock(scaleUpRequest.NodeGroup, metrics.Timeout, cloudprovider.OtherErrorClass, "timeout", gpuResource, gpuType, currentTime)
csr.registerFailedScaleUpNoLock(scaleUpRequest.NodeGroup, metrics.Timeout, cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OtherErrorClass,
ErrorCode: "timeout",
ErrorMessage: fmt.Sprintf("Scale-up timed out for node group %v after %v", nodeGroupName, currentTime.Sub(scaleUpRequest.Time)),
}, gpuResource, gpuType, currentTime)
delete(csr.scaleUpRequests, nodeGroupName)
}
}
@@ -291,25 +307,29 @@ func (csr *ClusterStateRegistry) updateScaleRequests(currentTime time.Time) {
}

// To be executed under a lock.
func (csr *ClusterStateRegistry) backoffNodeGroup(nodeGroup cloudprovider.NodeGroup, errorClass cloudprovider.InstanceErrorClass, errorCode string, currentTime time.Time) {
func (csr *ClusterStateRegistry) backoffNodeGroup(nodeGroup cloudprovider.NodeGroup, errorInfo cloudprovider.InstanceErrorInfo, currentTime time.Time) {
nodeGroupInfo := csr.nodeInfosForGroups[nodeGroup.Id()]
backoffUntil := csr.backoff.Backoff(nodeGroup, nodeGroupInfo, errorClass, errorCode, currentTime)
klog.Warningf("Disabling scale-up for node group %v until %v; errorClass=%v; errorCode=%v", nodeGroup.Id(), backoffUntil, errorClass, errorCode)
backoffUntil := csr.backoff.Backoff(nodeGroup, nodeGroupInfo, errorInfo, currentTime)
klog.Warningf("Disabling scale-up for node group %v until %v; errorClass=%v; errorCode=%v", nodeGroup.Id(), backoffUntil, errorInfo.ErrorClass, errorInfo.ErrorCode)
}

// RegisterFailedScaleUp should be called after getting error from cloudprovider
// when trying to scale-up node group. It will mark this group as not safe to autoscale
// for some time.
func (csr *ClusterStateRegistry) RegisterFailedScaleUp(nodeGroup cloudprovider.NodeGroup, reason metrics.FailedScaleUpReason, gpuResourceName, gpuType string, currentTime time.Time) {
func (csr *ClusterStateRegistry) RegisterFailedScaleUp(nodeGroup cloudprovider.NodeGroup, reason metrics.FailedScaleUpReason, errorMessage, gpuResourceName, gpuType string, currentTime time.Time) {
csr.Lock()
defer csr.Unlock()
csr.registerFailedScaleUpNoLock(nodeGroup, reason, cloudprovider.OtherErrorClass, string(reason), gpuResourceName, gpuType, currentTime)
csr.registerFailedScaleUpNoLock(nodeGroup, reason, cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OtherErrorClass,
ErrorCode: string(reason),
ErrorMessage: errorMessage,
}, gpuResourceName, gpuType, currentTime)
}

func (csr *ClusterStateRegistry) registerFailedScaleUpNoLock(nodeGroup cloudprovider.NodeGroup, reason metrics.FailedScaleUpReason, errorClass cloudprovider.InstanceErrorClass, errorCode string, gpuResourceName, gpuType string, currentTime time.Time) {
func (csr *ClusterStateRegistry) registerFailedScaleUpNoLock(nodeGroup cloudprovider.NodeGroup, reason metrics.FailedScaleUpReason, errorInfo cloudprovider.InstanceErrorInfo, gpuResourceName, gpuType string, currentTime time.Time) {
csr.scaleUpFailures[nodeGroup.Id()] = append(csr.scaleUpFailures[nodeGroup.Id()], ScaleUpFailure{NodeGroup: nodeGroup, Reason: reason, Time: currentTime})
metrics.RegisterFailedScaleUp(reason, gpuResourceName, gpuType)
csr.backoffNodeGroup(nodeGroup, errorClass, errorCode, currentTime)
csr.backoffNodeGroup(nodeGroup, errorInfo, currentTime)
}

// UpdateNodes updates the state of the nodes in the ClusterStateRegistry and recalculates the stats
@@ -390,22 +410,36 @@ func (csr *ClusterStateRegistry) IsClusterHealthy() bool {
return true
}

// IsNodeGroupHealthy returns true if the node group health is within the acceptable limits
func (csr *ClusterStateRegistry) IsNodeGroupHealthy(nodeGroupName string) bool {
// GetNodeGroupHealth returns information about whether the node group's health is within the acceptable limits
func (csr *ClusterStateRegistry) GetNodeGroupHealth(nodeGroupName string) NodeGroupHealth {
acceptable, found := csr.acceptableRanges[nodeGroupName]
if !found {
klog.Warningf("Failed to find acceptable ranges for %v", nodeGroupName)
return false
return NodeGroupHealth{
IsHealthy: false,
ErrorInfo: &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OtherErrorClass,
ErrorCode: "NodeGroupUnhealthy",
ErrorMessage: "Failed to find acceptable ranges",
},
}
}

readiness, found := csr.perNodeGroupReadiness[nodeGroupName]
if !found {
// No nodes but target == 0 or just scaling up.
if acceptable.CurrentTarget == 0 || (acceptable.MinNodes == 0 && acceptable.CurrentTarget > 0) {
return true
return NodeGroupHealth{IsHealthy: true}
}
klog.Warningf("Failed to find readiness information for %v", nodeGroupName)
return false
return NodeGroupHealth{
IsHealthy: false,
ErrorInfo: &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OtherErrorClass,
ErrorCode: "NodeGroupUnhealthy",
ErrorMessage: "Failed to find readiness information",
},
}
}

unjustifiedUnready := 0
@@ -418,10 +452,17 @@ func (csr *ClusterStateRegistry) IsNodeGroupHealthy(nodeGroupName string) bool {
if unjustifiedUnready > csr.config.OkTotalUnreadyCount &&
float64(unjustifiedUnready) > csr.config.MaxTotalUnreadyPercentage/100.0*
float64(len(readiness.Ready)+len(readiness.Unready)+len(readiness.NotStarted)) {
return false
return NodeGroupHealth{
IsHealthy: false,
ErrorInfo: &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OtherErrorClass,
ErrorCode: "NodeGroupUnhealthy",
ErrorMessage: "Too many unready nodes",
},
}
}

return true
return NodeGroupHealth{IsHealthy: true}
}

// updateNodeGroupMetrics looks at NodeGroups provided by cloudprovider and updates corresponding metrics
@@ -441,12 +482,14 @@ func (csr *ClusterStateRegistry) updateNodeGroupMetrics() {
metrics.UpdateNodeGroupsCount(autoscaled, autoprovisioned)
}

// IsNodeGroupSafeToScaleUp returns true if node group can be scaled up now.
func (csr *ClusterStateRegistry) IsNodeGroupSafeToScaleUp(nodeGroup cloudprovider.NodeGroup, now time.Time) bool {
if !csr.IsNodeGroupHealthy(nodeGroup.Id()) {
return false
// IsNodeGroupSafeToScaleUp returns information about whether it is currently safe to scale up the node group.
func (csr *ClusterStateRegistry) IsNodeGroupSafeToScaleUp(nodeGroup cloudprovider.NodeGroup, now time.Time) NodeGroupScalingSafety {
nodeGroupStatus := csr.GetNodeGroupHealth(nodeGroup.Id())
if !nodeGroupStatus.IsHealthy {
return NodeGroupScalingSafety{false, nodeGroupStatus.ErrorInfo}
}
return !csr.backoff.IsBackedOff(nodeGroup, csr.nodeInfosForGroups[nodeGroup.Id()], now)
backoffStatus := csr.backoff.GetBackoffStatus(nodeGroup, csr.nodeInfosForGroups[nodeGroup.Id()], now)
return NodeGroupScalingSafety{!backoffStatus.IsBackoff, backoffStatus.ErrorInfo}
}

func (csr *ClusterStateRegistry) getProvisionedAndTargetSizesForNodeGroup(nodeGroupName string) (provisioned, target int, ok bool) {
@@ -756,7 +799,7 @@ func (csr *ClusterStateRegistry) GetStatus(now time.Time) *api.ClusterAutoscaler

// Health.
nodeGroupStatus.Conditions = append(nodeGroupStatus.Conditions, buildHealthStatusNodeGroup(
csr.IsNodeGroupHealthy(nodeGroup.Id()), readiness, acceptable, nodeGroup.MinSize(), nodeGroup.MaxSize()))
csr.GetNodeGroupHealth(nodeGroup.Id()).IsHealthy, readiness, acceptable, nodeGroup.MinSize(), nodeGroup.MaxSize()))

// Scale up.
nodeGroupStatus.Conditions = append(nodeGroupStatus.Conditions, buildScaleUpStatusNodeGroup(
@@ -811,7 +854,7 @@ func buildHealthStatusNodeGroup(isReady bool, readiness Readiness, acceptable Ac
return condition
}

func buildScaleUpStatusNodeGroup(isScaleUpInProgress bool, isSafeToScaleUp bool, readiness Readiness, acceptable AcceptableRange) api.ClusterAutoscalerCondition {
func buildScaleUpStatusNodeGroup(isScaleUpInProgress bool, scaleUpSafety NodeGroupScalingSafety, readiness Readiness, acceptable AcceptableRange) api.ClusterAutoscalerCondition {
condition := api.ClusterAutoscalerCondition{
Type: api.ClusterAutoscalerScaleUp,
Message: fmt.Sprintf("ready=%d cloudProviderTarget=%d",
@@ -821,7 +864,7 @@ func buildScaleUpStatusNodeGroup(isScaleUpInProgress bool, isSafeToScaleUp bool,
}
if isScaleUpInProgress {
condition.Status = api.ClusterAutoscalerInProgress
} else if !isSafeToScaleUp {
} else if !scaleUpSafety.SafeToScale {
condition.Status = api.ClusterAutoscalerBackoff
} else {
condition.Status = api.ClusterAutoscalerNoActivity
@@ -1123,7 +1166,12 @@ func (csr *ClusterStateRegistry) handleInstanceCreationErrorsForNodeGroup(
}
// Decrease the scale up request by the number of deleted nodes
csr.registerOrUpdateScaleUpNoLock(nodeGroup, -len(unseenInstanceIds), currentTime)
csr.registerFailedScaleUpNoLock(nodeGroup, metrics.FailedScaleUpReason(errorCode.code), errorCode.class, errorCode.code, gpuResource, gpuType, currentTime)

csr.registerFailedScaleUpNoLock(nodeGroup, metrics.FailedScaleUpReason(errorCode.code), cloudprovider.InstanceErrorInfo{
ErrorClass: errorCode.class,
ErrorCode: errorCode.code,
ErrorMessage: csr.buildErrorMessageEventString(currentUniqueErrorMessagesForErrorCode[errorCode]),
}, gpuResource, gpuType, currentTime)
}
}
}
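The clusterstate.go call sites above also imply a matching change in the backoff package (one of the changed files not shown in this excerpt): Backoff now receives the full InstanceErrorInfo, and the old boolean IsBackedOff is replaced by GetBackoffStatus, whose result the tests below compare against backoff.BackoffStatus{IsBackoff, ErrorInfo}. The following is an inferred, self-contained sketch of that contract; the stand-in NodeGroup and NodeInfo types and the package name are placeholders, not the real definitions.

```go
// Package backoffsketch is an inferred sketch of the updated backoff contract,
// reconstructed only from the call sites visible in this diff.
package backoffsketch

import "time"

// Stand-ins for cloudprovider.NodeGroup and the scheduler NodeInfo type.
type NodeGroup interface{ Id() string }
type NodeInfo struct{}

// Mirror of cloudprovider.InstanceErrorInfo (ErrorClass simplified to string).
type InstanceErrorInfo struct {
	ErrorClass   string
	ErrorCode    string
	ErrorMessage string
}

// BackoffStatus mirrors the fields the tests assert on.
type BackoffStatus struct {
	IsBackoff bool
	ErrorInfo *InstanceErrorInfo
}

// Backoff is the inferred interface: Backoff records a failure together with
// its error details and returns the time until which the group is backed off;
// GetBackoffStatus replaces the old IsBackedOff and reports those details back.
type Backoff interface {
	Backoff(nodeGroup NodeGroup, nodeInfo *NodeInfo, errorInfo InstanceErrorInfo, currentTime time.Time) time.Time
	GetBackoffStatus(nodeGroup NodeGroup, nodeInfo *NodeInfo, now time.Time) BackoffStatus
}
```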
77 changes: 57 additions & 20 deletions cluster-autoscaler/clusterstate/clusterstate_test.go
@@ -120,7 +120,7 @@ func TestEmptyOK(t *testing.T) {
assert.NoError(t, err)
assert.True(t, clusterstate.IsClusterHealthy())
assert.Empty(t, clusterstate.GetScaleUpFailures())
assert.True(t, clusterstate.IsNodeGroupHealthy("ng1"))
assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{IsHealthy: true})
assert.False(t, clusterstate.IsNodeGroupScalingUp("ng1"))
assert.False(t, clusterstate.HasNodeGroupStartedScaleUp("ng1"))

@@ -132,7 +132,7 @@ func TestEmptyOK(t *testing.T) {

assert.NoError(t, err)
assert.True(t, clusterstate.IsClusterHealthy())
assert.True(t, clusterstate.IsNodeGroupHealthy("ng1"))
assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{IsHealthy: true})
assert.True(t, clusterstate.IsNodeGroupScalingUp("ng1"))
assert.True(t, clusterstate.HasNodeGroupStartedScaleUp("ng1"))
}
@@ -203,7 +203,7 @@ func TestOKOneUnreadyNode(t *testing.T) {
assert.NoError(t, err)
assert.True(t, clusterstate.IsClusterHealthy())
assert.Empty(t, clusterstate.GetScaleUpFailures())
assert.True(t, clusterstate.IsNodeGroupHealthy("ng1"))
assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{IsHealthy: true})

status := clusterstate.GetStatus(now)
assert.Equal(t, api.ClusterAutoscalerHealthy,
@@ -270,7 +270,7 @@ func TestOKOneUnreadyNodeWithScaleDownCandidate(t *testing.T) {
assert.NoError(t, err)
assert.True(t, clusterstate.IsClusterHealthy())
assert.Empty(t, clusterstate.GetScaleUpFailures())
assert.True(t, clusterstate.IsNodeGroupHealthy("ng1"))
assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{IsHealthy: true})

status := clusterstate.GetStatus(now)
assert.Equal(t, api.ClusterAutoscalerHealthy,
@@ -333,7 +333,13 @@ func TestMissingNodes(t *testing.T) {
assert.NoError(t, err)
assert.True(t, clusterstate.IsClusterHealthy())
assert.Empty(t, clusterstate.GetScaleUpFailures())
assert.False(t, clusterstate.IsNodeGroupHealthy("ng1"))
assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{
IsHealthy: false,
ErrorInfo: &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OtherErrorClass,
ErrorCode: "NodeGroupUnhealthy",
ErrorMessage: "Too many unready nodes",
}})

status := clusterstate.GetStatus(now)
assert.Equal(t, api.ClusterAutoscalerHealthy,
@@ -375,7 +381,7 @@ func TestTooManyUnready(t *testing.T) {
assert.NoError(t, err)
assert.False(t, clusterstate.IsClusterHealthy())
assert.Empty(t, clusterstate.GetScaleUpFailures())
assert.True(t, clusterstate.IsNodeGroupHealthy("ng1"))
assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{IsHealthy: true})
}

func TestUnreadyLongAfterCreation(t *testing.T) {
@@ -473,7 +479,13 @@ func TestExpiredScaleUp(t *testing.T) {
err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now)
assert.NoError(t, err)
assert.True(t, clusterstate.IsClusterHealthy())
assert.False(t, clusterstate.IsNodeGroupHealthy("ng1"))
assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{
IsHealthy: false,
ErrorInfo: &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OtherErrorClass,
ErrorCode: "NodeGroupUnhealthy",
ErrorMessage: "Too many unready nodes",
}})
assert.Equal(t, clusterstate.GetScaleUpFailures(), map[string][]ScaleUpFailure{
"ng1": {
{NodeGroup: provider.GetNodeGroup("ng1"), Time: now, Reason: metrics.Timeout},
@@ -922,26 +934,51 @@ func TestScaleUpBackoff(t *testing.T) {
err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng1_2, ng1_3}, nil, now)
assert.NoError(t, err)
assert.True(t, clusterstate.IsClusterHealthy())
assert.True(t, clusterstate.IsNodeGroupHealthy("ng1"))
assert.False(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now))
assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{IsHealthy: true})
assert.Equal(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now), NodeGroupScalingSafety{
SafeToScale: false,
ErrorInfo: &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OtherErrorClass,
ErrorCode: "timeout",
ErrorMessage: "Scale-up timed out for node group ng1 after 3m0s",
}})
assert.Equal(t, clusterstate.backoff.GetBackoffStatus(ng1, nil, now), backoff.BackoffStatus{
IsBackoff: true,
ErrorInfo: &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OtherErrorClass,
ErrorCode: "timeout",
ErrorMessage: "Scale-up timed out for node group ng1 after 3m0s",
}})

// Backoff should expire after timeout
now = now.Add(5 * time.Minute /*InitialNodeGroupBackoffDuration*/).Add(time.Second)
assert.True(t, clusterstate.IsClusterHealthy())
assert.True(t, clusterstate.IsNodeGroupHealthy("ng1"))
assert.True(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now))
assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{IsHealthy: true})
assert.Equal(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now), NodeGroupScalingSafety{SafeToScale: true})

// Another failed scale up should cause longer backoff
clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 1, now.Add(-121*time.Second))

err = clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng1_2, ng1_3}, nil, now)
assert.NoError(t, err)
assert.True(t, clusterstate.IsClusterHealthy())
assert.True(t, clusterstate.IsNodeGroupHealthy("ng1"))
assert.False(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now))
assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{IsHealthy: true})
assert.Equal(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now), NodeGroupScalingSafety{
SafeToScale: false,
ErrorInfo: &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OtherErrorClass,
ErrorCode: "timeout",
ErrorMessage: "Scale-up timed out for node group ng1 after 2m1s",
}})

now = now.Add(5 * time.Minute /*InitialNodeGroupBackoffDuration*/).Add(time.Second)
assert.False(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now))
assert.Equal(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now), NodeGroupScalingSafety{
SafeToScale: false,
ErrorInfo: &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OtherErrorClass,
ErrorCode: "timeout",
ErrorMessage: "Scale-up timed out for node group ng1 after 2m1s",
}})

// The backoff should be cleared after a successful scale-up
clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 1, now)
@@ -951,9 +988,9 @@ func TestScaleUpBackoff(t *testing.T) {
err = clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng1_2, ng1_3, ng1_4}, nil, now)
assert.NoError(t, err)
assert.True(t, clusterstate.IsClusterHealthy())
assert.True(t, clusterstate.IsNodeGroupHealthy("ng1"))
assert.True(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now))
assert.False(t, clusterstate.backoff.IsBackedOff(ng1, nil, now))
assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{IsHealthy: true})
assert.Equal(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now), NodeGroupScalingSafety{SafeToScale: true})
assert.Equal(t, clusterstate.backoff.GetBackoffStatus(ng1, nil, now), backoff.BackoffStatus{IsBackoff: false})
}

func TestGetClusterSize(t *testing.T) {
@@ -1070,9 +1107,9 @@ func TestScaleUpFailures(t *testing.T) {
fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "my-cool-configmap")
clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{}, fakeLogRecorder, newBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute}))

clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng1"), metrics.Timeout, "", "", now)
clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng2"), metrics.Timeout, "", "", now)
clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng1"), metrics.APIError, "", "", now.Add(time.Minute))
clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng1"), metrics.Timeout, "", "", "", now)
clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng2"), metrics.Timeout, "", "", "", now)
clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng1"), metrics.APIError, "", "", "", now.Add(time.Minute))

failures := clusterstate.GetScaleUpFailures()
assert.Equal(t, map[string][]ScaleUpFailure{
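A note on the exact strings the updated TestScaleUpBackoff expects ("after 3m0s", "after 2m1s"): the timeout message built in clusterstate.go formats the elapsed time with %v, so Go's default time.Duration formatting produces them. A minimal standalone check (the ng1 name and the durations are taken from the test above):

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Same format string as the timeout message added in clusterstate.go.
	msg := func(elapsed time.Duration) string {
		return fmt.Sprintf("Scale-up timed out for node group %v after %v", "ng1", elapsed)
	}
	fmt.Println(msg(3 * time.Minute))   // ... after 3m0s
	fmt.Println(msg(121 * time.Second)) // ... after 2m1s
}
```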
2 changes: 1 addition & 1 deletion cluster-autoscaler/core/scaleup/orchestrator/executor.go
@@ -151,7 +151,7 @@ func (e *scaleUpExecutor) executeScaleUp(
if err := info.Group.IncreaseSize(increase); err != nil {
e.autoscalingContext.LogRecorder.Eventf(apiv1.EventTypeWarning, "FailedToScaleUpGroup", "Scale-up failed for group %s: %v", info.Group.Id(), err)
aerr := errors.ToAutoscalerError(errors.CloudProviderError, err).AddPrefix("failed to increase node group size: ")
e.clusterStateRegistry.RegisterFailedScaleUp(info.Group, metrics.FailedScaleUpReason(string(aerr.Type())), gpuResourceName, gpuType, now)
e.clusterStateRegistry.RegisterFailedScaleUp(info.Group, metrics.FailedScaleUpReason(string(aerr.Type())), aerr.Error(), gpuResourceName, gpuType, now)
return aerr
}
e.clusterStateRegistry.RegisterOrUpdateScaleUp(
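In executor.go the only change is that the wrapped error's text is now forwarded as the new errorMessage argument of RegisterFailedScaleUp, so the message recorded for the backoff is the prefixed cloud-provider error. The snippet below approximates what that string looks like; it uses fmt.Errorf as a stand-in for the real errors.ToAutoscalerError(...).AddPrefix(...) helpers, and the quota error is a made-up example.

```go
package main

import (
	"errors"
	"fmt"
)

func main() {
	// Hypothetical cloud-provider failure returned by IncreaseSize.
	providerErr := errors.New("googleapi: Error 403: Insufficient quota")

	// Stand-in for errors.ToAutoscalerError(errors.CloudProviderError, err).
	//	AddPrefix("failed to increase node group size: ").
	aerr := fmt.Errorf("failed to increase node group size: %w", providerErr)

	// Roughly the string passed as errorMessage to RegisterFailedScaleUp and
	// later surfaced through the node group's backoff status.
	fmt.Println(aerr.Error())
}
```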
