diff --git a/cluster-autoscaler/clusterstate/clusterstate.go b/cluster-autoscaler/clusterstate/clusterstate.go index 57444a6536e6..abd1d05d6003 100644 --- a/cluster-autoscaler/clusterstate/clusterstate.go +++ b/cluster-autoscaler/clusterstate/clusterstate.go @@ -143,6 +143,18 @@ type ClusterStateRegistry struct { scaleUpFailures map[string][]ScaleUpFailure } +// NodeGroupHealth contains information about the health of the node group. +type NodeGroupHealth struct { + IsHealthy bool + ErrorInfo *cloudprovider.InstanceErrorInfo +} + +// NodeGroupScalingSafety contains information about the safety of the node group to scale up/down. +type NodeGroupScalingSafety struct { + SafeToScale bool + ErrorInfo *cloudprovider.InstanceErrorInfo +} + // NewClusterStateRegistry creates new ClusterStateRegistry. func NewClusterStateRegistry(cloudProvider cloudprovider.CloudProvider, config ClusterStateRegistryConfig, logRecorder *utils.LogEventRecorder, backoff backoff.Backoff, nodeGroupConfigProcessor nodegroupconfig.NodeGroupConfigProcessor) *ClusterStateRegistry { emptyStatus := &api.ClusterAutoscalerStatus{ @@ -276,7 +288,11 @@ func (csr *ClusterStateRegistry) updateScaleRequests(currentTime time.Time) { } else { gpuResource, gpuType = gpu.GetGpuInfoForMetrics(csr.cloudProvider.GetNodeGpuConfig(nodeInfo.Node()), availableGPUTypes, nodeInfo.Node(), scaleUpRequest.NodeGroup) } - csr.registerFailedScaleUpNoLock(scaleUpRequest.NodeGroup, metrics.Timeout, cloudprovider.OtherErrorClass, "timeout", gpuResource, gpuType, currentTime) + csr.registerFailedScaleUpNoLock(scaleUpRequest.NodeGroup, metrics.Timeout, cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OtherErrorClass, + ErrorCode: "timeout", + ErrorMessage: fmt.Sprintf("Scale-up timed out for node group %v after %v", nodeGroupName, currentTime.Sub(scaleUpRequest.Time)), + }, gpuResource, gpuType, currentTime) delete(csr.scaleUpRequests, nodeGroupName) } } @@ -291,25 +307,29 @@ func (csr *ClusterStateRegistry) updateScaleRequests(currentTime time.Time) { } // To be executed under a lock. -func (csr *ClusterStateRegistry) backoffNodeGroup(nodeGroup cloudprovider.NodeGroup, errorClass cloudprovider.InstanceErrorClass, errorCode string, currentTime time.Time) { +func (csr *ClusterStateRegistry) backoffNodeGroup(nodeGroup cloudprovider.NodeGroup, errorInfo cloudprovider.InstanceErrorInfo, currentTime time.Time) { nodeGroupInfo := csr.nodeInfosForGroups[nodeGroup.Id()] - backoffUntil := csr.backoff.Backoff(nodeGroup, nodeGroupInfo, errorClass, errorCode, currentTime) - klog.Warningf("Disabling scale-up for node group %v until %v; errorClass=%v; errorCode=%v", nodeGroup.Id(), backoffUntil, errorClass, errorCode) + backoffUntil := csr.backoff.Backoff(nodeGroup, nodeGroupInfo, errorInfo, currentTime) + klog.Warningf("Disabling scale-up for node group %v until %v; errorClass=%v; errorCode=%v", nodeGroup.Id(), backoffUntil, errorInfo.ErrorClass, errorInfo.ErrorCode) } // RegisterFailedScaleUp should be called after getting error from cloudprovider // when trying to scale-up node group. It will mark this group as not safe to autoscale // for some time. 
-func (csr *ClusterStateRegistry) RegisterFailedScaleUp(nodeGroup cloudprovider.NodeGroup, reason metrics.FailedScaleUpReason, gpuResourceName, gpuType string, currentTime time.Time) { +func (csr *ClusterStateRegistry) RegisterFailedScaleUp(nodeGroup cloudprovider.NodeGroup, reason metrics.FailedScaleUpReason, errorMessage, gpuResourceName, gpuType string, currentTime time.Time) { csr.Lock() defer csr.Unlock() - csr.registerFailedScaleUpNoLock(nodeGroup, reason, cloudprovider.OtherErrorClass, string(reason), gpuResourceName, gpuType, currentTime) + csr.registerFailedScaleUpNoLock(nodeGroup, reason, cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OtherErrorClass, + ErrorCode: string(reason), + ErrorMessage: errorMessage, + }, gpuResourceName, gpuType, currentTime) } -func (csr *ClusterStateRegistry) registerFailedScaleUpNoLock(nodeGroup cloudprovider.NodeGroup, reason metrics.FailedScaleUpReason, errorClass cloudprovider.InstanceErrorClass, errorCode string, gpuResourceName, gpuType string, currentTime time.Time) { +func (csr *ClusterStateRegistry) registerFailedScaleUpNoLock(nodeGroup cloudprovider.NodeGroup, reason metrics.FailedScaleUpReason, errorInfo cloudprovider.InstanceErrorInfo, gpuResourceName, gpuType string, currentTime time.Time) { csr.scaleUpFailures[nodeGroup.Id()] = append(csr.scaleUpFailures[nodeGroup.Id()], ScaleUpFailure{NodeGroup: nodeGroup, Reason: reason, Time: currentTime}) metrics.RegisterFailedScaleUp(reason, gpuResourceName, gpuType) - csr.backoffNodeGroup(nodeGroup, errorClass, errorCode, currentTime) + csr.backoffNodeGroup(nodeGroup, errorInfo, currentTime) } // UpdateNodes updates the state of the nodes in the ClusterStateRegistry and recalculates the stats @@ -390,22 +410,36 @@ func (csr *ClusterStateRegistry) IsClusterHealthy() bool { return true } -// IsNodeGroupHealthy returns true if the node group health is within the acceptable limits -func (csr *ClusterStateRegistry) IsNodeGroupHealthy(nodeGroupName string) bool { +// GetNodeGroupHealth returns information regarding node group health being within the acceptable limits +func (csr *ClusterStateRegistry) GetNodeGroupHealth(nodeGroupName string) NodeGroupHealth { acceptable, found := csr.acceptableRanges[nodeGroupName] if !found { klog.Warningf("Failed to find acceptable ranges for %v", nodeGroupName) - return false + return NodeGroupHealth{ + IsHealthy: false, + ErrorInfo: &cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OtherErrorClass, + ErrorCode: "NodeGroupUnhealthy", + ErrorMessage: "Failed to find acceptable ranges", + }, + } } readiness, found := csr.perNodeGroupReadiness[nodeGroupName] if !found { // No nodes but target == 0 or just scaling up. 
if acceptable.CurrentTarget == 0 || (acceptable.MinNodes == 0 && acceptable.CurrentTarget > 0) { - return true + return NodeGroupHealth{IsHealthy: true} } klog.Warningf("Failed to find readiness information for %v", nodeGroupName) - return false + return NodeGroupHealth{ + IsHealthy: false, + ErrorInfo: &cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OtherErrorClass, + ErrorCode: "NodeGroupUnhealthy", + ErrorMessage: "Failed to find readiness information", + }, + } } unjustifiedUnready := 0 @@ -418,10 +452,17 @@ func (csr *ClusterStateRegistry) IsNodeGroupHealthy(nodeGroupName string) bool { if unjustifiedUnready > csr.config.OkTotalUnreadyCount && float64(unjustifiedUnready) > csr.config.MaxTotalUnreadyPercentage/100.0* float64(len(readiness.Ready)+len(readiness.Unready)+len(readiness.NotStarted)) { - return false + return NodeGroupHealth{ + IsHealthy: false, + ErrorInfo: &cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OtherErrorClass, + ErrorCode: "NodeGroupUnhealthy", + ErrorMessage: "Too many unready nodes", + }, + } } - return true + return NodeGroupHealth{IsHealthy: true} } // updateNodeGroupMetrics looks at NodeGroups provided by cloudprovider and updates corresponding metrics @@ -441,12 +482,14 @@ func (csr *ClusterStateRegistry) updateNodeGroupMetrics() { metrics.UpdateNodeGroupsCount(autoscaled, autoprovisioned) } -// IsNodeGroupSafeToScaleUp returns true if node group can be scaled up now. -func (csr *ClusterStateRegistry) IsNodeGroupSafeToScaleUp(nodeGroup cloudprovider.NodeGroup, now time.Time) bool { - if !csr.IsNodeGroupHealthy(nodeGroup.Id()) { - return false +// IsNodeGroupSafeToScaleUp returns information about node group safety to be scaled up now. +func (csr *ClusterStateRegistry) IsNodeGroupSafeToScaleUp(nodeGroup cloudprovider.NodeGroup, now time.Time) NodeGroupScalingSafety { + nodeGroupStatus := csr.GetNodeGroupHealth(nodeGroup.Id()) + if !nodeGroupStatus.IsHealthy { + return NodeGroupScalingSafety{false, nodeGroupStatus.ErrorInfo} } - return !csr.backoff.IsBackedOff(nodeGroup, csr.nodeInfosForGroups[nodeGroup.Id()], now) + backoffStatus := csr.backoff.GetBackoffStatus(nodeGroup, csr.nodeInfosForGroups[nodeGroup.Id()], now) + return NodeGroupScalingSafety{!backoffStatus.IsBackoff, backoffStatus.ErrorInfo} } func (csr *ClusterStateRegistry) getProvisionedAndTargetSizesForNodeGroup(nodeGroupName string) (provisioned, target int, ok bool) { @@ -756,7 +799,7 @@ func (csr *ClusterStateRegistry) GetStatus(now time.Time) *api.ClusterAutoscaler // Health. nodeGroupStatus.Conditions = append(nodeGroupStatus.Conditions, buildHealthStatusNodeGroup( - csr.IsNodeGroupHealthy(nodeGroup.Id()), readiness, acceptable, nodeGroup.MinSize(), nodeGroup.MaxSize())) + csr.GetNodeGroupHealth(nodeGroup.Id()).IsHealthy, readiness, acceptable, nodeGroup.MinSize(), nodeGroup.MaxSize())) // Scale up. 
nodeGroupStatus.Conditions = append(nodeGroupStatus.Conditions, buildScaleUpStatusNodeGroup( @@ -811,7 +854,7 @@ func buildHealthStatusNodeGroup(isReady bool, readiness Readiness, acceptable Ac return condition } -func buildScaleUpStatusNodeGroup(isScaleUpInProgress bool, isSafeToScaleUp bool, readiness Readiness, acceptable AcceptableRange) api.ClusterAutoscalerCondition { +func buildScaleUpStatusNodeGroup(isScaleUpInProgress bool, scaleUpSafety NodeGroupScalingSafety, readiness Readiness, acceptable AcceptableRange) api.ClusterAutoscalerCondition { condition := api.ClusterAutoscalerCondition{ Type: api.ClusterAutoscalerScaleUp, Message: fmt.Sprintf("ready=%d cloudProviderTarget=%d", @@ -821,7 +864,7 @@ func buildScaleUpStatusNodeGroup(isScaleUpInProgress bool, isSafeToScaleUp bool, } if isScaleUpInProgress { condition.Status = api.ClusterAutoscalerInProgress - } else if !isSafeToScaleUp { + } else if !scaleUpSafety.SafeToScale { condition.Status = api.ClusterAutoscalerBackoff } else { condition.Status = api.ClusterAutoscalerNoActivity @@ -1123,7 +1166,12 @@ func (csr *ClusterStateRegistry) handleInstanceCreationErrorsForNodeGroup( } // Decrease the scale up request by the number of deleted nodes csr.registerOrUpdateScaleUpNoLock(nodeGroup, -len(unseenInstanceIds), currentTime) - csr.registerFailedScaleUpNoLock(nodeGroup, metrics.FailedScaleUpReason(errorCode.code), errorCode.class, errorCode.code, gpuResource, gpuType, currentTime) + + csr.registerFailedScaleUpNoLock(nodeGroup, metrics.FailedScaleUpReason(errorCode.code), cloudprovider.InstanceErrorInfo{ + ErrorClass: errorCode.class, + ErrorCode: errorCode.code, + ErrorMessage: csr.buildErrorMessageEventString(currentUniqueErrorMessagesForErrorCode[errorCode]), + }, gpuResource, gpuType, currentTime) } } } diff --git a/cluster-autoscaler/clusterstate/clusterstate_test.go b/cluster-autoscaler/clusterstate/clusterstate_test.go index 6a74f69617fd..d6a48961d90e 100644 --- a/cluster-autoscaler/clusterstate/clusterstate_test.go +++ b/cluster-autoscaler/clusterstate/clusterstate_test.go @@ -120,7 +120,7 @@ func TestEmptyOK(t *testing.T) { assert.NoError(t, err) assert.True(t, clusterstate.IsClusterHealthy()) assert.Empty(t, clusterstate.GetScaleUpFailures()) - assert.True(t, clusterstate.IsNodeGroupHealthy("ng1")) + assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{IsHealthy: true}) assert.False(t, clusterstate.IsNodeGroupScalingUp("ng1")) assert.False(t, clusterstate.HasNodeGroupStartedScaleUp("ng1")) @@ -132,7 +132,7 @@ func TestEmptyOK(t *testing.T) { assert.NoError(t, err) assert.True(t, clusterstate.IsClusterHealthy()) - assert.True(t, clusterstate.IsNodeGroupHealthy("ng1")) + assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{IsHealthy: true}) assert.True(t, clusterstate.IsNodeGroupScalingUp("ng1")) assert.True(t, clusterstate.HasNodeGroupStartedScaleUp("ng1")) } @@ -203,7 +203,7 @@ func TestOKOneUnreadyNode(t *testing.T) { assert.NoError(t, err) assert.True(t, clusterstate.IsClusterHealthy()) assert.Empty(t, clusterstate.GetScaleUpFailures()) - assert.True(t, clusterstate.IsNodeGroupHealthy("ng1")) + assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{IsHealthy: true}) status := clusterstate.GetStatus(now) assert.Equal(t, api.ClusterAutoscalerHealthy, @@ -270,7 +270,7 @@ func TestOKOneUnreadyNodeWithScaleDownCandidate(t *testing.T) { assert.NoError(t, err) assert.True(t, clusterstate.IsClusterHealthy()) assert.Empty(t, clusterstate.GetScaleUpFailures()) - assert.True(t, 
clusterstate.IsNodeGroupHealthy("ng1")) + assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{IsHealthy: true}) status := clusterstate.GetStatus(now) assert.Equal(t, api.ClusterAutoscalerHealthy, @@ -333,7 +333,13 @@ func TestMissingNodes(t *testing.T) { assert.NoError(t, err) assert.True(t, clusterstate.IsClusterHealthy()) assert.Empty(t, clusterstate.GetScaleUpFailures()) - assert.False(t, clusterstate.IsNodeGroupHealthy("ng1")) + assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{ + IsHealthy: false, + ErrorInfo: &cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OtherErrorClass, + ErrorCode: "NodeGroupUnhealthy", + ErrorMessage: "Too many unready nodes", + }}) status := clusterstate.GetStatus(now) assert.Equal(t, api.ClusterAutoscalerHealthy, @@ -375,7 +381,7 @@ func TestTooManyUnready(t *testing.T) { assert.NoError(t, err) assert.False(t, clusterstate.IsClusterHealthy()) assert.Empty(t, clusterstate.GetScaleUpFailures()) - assert.True(t, clusterstate.IsNodeGroupHealthy("ng1")) + assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{IsHealthy: true}) } func TestUnreadyLongAfterCreation(t *testing.T) { @@ -473,7 +479,13 @@ func TestExpiredScaleUp(t *testing.T) { err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now) assert.NoError(t, err) assert.True(t, clusterstate.IsClusterHealthy()) - assert.False(t, clusterstate.IsNodeGroupHealthy("ng1")) + assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{ + IsHealthy: false, + ErrorInfo: &cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OtherErrorClass, + ErrorCode: "NodeGroupUnhealthy", + ErrorMessage: "Too many unready nodes", + }}) assert.Equal(t, clusterstate.GetScaleUpFailures(), map[string][]ScaleUpFailure{ "ng1": { {NodeGroup: provider.GetNodeGroup("ng1"), Time: now, Reason: metrics.Timeout}, @@ -922,14 +934,27 @@ func TestScaleUpBackoff(t *testing.T) { err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng1_2, ng1_3}, nil, now) assert.NoError(t, err) assert.True(t, clusterstate.IsClusterHealthy()) - assert.True(t, clusterstate.IsNodeGroupHealthy("ng1")) - assert.False(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now)) + assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{IsHealthy: true}) + assert.Equal(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now), NodeGroupScalingSafety{ + SafeToScale: false, + ErrorInfo: &cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OtherErrorClass, + ErrorCode: "timeout", + ErrorMessage: "Scale-up timed out for node group ng1 after 3m0s", + }}) + assert.Equal(t, clusterstate.backoff.GetBackoffStatus(ng1, nil, now), backoff.BackoffStatus{ + IsBackoff: true, + ErrorInfo: &cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OtherErrorClass, + ErrorCode: "timeout", + ErrorMessage: "Scale-up timed out for node group ng1 after 3m0s", + }}) // Backoff should expire after timeout now = now.Add(5 * time.Minute /*InitialNodeGroupBackoffDuration*/).Add(time.Second) assert.True(t, clusterstate.IsClusterHealthy()) - assert.True(t, clusterstate.IsNodeGroupHealthy("ng1")) - assert.True(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now)) + assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{IsHealthy: true}) + assert.Equal(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now), NodeGroupScalingSafety{SafeToScale: true}) // Another failed scale up should cause longer backoff clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 1, 
now.Add(-121*time.Second)) @@ -937,11 +962,23 @@ func TestScaleUpBackoff(t *testing.T) { err = clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng1_2, ng1_3}, nil, now) assert.NoError(t, err) assert.True(t, clusterstate.IsClusterHealthy()) - assert.True(t, clusterstate.IsNodeGroupHealthy("ng1")) - assert.False(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now)) + assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{IsHealthy: true}) + assert.Equal(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now), NodeGroupScalingSafety{ + SafeToScale: false, + ErrorInfo: &cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OtherErrorClass, + ErrorCode: "timeout", + ErrorMessage: "Scale-up timed out for node group ng1 after 2m1s", + }}) now = now.Add(5 * time.Minute /*InitialNodeGroupBackoffDuration*/).Add(time.Second) - assert.False(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now)) + assert.Equal(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now), NodeGroupScalingSafety{ + SafeToScale: false, + ErrorInfo: &cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OtherErrorClass, + ErrorCode: "timeout", + ErrorMessage: "Scale-up timed out for node group ng1 after 2m1s", + }}) // The backoff should be cleared after a successful scale-up clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 1, now) @@ -951,9 +988,9 @@ func TestScaleUpBackoff(t *testing.T) { err = clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng1_2, ng1_3, ng1_4}, nil, now) assert.NoError(t, err) assert.True(t, clusterstate.IsClusterHealthy()) - assert.True(t, clusterstate.IsNodeGroupHealthy("ng1")) - assert.True(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now)) - assert.False(t, clusterstate.backoff.IsBackedOff(ng1, nil, now)) + assert.Equal(t, clusterstate.GetNodeGroupHealth("ng1"), NodeGroupHealth{IsHealthy: true}) + assert.Equal(t, clusterstate.IsNodeGroupSafeToScaleUp(ng1, now), NodeGroupScalingSafety{SafeToScale: true}) + assert.Equal(t, clusterstate.backoff.GetBackoffStatus(ng1, nil, now), backoff.BackoffStatus{IsBackoff: false}) } func TestGetClusterSize(t *testing.T) { @@ -1070,9 +1107,9 @@ func TestScaleUpFailures(t *testing.T) { fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "my-cool-configmap") clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{}, fakeLogRecorder, newBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute})) - clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng1"), metrics.Timeout, "", "", now) - clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng2"), metrics.Timeout, "", "", now) - clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng1"), metrics.APIError, "", "", now.Add(time.Minute)) + clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng1"), metrics.Timeout, "", "", "", now) + clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng2"), metrics.Timeout, "", "", "", now) + clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng1"), metrics.APIError, "", "", "", now.Add(time.Minute)) failures := clusterstate.GetScaleUpFailures() assert.Equal(t, map[string][]ScaleUpFailure{ diff --git a/cluster-autoscaler/core/scaleup/orchestrator/executor.go b/cluster-autoscaler/core/scaleup/orchestrator/executor.go index 8f787e6cecce..ea433b2a5d56 100644 --- a/cluster-autoscaler/core/scaleup/orchestrator/executor.go +++ 
b/cluster-autoscaler/core/scaleup/orchestrator/executor.go
@@ -151,7 +151,7 @@ func (e *scaleUpExecutor) executeScaleUp(
 	if err := info.Group.IncreaseSize(increase); err != nil {
 		e.autoscalingContext.LogRecorder.Eventf(apiv1.EventTypeWarning, "FailedToScaleUpGroup", "Scale-up failed for group %s: %v", info.Group.Id(), err)
 		aerr := errors.ToAutoscalerError(errors.CloudProviderError, err).AddPrefix("failed to increase node group size: ")
-		e.clusterStateRegistry.RegisterFailedScaleUp(info.Group, metrics.FailedScaleUpReason(string(aerr.Type())), gpuResourceName, gpuType, now)
+		e.clusterStateRegistry.RegisterFailedScaleUp(info.Group, metrics.FailedScaleUpReason(string(aerr.Type())), aerr.Error(), gpuResourceName, gpuType, now)
 		return aerr
 	}
 	e.clusterStateRegistry.RegisterOrUpdateScaleUp(
diff --git a/cluster-autoscaler/core/scaleup/orchestrator/orchestrator.go b/cluster-autoscaler/core/scaleup/orchestrator/orchestrator.go
index 7b07e7652a55..1c775e9d4120 100644
--- a/cluster-autoscaler/core/scaleup/orchestrator/orchestrator.go
+++ b/cluster-autoscaler/core/scaleup/orchestrator/orchestrator.go
@@ -569,9 +569,9 @@ func (o *ScaleUpOrchestrator) UpcomingNodes(nodeInfos map[string]*schedulerframe
 // IsNodeGroupReadyToScaleUp returns nil if node group is ready to be scaled up, otherwise a reason is provided.
 func (o *ScaleUpOrchestrator) IsNodeGroupReadyToScaleUp(nodeGroup cloudprovider.NodeGroup, now time.Time) *SkippedReasons {
 	// Non-existing node groups are created later so skip check for them.
-	if nodeGroup.Exist() && !o.clusterStateRegistry.IsNodeGroupSafeToScaleUp(nodeGroup, now) {
+	if nodeGroup.Exist() && !o.clusterStateRegistry.IsNodeGroupSafeToScaleUp(nodeGroup, now).SafeToScale {
 		// Hack that depends on internals of IsNodeGroupSafeToScaleUp.
-		if !o.clusterStateRegistry.IsNodeGroupHealthy(nodeGroup.Id()) {
+		if !o.clusterStateRegistry.GetNodeGroupHealth(nodeGroup.Id()).IsHealthy {
 			klog.Warningf("Node group %s is not ready for scaleup - unhealthy", nodeGroup.Id())
 			return NotReadyReason
 		}
@@ -658,7 +658,7 @@ func (o *ScaleUpOrchestrator) ComputeSimilarNodeGroups(
 	var validSimilarNodeGroups []cloudprovider.NodeGroup
 	for _, ng := range similarNodeGroups {
 		// Non-existing node groups are created later so skip check for them.
-		if ng.Exist() && !o.clusterStateRegistry.IsNodeGroupSafeToScaleUp(ng, now) {
+		if ng.Exist() && !o.clusterStateRegistry.IsNodeGroupSafeToScaleUp(ng, now).SafeToScale {
 			klog.V(2).Infof("Ignoring node group %s when balancing: group is not ready for scaleup", ng.Id())
 		} else if similarSchedulablePods, found := schedulablePods[ng.Id()]; found && matchingSchedulablePods(groupSchedulablePods, similarSchedulablePods) {
 			validSimilarNodeGroups = append(validSimilarNodeGroups, ng)
diff --git a/cluster-autoscaler/utils/backoff/backoff.go b/cluster-autoscaler/utils/backoff/backoff.go
index d42df155eabf..62fdb09a4d6a 100644
--- a/cluster-autoscaler/utils/backoff/backoff.go
+++ b/cluster-autoscaler/utils/backoff/backoff.go
@@ -23,12 +23,18 @@ import (
 	schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
 )
 
+// BackoffStatus contains information about the backoff status of a node group.
+type BackoffStatus struct {
+	IsBackoff bool
+	ErrorInfo *cloudprovider.InstanceErrorInfo
+}
+
 // Backoff allows time-based backing off of node groups considered in scale up algorithm
 type Backoff interface {
 	// Backoff execution for the given node group. Returns time till execution is backed off.
- Backoff(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo, errorClass cloudprovider.InstanceErrorClass, errorCode string, currentTime time.Time) time.Time - // IsBackedOff returns true if execution is backed off for the given node group. - IsBackedOff(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo, currentTime time.Time) bool + Backoff(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo, errorInfo cloudprovider.InstanceErrorInfo, currentTime time.Time) time.Time + // GetBackoffStatus returns whether the execution is backed off for the given node group and error info when the node group is backed off. + GetBackoffStatus(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo, currentTime time.Time) BackoffStatus // RemoveBackoff removes backoff data for the given node group. RemoveBackoff(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo) // RemoveStaleBackoffData removes stale backoff data. diff --git a/cluster-autoscaler/utils/backoff/exponential_backoff.go b/cluster-autoscaler/utils/backoff/exponential_backoff.go index 013273bc0587..cdcdef0eb716 100644 --- a/cluster-autoscaler/utils/backoff/exponential_backoff.go +++ b/cluster-autoscaler/utils/backoff/exponential_backoff.go @@ -37,6 +37,7 @@ type exponentialBackoffInfo struct { duration time.Duration backoffUntil time.Time lastFailedExecution time.Time + errorInfo cloudprovider.InstanceErrorInfo } // NewExponentialBackoff creates an instance of exponential backoff. @@ -66,7 +67,7 @@ func NewIdBasedExponentialBackoff(initialBackoffDuration time.Duration, maxBacko } // Backoff execution for the given node group. Returns time till execution is backed off. -func (b *exponentialBackoff) Backoff(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo, errorClass cloudprovider.InstanceErrorClass, errorCode string, currentTime time.Time) time.Time { +func (b *exponentialBackoff) Backoff(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo, errorInfo cloudprovider.InstanceErrorInfo, currentTime time.Time) time.Time { duration := b.initialBackoffDuration key := b.nodeGroupKey(nodeGroup) if backoffInfo, found := b.backoffInfo[key]; found { @@ -87,14 +88,21 @@ func (b *exponentialBackoff) Backoff(nodeGroup cloudprovider.NodeGroup, nodeInfo duration: duration, backoffUntil: backoffUntil, lastFailedExecution: currentTime, + errorInfo: errorInfo, } return backoffUntil } -// IsBackedOff returns true if execution is backed off for the given node group. -func (b *exponentialBackoff) IsBackedOff(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo, currentTime time.Time) bool { +// GetBackoffStatus returns whether the execution is backed off for the given node group and error info when the node group is backed off. +func (b *exponentialBackoff) GetBackoffStatus(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo, currentTime time.Time) BackoffStatus { backoffInfo, found := b.backoffInfo[b.nodeGroupKey(nodeGroup)] - return found && backoffInfo.backoffUntil.After(currentTime) + if !found || !backoffInfo.backoffUntil.After(currentTime) { + return BackoffStatus{IsBackoff: false} + } + return BackoffStatus{ + IsBackoff: true, + ErrorInfo: &backoffInfo.errorInfo, + } } // RemoveBackoff removes backoff data for the given node group. 
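Taken together, the backoff.go and exponential_backoff.go changes replace the boolean IsBackedOff check with a status object that also carries the error that caused the backoff. The sketch below is a minimal, self-contained illustration of that pattern; the type and field names are simplified stand-ins, not the autoscaler's real cloudprovider or backoff packages.

```go
package main

import (
	"fmt"
	"time"
)

// Simplified stand-ins for the real cloudprovider/backoff types.
type InstanceErrorInfo struct {
	ErrorClass   string
	ErrorCode    string
	ErrorMessage string
}

type BackoffStatus struct {
	IsBackoff bool
	ErrorInfo *InstanceErrorInfo
}

type backoffEntry struct {
	backoffUntil time.Time
	errorInfo    InstanceErrorInfo
}

type backoffStore struct {
	entries map[string]backoffEntry
}

func newBackoffStore() *backoffStore {
	return &backoffStore{entries: map[string]backoffEntry{}}
}

// Backoff records the error that triggered the backoff alongside the deadline,
// mirroring how exponentialBackoffInfo now keeps errorInfo.
func (b *backoffStore) Backoff(nodeGroupID string, errorInfo InstanceErrorInfo, until time.Time) {
	b.entries[nodeGroupID] = backoffEntry{backoffUntil: until, errorInfo: errorInfo}
}

// GetBackoffStatus replaces a plain boolean check: callers learn both whether
// the group is backed off and why.
func (b *backoffStore) GetBackoffStatus(nodeGroupID string, now time.Time) BackoffStatus {
	entry, found := b.entries[nodeGroupID]
	if !found || !entry.backoffUntil.After(now) {
		return BackoffStatus{IsBackoff: false}
	}
	return BackoffStatus{IsBackoff: true, ErrorInfo: &entry.errorInfo}
}

func main() {
	store := newBackoffStore()
	now := time.Now()
	store.Backoff("ng1", InstanceErrorInfo{
		ErrorClass:   "OutOfResourcesErrorClass",
		ErrorCode:    "QUOTA_EXCEEDED",
		ErrorMessage: "Not enough CPU",
	}, now.Add(5*time.Minute))

	if status := store.GetBackoffStatus("ng1", now); status.IsBackoff {
		fmt.Printf("ng1 backed off: %s (%s)\n", status.ErrorInfo.ErrorCode, status.ErrorInfo.ErrorMessage)
	}
}
```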
diff --git a/cluster-autoscaler/utils/backoff/exponential_backoff_test.go b/cluster-autoscaler/utils/backoff/exponential_backoff_test.go
index 2fd9d8319685..e948718dffeb 100644
--- a/cluster-autoscaler/utils/backoff/exponential_backoff_test.go
+++ b/cluster-autoscaler/utils/backoff/exponential_backoff_test.go
@@ -35,45 +35,58 @@ func nodeGroup(id string) cloudprovider.NodeGroup {
 var nodeGroup1 = nodeGroup("id1")
 var nodeGroup2 = nodeGroup("id2")
 
+var quotaError = cloudprovider.InstanceErrorInfo{ErrorClass: cloudprovider.OutOfResourcesErrorClass, ErrorCode: "QUOTA_EXCEEDED", ErrorMessage: "Not enough CPU"}
+var ipSpaceExhaustedError = cloudprovider.InstanceErrorInfo{ErrorClass: cloudprovider.OtherErrorClass, ErrorCode: "IP_SPACE_EXHAUSTED", ErrorMessage: "IP space has been exhausted"}
+
+var noBackOff = BackoffStatus{IsBackoff: false}
+var backoffWithQuotaError = BackoffStatus{
+	IsBackoff: true,
+	ErrorInfo: &quotaError,
+}
+var backoffWithIpSpaceExhaustedError = BackoffStatus{
+	IsBackoff: true,
+	ErrorInfo: &ipSpaceExhaustedError,
+}
+
 func TestBackoffTwoKeys(t *testing.T) {
 	backoff := NewIdBasedExponentialBackoff(10*time.Minute, time.Hour, 3*time.Hour)
 	startTime := time.Now()
-	assert.False(t, backoff.IsBackedOff(nodeGroup1, nil, startTime))
-	assert.False(t, backoff.IsBackedOff(nodeGroup2, nil, startTime))
-	backoff.Backoff(nodeGroup1, nil, cloudprovider.OtherErrorClass, "", startTime.Add(time.Minute))
-	assert.True(t, backoff.IsBackedOff(nodeGroup1, nil, startTime.Add(2*time.Minute)))
-	assert.False(t, backoff.IsBackedOff(nodeGroup2, nil, startTime))
-	assert.False(t, backoff.IsBackedOff(nodeGroup1, nil, startTime.Add(11*time.Minute)))
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, startTime), noBackOff)
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup2, nil, startTime), noBackOff)
+	backoff.Backoff(nodeGroup1, nil, quotaError, startTime.Add(time.Minute))
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, startTime.Add(2*time.Minute)), backoffWithQuotaError)
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup2, nil, startTime), noBackOff)
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, startTime.Add(11*time.Minute)), noBackOff)
 }
 
 func TestMaxBackoff(t *testing.T) {
 	backoff := NewIdBasedExponentialBackoff(1*time.Minute, 3*time.Minute, 3*time.Hour)
 	startTime := time.Now()
-	backoff.Backoff(nodeGroup1, nil, cloudprovider.OtherErrorClass, "", startTime)
-	assert.True(t, backoff.IsBackedOff(nodeGroup1, nil, startTime))
-	assert.False(t, backoff.IsBackedOff(nodeGroup1, nil, startTime.Add(1*time.Minute)))
-	backoff.Backoff(nodeGroup1, nil, cloudprovider.OtherErrorClass, "", startTime.Add(1*time.Minute))
-	assert.True(t, backoff.IsBackedOff(nodeGroup1, nil, startTime.Add(1*time.Minute)))
-	assert.False(t, backoff.IsBackedOff(nodeGroup1, nil, startTime.Add(3*time.Minute)))
-	backoff.Backoff(nodeGroup1, nil, cloudprovider.OtherErrorClass, "", startTime.Add(3*time.Minute))
-	assert.True(t, backoff.IsBackedOff(nodeGroup1, nil, startTime.Add(3*time.Minute)))
-	assert.False(t, backoff.IsBackedOff(nodeGroup1, nil, startTime.Add(6*time.Minute)))
+	backoff.Backoff(nodeGroup1, nil, ipSpaceExhaustedError, startTime)
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, startTime), backoffWithIpSpaceExhaustedError)
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, startTime.Add(1*time.Minute)), noBackOff)
+	backoff.Backoff(nodeGroup1, nil, ipSpaceExhaustedError, startTime.Add(1*time.Minute))
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, startTime.Add(1*time.Minute)), backoffWithIpSpaceExhaustedError)
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, startTime.Add(3*time.Minute)), noBackOff)
+	backoff.Backoff(nodeGroup1, nil, ipSpaceExhaustedError, startTime.Add(3*time.Minute))
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, startTime.Add(3*time.Minute)), backoffWithIpSpaceExhaustedError)
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, startTime.Add(6*time.Minute)), noBackOff)
 }
 
 func TestRemoveBackoff(t *testing.T) {
 	backoff := NewIdBasedExponentialBackoff(1*time.Minute, 3*time.Minute, 3*time.Hour)
 	startTime := time.Now()
-	backoff.Backoff(nodeGroup1, nil, cloudprovider.OtherErrorClass, "", startTime)
-	assert.True(t, backoff.IsBackedOff(nodeGroup1, nil, startTime))
+	backoff.Backoff(nodeGroup1, nil, quotaError, startTime)
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, startTime), backoffWithQuotaError)
 	backoff.RemoveBackoff(nodeGroup1, nil)
-	assert.False(t, backoff.IsBackedOff(nodeGroup1, nil, startTime))
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, startTime), noBackOff)
 }
 
 func TestResetStaleBackoffData(t *testing.T) {
 	backoff := NewIdBasedExponentialBackoff(1*time.Minute, 3*time.Minute, 3*time.Hour)
 	startTime := time.Now()
-	backoff.Backoff(nodeGroup1, nil, cloudprovider.OtherErrorClass, "", startTime)
-	backoff.Backoff(nodeGroup2, nil, cloudprovider.OtherErrorClass, "", startTime.Add(time.Hour))
+	backoff.Backoff(nodeGroup1, nil, quotaError, startTime)
+	backoff.Backoff(nodeGroup2, nil, quotaError, startTime.Add(time.Hour))
 	backoff.RemoveStaleBackoffData(startTime.Add(time.Hour))
 	assert.Equal(t, 2, len(backoff.(*exponentialBackoff).backoffInfo))
 	backoff.RemoveStaleBackoffData(startTime.Add(4 * time.Hour))
@@ -84,20 +97,22 @@ func TestResetStaleBackoffData(t *testing.T) {
 
 func TestIncreaseExistingBackoff(t *testing.T) {
 	backoff := NewIdBasedExponentialBackoff(1*time.Second, 10*time.Minute, 3*time.Hour)
-	startTime := time.Now()
-	backoff.Backoff(nodeGroup1, nil, cloudprovider.OtherErrorClass, "", startTime)
+	currentTime := time.Now()
+	backoff.Backoff(nodeGroup1, nil, quotaError, currentTime)
 	// NG in backoff for one second here
-	assert.True(t, backoff.IsBackedOff(nodeGroup1, nil, startTime))
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, currentTime), backoffWithQuotaError)
 	// Come out of backoff
-	time.Sleep(1 * time.Second)
-	assert.False(t, backoff.IsBackedOff(nodeGroup1, nil, time.Now()))
-	// Confirm existing backoff duration has been increased by backing off again
-	backoff.Backoff(nodeGroup1, nil, cloudprovider.OtherErrorClass, "", time.Now())
+	currentTime = currentTime.Add(1 * time.Second)
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, currentTime), noBackOff)
+	// Confirm that backing off again increases the existing backoff duration and updates the error info
+	backoff.Backoff(nodeGroup1, nil, ipSpaceExhaustedError, currentTime)
 	// Backoff should be for 2 seconds now
-	assert.True(t, backoff.IsBackedOff(nodeGroup1, nil, time.Now()))
-	time.Sleep(1 * time.Second)
-	assert.True(t, backoff.IsBackedOff(nodeGroup1, nil, time.Now()))
-	time.Sleep(1 * time.Second)
-	assert.False(t, backoff.IsBackedOff(nodeGroup1, nil, time.Now()))
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, currentTime), backoffWithIpSpaceExhaustedError)
+	currentTime = currentTime.Add(1 * time.Second)
+	// Backing off during an existing backoff should update the error info but not change the duration.
+	backoff.Backoff(nodeGroup1, nil, quotaError, currentTime)
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, currentTime), backoffWithQuotaError)
+	currentTime = currentTime.Add(1 * time.Second)
+	assert.Equal(t, backoff.GetBackoffStatus(nodeGroup1, nil, currentTime), noBackOff)
 	// Result: existing backoff duration was scaled up beyond initial duration
 }
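For context, here is a hedged sketch of how a caller such as the scale-up orchestrator can consume the new struct-valued results instead of bare booleans. The helper function and the simplified local types below are illustrative stand-ins only, not the actual ClusterStateRegistry API.

```go
package main

import "fmt"

// Simplified stand-ins mirroring the shapes introduced in this change.
type InstanceErrorInfo struct {
	ErrorClass   string
	ErrorCode    string
	ErrorMessage string
}

type NodeGroupHealth struct {
	IsHealthy bool
	ErrorInfo *InstanceErrorInfo
}

type NodeGroupScalingSafety struct {
	SafeToScale bool
	ErrorInfo   *InstanceErrorInfo
}

// checkScaleUp mirrors the shape of IsNodeGroupReadyToScaleUp after the change:
// instead of two booleans, the caller gets structured reasons it can log or emit.
func checkScaleUp(health NodeGroupHealth, safety NodeGroupScalingSafety, nodeGroupID string) {
	switch {
	case !health.IsHealthy:
		fmt.Printf("node group %s not ready for scale-up: unhealthy (%s)\n", nodeGroupID, health.ErrorInfo.ErrorMessage)
	case !safety.SafeToScale:
		fmt.Printf("node group %s backed off: %s: %s\n", nodeGroupID, safety.ErrorInfo.ErrorCode, safety.ErrorInfo.ErrorMessage)
	default:
		fmt.Printf("node group %s is ready for scale-up\n", nodeGroupID)
	}
}

func main() {
	backedOff := NodeGroupScalingSafety{
		SafeToScale: false,
		ErrorInfo: &InstanceErrorInfo{
			ErrorClass:   "OtherErrorClass",
			ErrorCode:    "timeout",
			ErrorMessage: "Scale-up timed out for node group ng1 after 3m0s",
		},
	}
	checkScaleUp(NodeGroupHealth{IsHealthy: true}, backedOff, "ng1")
}
```

Returning structured reasons this way lets the registry surface the original cloud-provider error (quota exceeded, IP space exhausted, timeout) in status conditions and events instead of only a bare backoff flag.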