Skip to content

Commit

Permalink
Merge pull request kubernetes#1694 from cezarygerard/master
Browse files Browse the repository at this point in the history
Do not restart controller after L4 healthcheck failure, publish metrics instead
  • Loading branch information
k8s-ci-robot authored Apr 13, 2022
2 parents e9fec8e + 9f6f770 commit 2781c36
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 4 deletions.
5 changes: 3 additions & 2 deletions pkg/l4lb/l4controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ import (
const (
// The max tolerated delay between update being enqueued and sync being invoked.
enqueueToSyncDelayThreshold = 15 * time.Minute
l4ILBControllerName = "l4-ilb-subsetting-controller"
)

// L4Controller manages the create/update delete of all L4 Internal LoadBalancer services.
Expand Down Expand Up @@ -132,7 +133,7 @@ func NewILBController(ctx *context.ControllerContext, stopCh chan struct{}) *L4C
})
// TODO enhance this by looking at some metric from service controller to ensure it is up.
// We cannot use existence of a backend service or other resource, since those are on a per-service basis.
ctx.AddHealthCheck("l4-ilb-subsetting-controller", l4c.checkHealth)
ctx.AddHealthCheck(l4ILBControllerName, l4c.checkHealth)
return l4c
}

Expand All @@ -146,7 +147,7 @@ func (l4c *L4Controller) checkHealth() error {
msg := fmt.Sprintf("L4 ILB Sync happened at time %v - %v after enqueue time, threshold is %v", lastSyncTime, lastSyncTime.Sub(lastEnqueueTime), enqueueToSyncDelayThreshold)
// Log here, context/http handler do no log the error.
klog.Error(msg)
return fmt.Errorf(msg)
l4metrics.PublishL4FailedHealthCheckCount(l4ILBControllerName)
}
return nil
}
Expand Down
6 changes: 4 additions & 2 deletions pkg/l4lb/l4netlbcontroller.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ import (
"k8s.io/klog"
)

const l4NetLBControllerName = "l4netlb-controller"

type L4NetLBController struct {
ctx *context.ControllerContext
svcQueue utils.TaskQueue
Expand Down Expand Up @@ -109,7 +111,7 @@ func NewL4NetLBController(
}
},
})
ctx.AddHealthCheck("l4netlb-controller", l4netLBc.checkHealth)
ctx.AddHealthCheck(l4NetLBControllerName, l4netLBc.checkHealth)
return l4netLBc
}

Expand Down Expand Up @@ -265,7 +267,7 @@ func (lc *L4NetLBController) checkHealth() error {
msg := fmt.Sprintf("L4 External LoadBalancer Sync happened at time %v - %v after enqueue time, threshold is %v", lastSyncTime, lastSyncTime.Sub(lastEnqueueTime), enqueueToSyncDelayThreshold)
// Log here, context/http handler do no log the error.
klog.Error(msg)
return fmt.Errorf(msg)
l4metrics.PublishL4FailedHealthCheckCount(l4NetLBControllerName)
}
return nil
}
Expand Down
15 changes: 15 additions & 0 deletions pkg/l4lb/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ const (
L4ilbErrorMetricName = "l4_ilb_sync_error_count"
L4netlbLatencyMetricName = "l4_netlb_sync_duration_seconds"
L4netlbErrorMetricName = "l4_netlb_sync_error_count"
l4failedHealthCheckName = "l4_failed_healthcheck_count"
)

var (
Expand Down Expand Up @@ -80,6 +81,13 @@ var (
},
l4LBSyncErrorMetricLabels,
)
l4FailedHealthCheckCount = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: l4failedHealthCheckName,
Help: "Count l4 controller healthcheck failures",
},
[]string{"controller_name"},
)
)

// init registers l4 ilb nad netlb sync metrics.
Expand All @@ -88,6 +96,8 @@ func init() {
prometheus.MustRegister(l4ILBSyncLatency, l4ILBSyncErrorCount)
klog.V(3).Infof("Registering L4 NetLB controller metrics %v, %v", l4NetLBSyncLatency, l4NetLBSyncErrorCount)
prometheus.MustRegister(l4NetLBSyncLatency, l4NetLBSyncErrorCount)
klog.V(3).Infof("Registering L4 healthcheck failures count metric: %v", l4FailedHealthCheckCount)
prometheus.MustRegister(l4FailedHealthCheckCount)
}

// PublishL4ILBSyncMetrics exports metrics related to the L4 ILB sync.
Expand Down Expand Up @@ -133,3 +143,8 @@ func publishL4NetLBSyncLatency(success bool, syncType string, startTime time.Tim
func publishL4NetLBSyncErrorCount(syncType, gceResource, errorType string) {
l4NetLBSyncErrorCount.WithLabelValues(syncType, gceResource, errorType).Inc()
}

// PublishL4FailedHealthCheckCount observers failed healt check from controller.
func PublishL4FailedHealthCheckCount(controllerName string) {
l4FailedHealthCheckCount.WithLabelValues(controllerName).Inc()
}

0 comments on commit 2781c36

Please sign in to comment.