
Commit 573d8d7

Allow customizable backoff factor and max

Signed-off-by: Pete Saia <iam@petesaia.com>
psaia committed May 22, 2024
1 parent 2824c9a commit 573d8d7
Showing 3 changed files with 33 additions and 21 deletions.
5 changes: 4 additions & 1 deletion api/v1alpha1/healthcheck_types.go
@@ -16,8 +16,9 @@ limitations under the License.
package v1alpha1

import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"reflect"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN!
@@ -34,6 +35,8 @@ type HealthCheckSpec struct {
Level string `json:"level,omitempty"` // defines if a workflow runs in a Namespace or Cluster level
Schedule ScheduleSpec `json:"schedule,omitempty"` // Schedule defines schedule rules to run HealthCheck
RemedyWorkflow RemedyWorkflow `json:"remedyworkflow,omitempty"`
BackoffFactor string `json:"backoffFactor,omitempty"`
BackoffMax int `json:"backoffMax,omitempty"`
RemedyRunsLimit int `json:"remedyRunsLimit,omitempty"`
RemedyResetInterval int `json:"remedyResetInterval,omitempty"`
}
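For illustration only (not part of this commit): a minimal, self-contained sketch of how the two new fields serialize through the json tags added above. The struct below is a local mirror of just those fields and the values are placeholders; as the controller change further down shows, BackoffFactor is parsed to a float64 (defaulting to 0.5) and BackoffMax is interpreted in seconds.

package main

import (
	"encoding/json"
	"fmt"
)

// backoffFields is a local mirror of the two fields added to HealthCheckSpec;
// the json tags are the ones introduced by this commit.
type backoffFields struct {
	BackoffFactor string `json:"backoffFactor,omitempty"`
	BackoffMax    int    `json:"backoffMax,omitempty"`
}

func main() {
	spec := backoffFields{
		BackoffFactor: "0.7", // kept as a string in the spec; the controller parses it with strconv.ParseFloat
		BackoffMax:    120,   // maximum backoff interval, in seconds
	}
	out, err := json.Marshal(spec)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(out)) // {"backoffFactor":"0.7","backoffMax":120}
}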
4 changes: 4 additions & 0 deletions config/crd/bases/activemonitor.keikoproj.io_healthchecks.yaml
@@ -58,6 +58,10 @@ spec:
Either RepeatAfterSec or Schedule must be defined for the health check
to run
properties:
backoffFactor:
type: string
backoffMax:
type: integer
description:
type: string
level:
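Assuming standard CRD handling, users set these under the HealthCheck spec; because backoffFactor is declared as type: string, the value should be quoted in the manifest. The snippet below is a hypothetical sketch (the fragment and values are made up) of reading such a fragment with the sigs.k8s.io/yaml package, which keys on the same json tags as above.

package main

import (
	"fmt"

	"sigs.k8s.io/yaml"
)

// backoffConfig mirrors only the two properties added to the CRD schema above.
type backoffConfig struct {
	BackoffFactor string `json:"backoffFactor"`
	BackoffMax    int    `json:"backoffMax"`
}

func main() {
	// Hypothetical fragment of a HealthCheck spec.
	fragment := []byte("backoffFactor: \"0.5\"\nbackoffMax: 300\n")

	var cfg backoffConfig
	if err := yaml.Unmarshal(fragment, &cfg); err != nil {
		panic(err)
	}
	fmt.Printf("backoffFactor=%s backoffMax=%ds\n", cfg.BackoffFactor, cfg.BackoffMax)
}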
45 changes: 25 additions & 20 deletions internal/controllers/healthcheck_controller.go
@@ -16,6 +16,7 @@ import (
"context"
"errors"
"fmt"
"strconv"
"strings"
"sync"
"time"
@@ -128,7 +129,7 @@ func (r *HealthCheckReconciler) Reconcile(ctx context.Context, req ctrl.Request)
r.RepeatTimersByName = make(map[string]*time.Timer)
}

var healthCheck = &activemonitorv1alpha1.HealthCheck{}
healthCheck := &activemonitorv1alpha1.HealthCheck{}
if err := r.Get(ctx, req.NamespacedName, healthCheck); err != nil {
// if our healthcheck was deleted, this Reconcile method is invoked with an empty resource cache
// see: https://book.kubebuilder.io/cronjob-tutorial/controller-implementation.html#1-load-the-cronjob-by-name
@@ -170,7 +171,6 @@ func (r *HealthCheckReconciler) processOrRecoverHealthCheck(ctx context.Context,
err := r.Update(ctx, &healthCheckNew)
return err
})

if err != nil {
log.Error(err, "Error updating healthcheck resource")
r.Recorder.Event(&healthCheckNew, v1.EventTypeWarning, "Warning", "Error updating healthcheck resource")
@@ -297,7 +297,6 @@ func (r *HealthCheckReconciler) createRBACForWorkflow(ctx context.Context, log l
}
}
if level == healthCheckClusterLevel {

if workFlowType != remedy {
_, err := r.createClusterRole(ctx, r.kubeclient, log, hc, amclusterRole)
if err != nil {
@@ -327,9 +326,7 @@ func (r *HealthCheckReconciler) createRBACForWorkflow(ctx context.Context, log l
return err
}
}

} else if level == healthCheckNamespaceLevel {

if workFlowType != remedy {
_, err := r.createNameSpaceRole(ctx, r.kubeclient, log, hc, amnsRole, wfNamespace)
if err != nil {
@@ -358,7 +355,6 @@ func (r *HealthCheckReconciler) createRBACForWorkflow(ctx context.Context, log l
return err
}
}

} else {
r.Recorder.Event(hc, v1.EventTypeWarning, "Warning", "level is not set")
return errors.New("level is not set")
@@ -513,24 +509,39 @@ func (r *HealthCheckReconciler) watchWorkflowReschedule(ctx context.Context, req
var now metav1.Time
then := metav1.Time{Time: time.Now()}
repeatAfterSec := hc.Spec.RepeatAfterSec
maxTime := time.Duration(hc.Spec.Workflow.Timeout/2) * time.Second
if maxTime <= 0 {
maxTime = time.Second
var maxTime time.Duration
if hc.Spec.BackoffMax == 0 {
maxTime = time.Duration(hc.Spec.Workflow.Timeout/2) * time.Second
if maxTime <= 0 {
maxTime = time.Second
}
} else {
maxTime = time.Duration(hc.Spec.BackoffMax) * time.Second
}
minTime := time.Duration(hc.Spec.Workflow.Timeout/60) * time.Second
if minTime <= 0 {
minTime = time.Second
}
factor := 0.5
if hc.Spec.BackoffFactor != "" {
val, err := strconv.ParseFloat(hc.Spec.BackoffFactor, 64)
if err != nil {
log.Error(err, "Error converting BackoffFactor string to float", err)
} else {
factor = val
}
}
timeout := time.Duration(hc.Spec.Workflow.Timeout) * time.Second
log.Info("IEB with timeout times are", "maxTime:", maxTime, "minTime:", minTime, "timeout:", timeout)
for ieTimer, err1 := iebackoff.NewIEBWithTimeout(maxTime, minTime, timeout, 0.5, time.Now()); ; err1 = ieTimer.Next() {
for ieTimer, err1 := iebackoff.NewIEBWithTimeout(maxTime, minTime, timeout, factor, time.Now()); ; err1 = ieTimer.Next() {
now = metav1.Time{Time: time.Now()}
// grab workflow object by name and check its status; update healthcheck accordingly
// do this once per second until the workflow reaches a terminal state (success/failure)
workflow, err := r.DynClient.Resource(wfGvr).Namespace(wfNamespace).Get(ctx, wfName, metav1.GetOptions{})
if err != nil {
// if the workflow wasn't found, it is most likely the case that its parent healthcheck was deleted
// we can swallow this error and simply not reschedule
r.Recorder.Event(hc, v1.EventTypeWarning, "Warning", "Error attempting to find workflow for healthcheck. This may indicate that either the healthcheck was removed or the Workflow was GC'd before active-monitor could obtain the status")
return ignoreNotFound(err)
}
status, ok := workflow.UnstructuredContent()["status"].(map[string]interface{})
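The hunk above is where the new fields take effect: BackoffMax overrides the Timeout/2 default for the maximum interval, and BackoffFactor replaces the hard-coded 0.5 previously passed to iebackoff.NewIEBWithTimeout. As a sanity check, here is a small, self-contained sketch (the helper name and sample numbers are hypothetical, not from the repository) that mirrors that resolution logic.

package main

import (
	"fmt"
	"strconv"
	"time"
)

// resolveBackoff mirrors the controller code above: derive maxTime from
// BackoffMax (falling back to Timeout/2), minTime from Timeout/60, and the
// factor from BackoffFactor (falling back to 0.5 when unset or unparsable).
func resolveBackoff(workflowTimeoutSec, backoffMaxSec int, backoffFactor string) (time.Duration, time.Duration, float64) {
	var maxTime time.Duration
	if backoffMaxSec == 0 {
		maxTime = time.Duration(workflowTimeoutSec/2) * time.Second
		if maxTime <= 0 {
			maxTime = time.Second
		}
	} else {
		maxTime = time.Duration(backoffMaxSec) * time.Second
	}
	minTime := time.Duration(workflowTimeoutSec/60) * time.Second
	if minTime <= 0 {
		minTime = time.Second
	}
	factor := 0.5
	if backoffFactor != "" {
		if v, err := strconv.ParseFloat(backoffFactor, 64); err == nil {
			factor = v
		}
	}
	return maxTime, minTime, factor
}

func main() {
	// A workflow timeout of 600s with backoffMax 120 and backoffFactor "0.7"
	// yields maxTime=2m0s, minTime=10s, factor=0.7.
	maxTime, minTime, factor := resolveBackoff(600, 120, "0.7")
	fmt.Println(maxTime, minTime, factor)
}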
@@ -666,7 +677,6 @@ func (r *HealthCheckReconciler) watchWorkflowReschedule(ctx context.Context, req
}

func (r *HealthCheckReconciler) processRemedyWorkflow(ctx context.Context, log logr.Logger, wfNamespace string, hc *activemonitorv1alpha1.HealthCheck) error {

log.Info("Creating Remedy Workflow", "namespace", wfNamespace, "generateNamePrefix", hc.Spec.RemedyWorkflow.GenerateName)
err := r.createRBACForWorkflow(ctx, log, hc, remedy)
if err != nil {
@@ -820,7 +830,7 @@ func (r *HealthCheckReconciler) parseWorkflowFromHealthcheck(log logr.Logger, hc
r.workflowLabels = make(map[string]string)
}

//assign instanceId labels to workflows
// assign instanceId labels to workflows
if wflabels == nil {
r.workflowLabels[WfInstanceIdLabelKey] = WfInstanceId
} else {
@@ -842,7 +852,7 @@ func (r *HealthCheckReconciler) parseWorkflowFromHealthcheck(log logr.Logger, hc
r.workflowLabels = make(map[string]string)
}

//assign instanceId labels to workflows
// assign instanceId labels to workflows
r.workflowLabels[WfInstanceIdLabelKey] = WfInstanceId
m1 := metadata{generateName: hc.Spec.Workflow.GenerateName, labels: r.workflowLabels}
data["metadata"] = m1
@@ -929,7 +939,7 @@ func (r *HealthCheckReconciler) parseRemedyWorkflowFromHealthcheck(log logr.Logg
r.workflowLabels = make(map[string]string)
}

//assign instanceId labels to workflows
// assign instanceId labels to workflows
if wflabels == nil {
r.workflowLabels[WfInstanceIdLabelKey] = WfInstanceId
} else {
@@ -951,7 +961,7 @@ func (r *HealthCheckReconciler) parseRemedyWorkflowFromHealthcheck(log logr.Logg
r.workflowLabels = make(map[string]string)
}

//assign instanceId labels to workflows
// assign instanceId labels to workflows
r.workflowLabels[WfInstanceIdLabelKey] = WfInstanceId
m1 := metadata{generateName: hc.Spec.Workflow.GenerateName, labels: r.workflowLabels}
data["metadata"] = m1
@@ -1073,14 +1083,12 @@ func (r *HealthCheckReconciler) createClusterRole(ctx context.Context, clientset
},
Rules: []rbacv1.PolicyRule{
{

APIGroups: []string{"*"},
Resources: []string{"*"},
Verbs: []string{"get", "list", "watch"},
},
},
}, metav1.CreateOptions{})

if err != nil {
return "", err
}
@@ -1113,7 +1121,6 @@ func (r *HealthCheckReconciler) createRemedyClusterRole(ctx context.Context, cli
},
},
}, metav1.CreateOptions{})

if err != nil {
return "", err
}
@@ -1323,7 +1330,6 @@ func (r *HealthCheckReconciler) createClusterRoleBinding(ctx context.Context, cl
log.Info("Successfully Created", "ClusterRoleBinding", crb.Name)
r.Recorder.Event(hc, v1.EventTypeNormal, "Normal", "Successfully Created ClusterRoleBinding")
return crb.Name, nil

}

// Delete ClusterRoleBinding
@@ -1345,7 +1351,6 @@ func (r *HealthCheckReconciler) DeleteClusterRoleBinding(ctx context.Context, cl
}

return nil

}

func (r *HealthCheckReconciler) updateHealthCheckStatus(ctx context.Context, log logr.Logger, hc *activemonitorv1alpha1.HealthCheck) error {
