Skip to content

Commit

Permalink
daemon/drain: bump draining to error after 1 hour of attempts
Browse files Browse the repository at this point in the history
Refactor drain to remove exponential backoff, which does not
scale well with larger values (hours).
  • Loading branch information
kikisdeliveryservice committed Jun 4, 2021
1 parent b7553ab commit 88cca46
Showing 1 changed file with 29 additions and 25 deletions.
54 changes: 29 additions & 25 deletions pkg/daemon/drain.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,34 +45,38 @@ func (dn *Daemon) cordonOrUncordonNode(desired bool) error {
}

// drain tries to drain this daemon's node with drain.RunNodeDrain, retrying
// when an attempt fails, and returns an error once it gives up.
//
// NOTE(review): this span comes from a commit diff whose +/- markers and
// indentation were lost, so two implementations are interleaved and the text
// does not compile as written. Going by the commit message ("remove
// exponential backoff", "error after 1 hour"), the wait.ExponentialBackoff
// lines appear to be the removed version and the channel/goroutine lines the
// added one -- confirm against the commit before relying on this reading.
func (dn *Daemon) drain() error {
// Backoff variant (presumably removed): 5 steps, starting at 10 seconds,
// with a growth factor of 2.
backoff := wait.Backoff{
Steps: 5,
Duration: 10 * time.Second,
Factor: 2,
}
// lastErr keeps the most recent drain failure for the final report.
var lastErr error
if err := wait.ExponentialBackoff(backoff, func() (bool, error) {
err := drain.RunNodeDrain(dn.drainer, dn.node.Name)
if err != nil {
lastErr = err
glog.Infof("Draining failed with: %v, retrying", err)
// (false, nil): not done and no fatal error, so the backoff helper
// is expected to call this function again.
return false, nil
}
return true, nil
// Channel variant (presumably added): done carries the stop signal to the
// retry goroutine. Capacity 1 lets the timeout branch below send without
// waiting for the goroutine to be ready to receive.
done := make(chan bool, 1)

// Backoff variant: error reporting once the retry helper returns an error.
}); err != nil {
if err == wait.ErrWaitTimeout {
// Timed out: record the step count in the MCDDrainErr metric, emit a
// FailedToDrain warning event, and return the last drain error wrapped.
failMsg := fmt.Sprintf("%d tries: %v", backoff.Steps, lastErr)
MCDDrainErr.WithLabelValues(dn.node.Name, "WaitTimeout").Set(float64(backoff.Steps))
dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeWarning, "FailedToDrain", failMsg)
return errors.Wrapf(lastErr, "failed to drain node (%d tries): %v", backoff.Steps, err)
}
// Any other error is labelled "UnknownError" in the metric.
MCDDrainErr.WithLabelValues(dn.node.Name, "UnknownError").Set(float64(backoff.Steps))
dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeWarning, "FailedToDrain", err.Error())
return errors.Wrap(err, "failed to drain node")
// Channel variant: drainer starts a goroutine that retries the drain and
// returns a channel that is closed once an attempt succeeds. No error value
// is ever sent on that channel.
drainer := func() chan error {
ret := make(chan error)
go func() {
for {
select {
case <-done:
// Stop signal received; exit without closing ret.
return
default:
// done is only polled between attempts, so a stop signal is noticed
// after the current RunNodeDrain call (and, on failure, the sleep
// that follows it) has finished.
if err := drain.RunNodeDrain(dn.drainer, dn.node.Name); err != nil {
glog.Infof("Draining failed with: %v, retrying", err)
// Fixed 5-minute pause between attempts.
time.Sleep(5 * time.Minute)
continue
}
// Success: closing ret unblocks the receive in the select below.
close(ret)
return
}
}
}()
return ret
}

// NOTE(review): likely the removed version's success return -- confirm.
return nil
// Channel variant: whichever happens first decides the result -- the 1-hour
// timer firing or the drainer channel being closed.
select {
case <-time.After(1 * time.Hour):
// Ask the retry goroutine to stop, then report the failure as a
// FailedToDrain warning event and a returned error. Unlike the backoff
// variant, this path does not set MCDDrainErr and does not include the
// last drain error in the message.
done <- true
failMsg := fmt.Sprintf("failed to drain node : %s after 1 hour", dn.node.Name)
dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeWarning, "FailedToDrain", failMsg)
return errors.New(failMsg)
case <-drainer():
return nil
}
}

func (dn *Daemon) performDrain() error {
Expand Down

0 comments on commit 88cca46

Please sign in to comment.