From c7941877187ed05b24883ea051c366f954660292 Mon Sep 17 00:00:00 2001 From: Jeremy Huntwork Date: Wed, 12 Aug 2020 17:59:22 -0400 Subject: [PATCH] Begin draining a node when it enters `Terminating` state ... and continue with other operations once it reaches `Terminating:Wait` state. As per https://docs.aws.amazon.com/elasticloadbalancing/latest/classic/config-conn-drain.html > If your instances are part of an Auto Scaling group and connection draining > is enabled for your load balancer, Auto Scaling waits for the in-flight > requests to complete, or for the maximum timeout to expire, before > terminating instances due to a scaling event or health check replacement ASGs will begin draining nodes from load balancers while in `Terminating` state, and will even remove them from the load balancer before the node transitions to `Terminating:Wait`. This means if you depend on pod evictions to move a critical service to another available node in the load balancer target group, this will only happen _after_ that node has already been drained and removed from the load balancer. This effect is amplified whenever there is a timeout, or Deregistration delay value set on the load balancer. The default value is 300 seconds, as per here: https://docs.aws.amazon.com/elasticloadbalancing/latest/application/load-balancer-target-groups.html#deregistration-delay By draining sooner, critical pods providing service through that load balancer can move to other nodes and maintain uptime while the node is being deregistered from the load balancer. 
--- pkg/trigger/aws/autoscaling/asg.go | 13 ++++++++++++- pkg/trigger/aws/handler.go | 19 ++++++++++++++----- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/pkg/trigger/aws/autoscaling/asg.go b/pkg/trigger/aws/autoscaling/asg.go index e06247d..853c13d 100644 --- a/pkg/trigger/aws/autoscaling/asg.go +++ b/pkg/trigger/aws/autoscaling/asg.go @@ -12,7 +12,10 @@ import ( const ( // InstanceTerminatingStatus describes EC2 instance termination status - InstanceTerminatingStatus = "Terminating:Wait" + InstanceTerminatingStatus = "Terminating" + + // InstanceTerminatingWaitStatus describes EC2 instance termination:wait status + InstanceTerminatingWaitStatus = "Terminating:Wait" // LifecycleActionResultContinue describes ASG instance lifecycle continue result LifecycleActionResultContinue = "CONTINUE" @@ -99,3 +102,11 @@ func (a *AutoScaling) IsTerminating(status *string) bool { } return *status == InstanceTerminatingStatus } + +// IsTerminatingWait returns true if the provided status is in terminating:wait state +func (a *AutoScaling) IsTerminatingWait(status *string) bool { + if status == nil { + return false + } + return *status == InstanceTerminatingWaitStatus +} diff --git a/pkg/trigger/aws/handler.go b/pkg/trigger/aws/handler.go index 495b17c..1a1ada5 100644 --- a/pkg/trigger/aws/handler.go +++ b/pkg/trigger/aws/handler.go @@ -17,6 +17,7 @@ type HookHandler struct { // Loop starts an infinite handler loop func (h *HookHandler) Loop(nodeName string) { + var drained bool glog.Infof("Running node drainer on node '%s' on instance '%s' in region '%s' and profile '%s'", nodeName, h.AutoScaling.Options.InstanceID, h.AutoScaling.Options.Region, h.AutoScaling.Options.Profile) for { @@ -29,17 +30,25 @@ func (h *HookHandler) Loop(nodeName string) { continue } glog.Infof("Status of instance '%v' is '%v', autoscaling group is '%v'", h.AutoScaling.Options.InstanceID, *status, *autoScalingGroupName) - if !h.AutoScaling.IsTerminating(status) { + if 
!h.AutoScaling.IsTerminating(status) && !h.AutoScaling.IsTerminatingWait(status) { continue } - err = h.Drainer.Drain(nodeName) - if err != nil { - glog.Warningf("Not all pods on this host can be evicted, will try again: %s", err) + if !drained { + err = h.Drainer.Drain(nodeName) + if err != nil { + glog.Warningf("Not all pods on this host can be evicted, will try again: %s", err) + continue + } + drained = true + glog.Info("All evictable pods are gone, waiting to enter Terminating:Wait state") + } + + if !h.AutoScaling.IsTerminatingWait(status) { continue } - glog.Infof("All evictable pods are gone, notifying AutoScalingGroup that instance '%v' can be shutdown", h.AutoScaling.Options.InstanceID) + glog.Infof("Notifying AutoScalingGroup that instance '%v' can be shutdown", h.AutoScaling.Options.InstanceID) lifecycleHookName, err := h.AutoScaling.GetLifecycleHookName(autoScalingGroupName) if err != nil { glog.Warningf("Can not get lifecycle hook, will try again: %s", err)