From c7941877187ed05b24883ea051c366f954660292 Mon Sep 17 00:00:00 2001
From: Jeremy Huntwork <jhuntwork@lightcubesolutions.com>
Date: Wed, 12 Aug 2020 17:59:22 -0400
Subject: [PATCH] Begin draining a node when it enters `Terminating` state

... and continue with other operations once it reaches
`Terminating:Wait` state.

As per https://docs.aws.amazon.com/elasticloadbalancing/latest/classic/config-conn-drain.html

> If your instances are part of an Auto Scaling group and connection draining
> is enabled for your load balancer, Auto Scaling waits for the in-flight
> requests to complete, or for the maximum timeout to expire, before
> terminating instances due to a scaling event or health check replacement

ASGs will begin draining nodes from load balancers while in
`Terminating` state, and will even remove them from the load balancer
before the node transitions to `Terminating:Wait`. This means if you
depend on pod evictions to move a critical service to another available
node in the load balancer target group, this will only happen _after_
that node has already been drained and removed from the load balancer.

This effect is amplified whenever a connection draining timeout or
deregistration delay is set on the load balancer. The default
deregistration delay is 300 seconds, as per:
https://docs.aws.amazon.com/elasticloadbalancing/latest/application/load-balancer-target-groups.html#deregistration-delay

By draining sooner, critical pods providing service through that load
balancer can move to other nodes and maintain uptime while the node is
being deregistered from the load balancer.
---
 pkg/trigger/aws/autoscaling/asg.go | 13 ++++++++++++-
 pkg/trigger/aws/handler.go         | 19 ++++++++++++++-----
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/pkg/trigger/aws/autoscaling/asg.go b/pkg/trigger/aws/autoscaling/asg.go
index e06247d..853c13d 100644
--- a/pkg/trigger/aws/autoscaling/asg.go
+++ b/pkg/trigger/aws/autoscaling/asg.go
@@ -12,7 +12,10 @@ import (
 
 const (
 	// InstanceTerminatingStatus describes EC2 instance termination status
-	InstanceTerminatingStatus = "Terminating:Wait"
+	InstanceTerminatingStatus = "Terminating"
+
+	// InstanceTerminatingWaitStatus describes the EC2 instance Terminating:Wait lifecycle status
+	InstanceTerminatingWaitStatus = "Terminating:Wait"
 
 	// LifecycleActionResultContinue describes ASG instance lifecycle continue result
 	LifecycleActionResultContinue = "CONTINUE"
@@ -99,3 +102,11 @@ func (a *AutoScaling) IsTerminating(status *string) bool {
 	}
 	return *status == InstanceTerminatingStatus
 }
+
+// IsTerminatingWait returns true if the provided status is in terminating:wait state
+func (a *AutoScaling) IsTerminatingWait(status *string) bool {
+	if status == nil {
+		return false
+	}
+	return *status == InstanceTerminatingWaitStatus
+}
diff --git a/pkg/trigger/aws/handler.go b/pkg/trigger/aws/handler.go
index 495b17c..1a1ada5 100644
--- a/pkg/trigger/aws/handler.go
+++ b/pkg/trigger/aws/handler.go
@@ -17,6 +17,7 @@ type HookHandler struct {
 
 // Loop starts an infinite handler loop
 func (h *HookHandler) Loop(nodeName string) {
+	var drained bool
 	glog.Infof("Running node drainer on node '%s' on instance '%s' in region '%s' and profile '%s'",
 		nodeName, h.AutoScaling.Options.InstanceID, h.AutoScaling.Options.Region, h.AutoScaling.Options.Profile)
 	for {
@@ -29,17 +30,25 @@ func (h *HookHandler) Loop(nodeName string) {
 			continue
 		}
 		glog.Infof("Status of instance '%v' is '%v', autoscaling group is '%v'", h.AutoScaling.Options.InstanceID, *status, *autoScalingGroupName)
-		if !h.AutoScaling.IsTerminating(status) {
+		if !h.AutoScaling.IsTerminating(status) && !h.AutoScaling.IsTerminatingWait(status) {
 			continue
 		}
 
-		err = h.Drainer.Drain(nodeName)
-		if err != nil {
-			glog.Warningf("Not all pods on this host can be evicted, will try again: %s", err)
+		if !drained {
+			err = h.Drainer.Drain(nodeName)
+			if err != nil {
+				glog.Warningf("Not all pods on this host can be evicted, will try again: %s", err)
+				continue
+			}
+			drained = true
+			glog.Info("All evictable pods are gone, waiting to enter Terminating:Wait state")
+		}
+
+		if !h.AutoScaling.IsTerminatingWait(status) {
 			continue
 		}
-		glog.Infof("All evictable pods are gone, notifying AutoScalingGroup that instance '%v' can be shutdown", h.AutoScaling.Options.InstanceID)
 
+		glog.Infof("Notifying AutoScalingGroup that instance '%v' can be shutdown", h.AutoScaling.Options.InstanceID)
 		lifecycleHookName, err := h.AutoScaling.GetLifecycleHookName(autoScalingGroupName)
 		if err != nil {
 			glog.Warningf("Can not get lifecycle hook, will try again: %s", err)