This repository has been archived by the owner on Jun 21, 2023. It is now read-only.

Begin draining a node when it enters Terminating state #5

Merged 1 commit on Aug 13, 2020
pkg/trigger/aws/autoscaling/asg.go: 12 additions & 1 deletion
@@ -12,7 +12,10 @@ import (
 
 const (
 	// InstanceTerminatingStatus describes EC2 instance termination status
-	InstanceTerminatingStatus = "Terminating:Wait"
+	InstanceTerminatingStatus = "Terminating"
+
+	// InstanceTerminatingWaitStatus describes EC2 instance termination:wait status
+	InstanceTerminatingWaitStatus = "Terminating:Wait"
 
 	// LifecycleActionResultContinue describes ASG instance lifecycle continue result
 	LifecycleActionResultContinue = "CONTINUE"
@@ -99,3 +102,11 @@ func (a *AutoScaling) IsTerminating(status *string) bool {
 	}
 	return *status == InstanceTerminatingStatus
 }
+
+// IsTerminatingWait returns true if the provided status is in terminating:wait state
+func (a *AutoScaling) IsTerminatingWait(status *string) bool {
+	if status == nil {
+		return false
+	}
+	return *status == InstanceTerminatingWaitStatus
+}
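Side note: the status string these helpers compare against is the instance's Auto Scaling lifecycle state. Below is a minimal sketch of how such a value can be fetched with aws-sdk-go v1; the instance ID and session wiring are illustrative assumptions, not this repository's actual lookup code.

// Sketch only (assumes aws-sdk-go v1); the drainer's real lookup may differ.
package main

import (
	"fmt"
	"log"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/aws/session"
	"github.com/aws/aws-sdk-go/service/autoscaling"
)

func main() {
	sess := session.Must(session.NewSession())
	svc := autoscaling.New(sess)

	// Hypothetical instance ID; the drainer would read its own ID from instance metadata.
	out, err := svc.DescribeAutoScalingInstances(&autoscaling.DescribeAutoScalingInstancesInput{
		InstanceIds: []*string{aws.String("i-0123456789abcdef0")},
	})
	if err != nil {
		log.Fatal(err)
	}
	for _, inst := range out.AutoScalingInstances {
		// LifecycleState is the string matched against "Terminating" and "Terminating:Wait".
		fmt.Printf("%s: %s\n", *inst.AutoScalingGroupName, *inst.LifecycleState)
	}
}

With a state string in hand, IsTerminating and IsTerminatingWait are plain comparisons against the constants above.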
pkg/trigger/aws/handler.go: 14 additions & 5 deletions
@@ -17,6 +17,7 @@ type HookHandler struct {
 
 // Loop starts an infinite handler loop
 func (h *HookHandler) Loop(nodeName string) {
+	var drained bool
 	glog.Infof("Running node drainer on node '%s' on instance '%s' in region '%s' and profile '%s'",
 		nodeName, h.AutoScaling.Options.InstanceID, h.AutoScaling.Options.Region, h.AutoScaling.Options.Profile)
 	for {
@@ -29,17 +30,25 @@ func (h *HookHandler) Loop(nodeName string) {
 			continue
 		}
 		glog.Infof("Status of instance '%v' is '%v', autoscaling group is '%v'", h.AutoScaling.Options.InstanceID, *status, *autoScalingGroupName)
-		if !h.AutoScaling.IsTerminating(status) {
+		if !h.AutoScaling.IsTerminating(status) && !h.AutoScaling.IsTerminatingWait(status) {
Contributor:
I love the fact that you've used both to make sure it triggers if the docs and impl from AWS diverge.

 			continue
 		}
 
-		err = h.Drainer.Drain(nodeName)
-		if err != nil {
-			glog.Warningf("Not all pods on this host can be evicted, will try again: %s", err)
+		if !drained {
Contributor:
So if I understand correctly, there was a misleading message in the logs that was not always true, and this change fixes that, right?

Contributor Author:
Well, after moving the draining to happen as soon as it sees the "Terminating" state, it will probably loop several times before it hits the "Terminating:Wait" state; this is just to avoid trying to cordon and drain again if it has already done so successfully.

+			err = h.Drainer.Drain(nodeName)
+			if err != nil {
+				glog.Warningf("Not all pods on this host can be evicted, will try again: %s", err)
+				continue
+			}
+			drained = true
+			glog.Info("All evictable pods are gone, waiting to enter Terminating:Wait state")
+		}
+
+		if !h.AutoScaling.IsTerminatingWait(status) {
 			continue
 		}
-		glog.Infof("All evictable pods are gone, notifying AutoScalingGroup that instance '%v' can be shutdown", h.AutoScaling.Options.InstanceID)
+
+		glog.Infof("Notifying AutoScalingGroup that instance '%v' can be shutdown", h.AutoScaling.Options.InstanceID)
 		lifecycleHookName, err := h.AutoScaling.GetLifecycleHookName(autoScalingGroupName)
 		if err != nil {
 			glog.Warningf("Can not get lifecycle hook, will try again: %s", err)
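For completeness: once the lifecycle hook name is known, notifying the AutoScalingGroup that the instance can be shut down amounts to completing the lifecycle action with the CONTINUE result (LifecycleActionResultContinue). A hedged sketch with aws-sdk-go v1 follows, reusing the aws and autoscaling imports from the earlier sketch; completeLifecycle and its parameters are hypothetical names, not this project's exact code.

// Illustrative helper, not part of this repository.
func completeLifecycle(svc *autoscaling.AutoScaling, asgName, hookName, instanceID string) error {
	_, err := svc.CompleteLifecycleAction(&autoscaling.CompleteLifecycleActionInput{
		AutoScalingGroupName:  aws.String(asgName),
		LifecycleHookName:     aws.String(hookName),
		InstanceId:            aws.String(instanceID),
		LifecycleActionResult: aws.String("CONTINUE"), // i.e. LifecycleActionResultContinue
	})
	return err
}

Until that call succeeds (or the hook's heartbeat timeout expires), the ASG keeps the instance in Terminating:Wait, which is what gives the drainer time to evict pods.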