From fc7988ce9d7b36a426dc87d7989897f8f42f6d26 Mon Sep 17 00:00:00 2001
From: Carlos Salas
Date: Mon, 4 Mar 2024 19:14:28 +0100
Subject: [PATCH 1/3] feat: use mark and delete instead of ttl

Signed-off-by: Carlos Salas
---
 action.yml            | 11 ++++++-----
 action/cleanup.go     | 11 +++++++----
 action/cleanup_asg.go | 43 +++++++++++++++++++++++++++++++++++++++----
 action/cleanup_eks.go | 43 +++++++++++++++++++++++++++------------
 action/errors.go      |  1 -
 action/input.go       | 13 ++++---------
 6 files changed, 87 insertions(+), 35 deletions(-)

diff --git a/action.yml b/action.yml
index ef9641c..9b7c820 100644
--- a/action.yml
+++ b/action.yml
@@ -1,6 +1,6 @@
 name: 'AWS Janitor'
 author: 'Rancher Sandbox'
-description: 'Clean-up AWS resources based on a TTL.'
+description: 'Mark and clean AWS resources.'
 inputs:
   regions:
     description: 'A comma separated list of regions to clean resources in. You can use * for all regions.'
@@ -9,16 +9,17 @@ inputs:
     description: 'Set to true if you want to allow cleaning resources in all regions. If true then * must be used for regions.'
     required: false
    default: 'false'
-  ttl:
-    description: 'The duration that a resource can live for. For example, use 24h for 1 day.'
-    required: true
   commit:
     description: 'Should the action just report or do the actual delete.'
     required: false
     default: 'false'
+  ignore-tag:
+    description: 'The name of the tag that indicates a resource should not be deleted. Defaults to `janitor-ignore`'
+    required: false
+    default: 'janitor-ignore'
 runs:
   using: 'docker'
   image: 'docker://ghcr.io/rancher-sandbox/aws-janitor:v0.1.0'
 branding:
   icon: 'delete'
-  color: 'blue'
\ No newline at end of file
+  color: 'blue'
diff --git a/action/cleanup.go b/action/cleanup.go
index 4ec113e..83cbceb 100644
--- a/action/cleanup.go
+++ b/action/cleanup.go
@@ -2,15 +2,18 @@ package action
 
 import (
 	"context"
-	"time"
 
 	"github.com/aws/aws-sdk-go/aws/session"
 )
 
+const (
+	DeletionTag = "aws-janitor/marked-for-deletion"
+)
+
 type CleanupScope struct {
-	Session *session.Session
-	TTL     time.Duration
-	Commit  bool
+	Session   *session.Session
+	Commit    bool
+	IgnoreTag string
 }
 
 type CleanupFunc func(ctx context.Context, input *CleanupScope) error
diff --git a/action/cleanup_asg.go b/action/cleanup_asg.go
index 91088a3..b0194bf 100644
--- a/action/cleanup_asg.go
+++ b/action/cleanup_asg.go
@@ -3,7 +3,6 @@ package action
 import (
 	"context"
 	"fmt"
-	"time"
 
 	"github.com/aws/aws-sdk-go/aws"
 	"github.com/aws/aws-sdk-go/service/autoscaling"
@@ -15,12 +14,32 @@ func (a *action) cleanASGs(ctx context.Context, input *CleanupScope) error {
 	asgToDelete := []*autoscaling.Group{}
 	pageFunc := func(page *autoscaling.DescribeAutoScalingGroupsOutput, _ bool) bool {
 		for _, asg := range page.AutoScalingGroups {
-			maxAge := asg.CreatedTime.Add(input.TTL)
+			var ignore, markedForDeletion bool
+			for _, tag := range asg.Tags {
+				if *tag.Key == input.IgnoreTag {
+					ignore = true
+				} else if *tag.Key == DeletionTag {
+					markedForDeletion = true
+				}
+			}
 
-			if time.Now().Before(maxAge) {
-				LogDebug("asg %s has max age greater than now, skipping cleanup", *asg.AutoScalingGroupName)
+			if ignore {
+				LogDebug("asg %s has ignore tag, skipping cleanup", *asg.AutoScalingGroupName)
 				continue
 			}
+
+			if !markedForDeletion {
+				// NOTE: only mark for future deletion if we're not running in dry-mode
+				if a.commit {
+					LogDebug("asg %s does not have deletion tag, marking for future deletion and skipping cleanup", *asg.AutoScalingGroupName)
+					if err := a.markAsgForFutureDeletion(ctx, *asg.AutoScalingGroupName, client); err != nil {
+						LogError("failed to mark asg %s for future deletion: %s", *asg.AutoScalingGroupName, err.Error())
+					}
+				}
+				continue
+			}
+
+			LogDebug("adding asg %s to delete list", *asg.AutoScalingGroupName)
 			asgToDelete = append(asgToDelete, asg)
 		}
 
@@ -62,3 +81,19 @@ func (a *action) cleanASGs(ctx context.Context, input *CleanupScope) error {
 
 	return nil
 }
+
+func (a *action) markAsgForFutureDeletion(ctx context.Context, asgName string, client *autoscaling.AutoScaling) error {
+	Log("Marking ASG %s for future deletion", asgName)
+
+	_, err := client.CreateOrUpdateTagsWithContext(ctx, &autoscaling.CreateOrUpdateTagsInput{Tags: []*autoscaling.Tag{
+		{
+			Key:               aws.String(DeletionTag),
+			PropagateAtLaunch: aws.Bool(true),
+			ResourceId:        aws.String(asgName),
+			ResourceType:      aws.String("auto-scaling-group"),
+			Value:             aws.String("true"),
+		},
+	}})
+
+	return err
+}
diff --git a/action/cleanup_eks.go b/action/cleanup_eks.go
index a3af57f..6d79636 100644
--- a/action/cleanup_eks.go
+++ b/action/cleanup_eks.go
@@ -3,15 +3,15 @@ package action
 import (
 	"context"
 	"fmt"
-	"time"
 
+	"github.com/aws/aws-sdk-go/aws"
 	"github.com/aws/aws-sdk-go/service/eks"
 )
 
 func (a *action) cleanEKSClusters(ctx context.Context, input *CleanupScope) error {
 	client := eks.New(input.Session)
 
-	clustersToDelete := []*string{}
+	clustersToDelete := []*eks.Cluster{}
 	pageFunc := func(page *eks.ListClustersOutput, _ bool) bool {
 		for _, name := range page.Clusters {
 			cluster, err := client.DescribeClusterWithContext(ctx, &eks.DescribeClusterInput{
@@ -22,13 +22,24 @@ func (a *action) cleanEKSClusters(ctx context.Context, input *CleanupScope) erro
 				continue
 			}
 
-			maxAge := cluster.Cluster.CreatedAt.Add(input.TTL)
+			if _, ok := cluster.Cluster.Tags[input.IgnoreTag]; ok {
+				LogDebug("eks cluster %s has ignore tag, skipping cleanup", *name)
+				continue
+			}
 
-			if time.Now().Before(maxAge) {
-				LogDebug("eks cluster %s has max age greater than now, skipping cleanup", *name)
+			if _, ok := cluster.Cluster.Tags[DeletionTag]; !ok {
+				// NOTE: only mark for future deletion if we're not running in dry-mode
+				if a.commit {
+					LogDebug("eks cluster %s does not have deletion tag, marking for future deletion and skipping cleanup", *name)
+					if err := a.markEKSClusterForFutureDeletion(ctx, *cluster.Cluster.Arn, client); err != nil {
+						LogError("failed to mark cluster %s for future deletion: %s", *cluster.Cluster.Arn, err.Error())
+					}
+				}
 				continue
 			}
-			clustersToDelete = append(clustersToDelete, name)
+
+			LogDebug("adding eks cluster %s to delete list", *name)
+			clustersToDelete = append(clustersToDelete, cluster.Cluster)
 		}
 
 		return true
@@ -43,26 +54,34 @@ func (a *action) cleanEKSClusters(ctx context.Context, input *CleanupScope) erro
 		return nil
 	}
 
-	for _, clusterName := range clustersToDelete {
+	for _, clusterObj := range clustersToDelete {
 		if !a.commit {
-			LogDebug("skipping deletion of eks cluster %s as running in dry-mode", *clusterName)
+			LogDebug("skipping deletion of eks cluster %s as running in dry-mode", *clusterObj.Name)
 			continue
 		}
 
-		if err := a.deleteEKSCluster(ctx, *clusterName, client); err != nil {
-			LogError("failed to delete cluster %s: %s", *clusterName, err.Error())
+		if err := a.deleteEKSCluster(ctx, *clusterObj.Name, client); err != nil {
+			LogError("failed to delete cluster %s: %s", *clusterObj.Name, err.Error())
 		}
 	}
 
 	return nil
 }
 
+func (a *action) markEKSClusterForFutureDeletion(ctx context.Context, clusterArn string, client *eks.EKS) error {
+	Log("Marking EKS cluster %s for future deletion", clusterArn)
+
+	_, err := client.TagResourceWithContext(ctx,
+		&eks.TagResourceInput{ResourceArn: &clusterArn, Tags: map[string]*string{DeletionTag: aws.String("true")}})
+
+	return err
+}
+
 func (a *action) deleteEKSCluster(ctx context.Context, clusterName string, client *eks.EKS) error {
 	Log("Deleting EKS cluster %s", clusterName)
 
 	LogDebug("Deleting nodegroups for cluster %s", clusterName)
-	listErr := client.ListNodegroupsPagesWithContext(ctx, &eks.ListNodegroupsInput{ClusterName: &clusterName}, func(page *eks.ListNodegroupsOutput, b bool) bool {
+	listErr := client.ListNodegroupsPagesWithContext(ctx, &eks.ListNodegroupsInput{ClusterName: &clusterName}, func(page *eks.ListNodegroupsOutput, _ bool) bool {
 		for _, ngName := range page.Nodegroups {
 			Log("Deleting nodegroup %s in cluster %s", *ngName, clusterName)
 			if _, err := client.DeleteNodegroupWithContext(ctx, &eks.DeleteNodegroupInput{ClusterName: &clusterName, NodegroupName: ngName}); err != nil {
@@ -84,7 +103,7 @@ func (a *action) deleteEKSCluster(ctx context.Context, clusterName string, clien
 		return fmt.Errorf("failed to delete cluster %s: %w", clusterName, err)
 	}
 
-	if err := client.WaitUntilClusterDeletedWithContext(ctx, &eks.DescribeClusterInput{}); err != nil {
+	if err := client.WaitUntilClusterDeletedWithContext(ctx, &eks.DescribeClusterInput{Name: &clusterName}); err != nil {
 		return fmt.Errorf("failed to wait for cluster %s to be delete: %w", clusterName, err)
 	}
 
diff --git a/action/errors.go b/action/errors.go
index d637cd6..07be689 100644
--- a/action/errors.go
+++ b/action/errors.go
@@ -5,5 +5,4 @@ import "errors"
 var (
 	ErrAllRegionsNotAllowed = errors.New("all regions is not allowed")
 	ErrRegionsRequired      = errors.New("regions is required")
-	ErrTTLRequired          = errors.New("ttl is required")
 )
diff --git a/action/input.go b/action/input.go
index e45a3fb..8a33cc4 100644
--- a/action/input.go
+++ b/action/input.go
@@ -2,17 +2,16 @@ package action
 
 import (
 	"fmt"
-	"time"
 
 	"github.com/caarlos0/env/v9"
 	"go.uber.org/multierr"
 )
 
 type Input struct {
-	Regions        string        `env:"INPUT_REGIONS"`
-	AllowAllRegion bool          `env:"INPUT_ALLOW-ALL-REGIONS"`
-	TTL            time.Duration `env:"INPUT_TTL"`
-	Commit         bool          `env:"INPUT_COMMIT"`
+	Regions        string `env:"INPUT_REGIONS"`
+	AllowAllRegion bool   `env:"INPUT_ALLOW-ALL-REGIONS"`
+	Commit         bool   `env:"INPUT_COMMIT"`
+	IgnoreTag      string `env:"INPUT_IGNORE-TAG"`
 }
 
 // NewInput creates a new input from the environment variables.
@@ -36,9 +35,5 @@ func (i *Input) Validate() error {
 		err = multierr.Append(err, ErrAllRegionsNotAllowed)
 	}
 
-	if i.TTL.Seconds() == 0 {
-		err = multierr.Append(err, ErrTTLRequired)
-	}
-
 	return err
 }

From 7de4ba62eeec8ee8a8b869a2060fa40e6d2deb62 Mon Sep 17 00:00:00 2001
From: Carlos Salas
Date: Mon, 4 Mar 2024 19:16:09 +0100
Subject: [PATCH 2/3] feat: add cleanup for other resources: cloudformation stacks, load balancers and security groups

Signed-off-by: Carlos Salas
---
 action/action.go      |  35 +++++----
 action/cleanup_cf.go  | 114 ++++++++++++++++++++++++++++++
 action/cleanup_lb.go  | 101 ++++++++++++++++++++++++++
 action/cleanup_sgs.go | 161 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 399 insertions(+), 12 deletions(-)
 create mode 100644 action/cleanup_cf.go
 create mode 100644 action/cleanup_lb.go
 create mode 100644 action/cleanup_sgs.go

diff --git a/action/action.go b/action/action.go
index 28a8677..eb1f4e1 100644
--- a/action/action.go
+++ b/action/action.go
@@ -9,7 +9,10 @@ import (
 	"github.com/aws/aws-sdk-go/aws/endpoints"
 	"github.com/aws/aws-sdk-go/aws/session"
 	"github.com/aws/aws-sdk-go/service/autoscaling"
+	"github.com/aws/aws-sdk-go/service/cloudformation"
+	"github.com/aws/aws-sdk-go/service/ec2"
 	"github.com/aws/aws-sdk-go/service/eks"
+	"github.com/aws/aws-sdk-go/service/elb"
 )
 
 type AwsJanitorAction interface {
@@ -26,17 +29,25 @@ type action struct {
 	commit bool
 }
 
+type Cleaner struct {
+	Service string
+	Run     CleanupFunc
+}
+
 func (a *action) Cleanup(ctx context.Context, input *Input) error {
-	//NOTE: ordering matters here!
-	cleanupFuncs := map[string]CleanupFunc{
-		eks.ServiceName:         a.cleanEKSClusters,
-		autoscaling.ServiceName: a.cleanASGs,
+	// use []Cleaner to keep the order
+	cleaners := []Cleaner{
+		{Service: eks.ServiceName, Run: a.cleanEKSClusters},
+		{Service: autoscaling.ServiceName, Run: a.cleanASGs},
+		{Service: elb.ServiceName, Run: a.cleanLoadBalancers},
+		{Service: ec2.ServiceName, Run: a.cleanSecurityGroups},
+		{Service: cloudformation.ServiceName, Run: a.cleanCfStacks},
 	}
 
 	inputRegions := strings.Split(input.Regions, ",")
 
-	for service, cleanupFunc := range cleanupFuncs {
-		regions := getServiceRegions(service, inputRegions)
+	for _, cleaner := range cleaners {
+		regions := getServiceRegions(cleaner.Service, inputRegions)
 
 		for _, region := range regions {
 			sess, err := session.NewSession(&aws.Config{
@@ -47,14 +58,14 @@ func (a *action) Cleanup(ctx context.Context, input *Input) error {
 			}
 
 			scope := &CleanupScope{
-				TTL:     input.TTL,
-				Session: sess,
-				Commit:  input.Commit,
+				Session:   sess,
+				Commit:    input.Commit,
+				IgnoreTag: input.IgnoreTag,
 			}
 
-			Log("Cleaning up resources for service %s in region %s", service, region)
-			if err := cleanupFunc(ctx, scope); err != nil {
-				return fmt.Errorf("failed running cleanup for service %s: %w", service, err)
+			Log("Cleaning up resources for service %s in region %s", cleaner.Service, region)
+			if err := cleaner.Run(ctx, scope); err != nil {
+				return fmt.Errorf("failed running cleanup for service %s: %w", cleaner.Service, err)
 			}
 		}
 	}
diff --git a/action/cleanup_cf.go b/action/cleanup_cf.go
new file mode 100644
index 0000000..da7fabe
--- /dev/null
+++ b/action/cleanup_cf.go
@@ -0,0 +1,114 @@
+package action
+
+import (
+	"context"
+	"fmt"
+
+	"github.com/aws/aws-sdk-go/aws"
+	cf "github.com/aws/aws-sdk-go/service/cloudformation"
+)
+
+func (a *action) cleanCfStacks(ctx context.Context, input *CleanupScope) error {
+	client := cf.New(input.Session)
+
+	stacksToDelete := []*string{}
+	pageFunc := func(page
+		*cf.DescribeStacksOutput, _ bool) bool {
+		for _, stack := range page.Stacks {
+			var ignore, markedForDeletion bool
+			for _, tag := range stack.Tags {
+				if *tag.Key == input.IgnoreTag {
+					ignore = true
+				} else if *tag.Key == DeletionTag {
+					markedForDeletion = true
+				}
+			}
+
+			if ignore {
+				LogDebug("cloudformation stack %s has ignore tag, skipping cleanup", *stack.StackName)
+				continue
+			}
+
+			if !markedForDeletion {
+				// NOTE: only mark for future deletion if we're not running in dry-mode
+				if a.commit {
+					LogDebug("cloudformation stack %s does not have deletion tag, marking for future deletion and skipping cleanup", *stack.StackName)
+					if err := a.markCfStackForFutureDeletion(ctx, stack, client); err != nil {
+						LogError("failed to mark cloudformation stack %s for future deletion: %s", *stack.StackName, err.Error())
+					}
+				}
+				continue
+			}
+
+			switch aws.StringValue(stack.StackStatus) {
+			case cf.ResourceStatusDeleteComplete,
+				cf.ResourceStatusDeleteInProgress:
+				LogDebug("cloudformation stack %s is already deleted/deleting, skipping cleanup", *stack.StackName)
+				continue
+			}
+
+			LogDebug("adding cloudformation stack %s to delete list", *stack.StackName)
+			stacksToDelete = append(stacksToDelete, stack.StackName)
+		}
+
+		return true
+	}
+
+	if err := client.DescribeStacksPagesWithContext(ctx, &cf.DescribeStacksInput{}, pageFunc); err != nil {
+		return fmt.Errorf("failed getting list of cloudformation stacks: %w", err)
+	}
+
+	if len(stacksToDelete) == 0 {
+		Log("no cloudformation stacks to delete")
+		return nil
+	}
+
+	for _, stackName := range stacksToDelete {
+		if !a.commit {
+			LogDebug("skipping deletion of cloudformation stack %s as running in dry-mode", *stackName)
+			continue
+		}
+
+		if err := a.deleteCfStack(ctx, *stackName, client); err != nil {
+			LogError("failed to delete cloudformation stack %s: %s", *stackName, err.Error())
+		}
+	}
+
+	return nil
+}
+
+func (a *action) markCfStackForFutureDeletion(ctx context.Context, stack *cf.Stack, client *cf.CloudFormation) error {
+	Log("Marking CloudFormation stack %s for future deletion", *stack.StackName)
+
+	stack.SetTags(append(stack.Tags, &cf.Tag{Key: aws.String(DeletionTag), Value: aws.String("true")}))
+
+	LogDebug("Updating tags for cloudformation stack %s", *stack.StackName)
+
+	if _, err := client.UpdateStackWithContext(ctx, &cf.UpdateStackInput{
+		Capabilities:        stack.Capabilities,
+		StackName:           stack.StackName,
+		Tags:                stack.Tags,
+		UsePreviousTemplate: aws.Bool(true),
+	}); err != nil {
+		return fmt.Errorf("failed to update cloudformation stack %s: %w", *stack.StackName, err)
+	}
+
+	if err := client.WaitUntilStackUpdateCompleteWithContext(ctx, &cf.DescribeStacksInput{StackName: stack.StackName}); err != nil {
+		return fmt.Errorf("failed to wait for cloudformation stack %s to update: %w", *stack.StackName, err)
+	}
+
+	return nil
+}
+
+func (a *action) deleteCfStack(ctx context.Context, stackName string, client *cf.CloudFormation) error {
+	Log("Deleting CloudFormation stack %s", stackName)
+
+	if _, err := client.DeleteStackWithContext(ctx, &cf.DeleteStackInput{StackName: &stackName}); err != nil {
+		return fmt.Errorf("failed to delete cloudformation stack %s: %w", stackName, err)
+	}
+
+	if err := client.WaitUntilStackDeleteCompleteWithContext(ctx, &cf.DescribeStacksInput{StackName: &stackName}); err != nil {
+		return fmt.Errorf("failed to wait for cloudformation stack %s to delete: %w", stackName, err)
+	}
+
+	return nil
+}
diff --git a/action/cleanup_lb.go b/action/cleanup_lb.go
new file mode 100644
index 0000000..5d9360c
--- /dev/null
+++ b/action/cleanup_lb.go
@@ -0,0 +1,101 @@
+package action
+
+import (
+	"context"
+	"fmt"
+
+	"github.com/aws/aws-sdk-go/aws"
+	"github.com/aws/aws-sdk-go/service/elb"
+)
+
+func (a *action) cleanLoadBalancers(ctx context.Context, input *CleanupScope) error {
+	client := elb.New(input.Session)
+
+	loadBalancersToDelete := []*string{}
+	pageFunc := func(page *elb.DescribeLoadBalancersOutput, _ bool) bool {
+		for _, lb := range page.LoadBalancerDescriptions {
+			tags, err := client.DescribeTagsWithContext(ctx, &elb.DescribeTagsInput{LoadBalancerNames: []*string{lb.LoadBalancerName}})
+			if err != nil {
+				LogError("failed getting tags for load balancer %s: %s", *lb.LoadBalancerName, err.Error())
+				continue
+			}
+			var ignore, markedForDeletion bool
+			for _, tagDescription := range tags.TagDescriptions {
+				for _, tag := range tagDescription.Tags {
+					if *tag.Key == input.IgnoreTag {
+						ignore = true
+					} else if *tag.Key == DeletionTag {
+						markedForDeletion = true
+					}
+				}
+			}
+
+			if ignore {
+				LogDebug("load balancer %s has ignore tag, skipping cleanup", *lb.LoadBalancerName)
+				continue
+			}
+
+			if !markedForDeletion {
+				// NOTE: only mark for future deletion if we're not running in dry-mode
+				if a.commit {
+					LogDebug("load balancer %s does not have deletion tag, marking for future deletion and skipping cleanup", *lb.LoadBalancerName)
+					if err := a.markLoadBalancerForFutureDeletion(ctx, *lb.LoadBalancerName, client); err != nil {
+						LogError("failed to mark load balancer %s for future deletion: %s", *lb.LoadBalancerName, err.Error())
+					}
+				}
+				continue
+			}
+
+			LogDebug("adding load balancer %s to delete list", *lb.LoadBalancerName)
+			loadBalancersToDelete = append(loadBalancersToDelete, lb.LoadBalancerName)
+		}
+
+		return true
+	}
+
+	if err := client.DescribeLoadBalancersPagesWithContext(ctx, &elb.DescribeLoadBalancersInput{}, pageFunc); err != nil {
+		return fmt.Errorf("failed getting list of load balancers: %w", err)
+	}
+
+	if len(loadBalancersToDelete) == 0 {
+		Log("no load balancers to delete")
+		return nil
+	}
+
+	for _, lbName := range loadBalancersToDelete {
+		if !a.commit {
+			LogDebug("skipping deletion of load balancer %s as running in dry-mode", *lbName)
+			continue
+		}
+
+		if err := a.deleteLoadBalancer(ctx, *lbName, client); err != nil {
+			LogError("failed to delete load balancer %s: %s", *lbName, err.Error())
+		}
+	}
+
+	return nil
+}
+func (a *action) markLoadBalancerForFutureDeletion(ctx context.Context, lbName string, client *elb.ELB) error {
+	Log("Marking Load Balancer %s for future deletion", lbName)
+
+	_, err := client.AddTagsWithContext(ctx, &elb.AddTagsInput{
+		LoadBalancerNames: []*string{&lbName},
+		Tags: []*elb.Tag{
+			{
+				Key:   aws.String(DeletionTag),
+				Value: aws.String("true")},
+		},
+	})
+
+	return err
+}
+
+func (a *action) deleteLoadBalancer(ctx context.Context, lbName string, client *elb.ELB) error {
+	Log("Deleting Load Balancer %s", lbName)
+
+	if _, err := client.DeleteLoadBalancerWithContext(ctx, &elb.DeleteLoadBalancerInput{LoadBalancerName: &lbName}); err != nil {
+		return fmt.Errorf("failed to delete load balancer %s: %w", lbName, err)
+	}
+
+	return nil
+}
diff --git a/action/cleanup_sgs.go b/action/cleanup_sgs.go
new file mode 100644
index 0000000..991298e
--- /dev/null
+++ b/action/cleanup_sgs.go
@@ -0,0 +1,161 @@
+package action
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"github.com/aws/aws-sdk-go/aws"
+	"github.com/aws/aws-sdk-go/service/ec2"
+)
+
+func (a *action) cleanSecurityGroups(ctx context.Context, input *CleanupScope) error {
+	client :=
+		ec2.New(input.Session)
+
+	sgsToDelete := []*ec2.SecurityGroup{}
+	// NOTE: we delete security groups based on whether we're later deleting the vpc they belong to or not.
+	pageFunc := func(page *ec2.DescribeVpcsOutput, _ bool) bool {
+		sgPageFunc := func(sgPage *ec2.GetSecurityGroupsForVpcOutput, _ bool) bool {
+			for _, sg := range sgPage.SecurityGroupForVpcs {
+				var ignore, markedForDeletion bool
+				for _, tag := range sg.Tags {
+					if *tag.Key == input.IgnoreTag {
+						ignore = true
+					} else if *tag.Key == DeletionTag {
+						markedForDeletion = true
+					}
+				}
+
+				if ignore || *sg.GroupName == "default" {
+					LogDebug("security group %s has ignore tag or is a default security group, skipping cleanup", *sg.GroupId)
+					continue
+				}
+
+				if !markedForDeletion {
+					// NOTE: only mark for future deletion if we're not running in dry-mode
+					if a.commit {
+						LogDebug("security group %s does not have deletion tag, marking for future deletion and skipping cleanup", *sg.GroupId)
+						if err := a.markSecurityGroupForFutureDeletion(ctx, *sg.GroupId, client); err != nil {
+							LogError("failed to mark security group %s for future deletion: %s", *sg.GroupId, err.Error())
+						}
+					}
+					continue
+				}
+
+				securityGroups, err := client.DescribeSecurityGroupsWithContext(ctx, &ec2.DescribeSecurityGroupsInput{GroupIds: []*string{sg.GroupId}})
+				if err != nil || len(securityGroups.SecurityGroups) != 1 {
+					LogError("failed to describe security group %s: %v", *sg.GroupId, err)
+					continue
+				}
+
+				LogDebug("adding security group %s to delete list", *sg.GroupId)
+				sgsToDelete = append(sgsToDelete, securityGroups.SecurityGroups[0])
+			}
+
+			return true
+		}
+
+		for _, vpc := range page.Vpcs {
+			var ignore bool
+			for _, tag := range vpc.Tags {
+				if *tag.Key == input.IgnoreTag {
+					ignore = true
+					break
+				}
+			}
+
+			if ignore || aws.BoolValue(vpc.IsDefault) {
+				LogDebug("vpc %s has ignore tag or is a default vpc, won't delete security groups associated with it", *vpc.VpcId)
+				continue
+			}
+
+			if err := client.GetSecurityGroupsForVpcPagesWithContext(ctx, &ec2.GetSecurityGroupsForVpcInput{VpcId: vpc.VpcId}, sgPageFunc); err != nil {
+				LogError("failed getting list of security groups for vpc %s: %s", *vpc.VpcId, err.Error())
+				continue
+			}
+
+		}
+
+		return true
+	}
+
+	if err := client.DescribeVpcsPagesWithContext(ctx, &ec2.DescribeVpcsInput{}, pageFunc); err != nil {
+		return fmt.Errorf("failed getting list of vpcs: %w", err)
+	}
+
+	if len(sgsToDelete) == 0 {
+		Log("no security groups to delete")
+		return nil
+	}
+
+	// NOTE: some security groups may have rules that reference other security groups.
+	// deleting a security group that's referenced in another's rules will fail,
+	// so we need to delete the rules first.
+	for _, securityGroup := range sgsToDelete {
+		if !a.commit {
+			LogDebug("skipping deletion of security group %s as running in dry-mode", *securityGroup.GroupId)
+			continue
+		}
+
+		if err := a.deleteSecurityGroupRules(ctx, *securityGroup.GroupId, securityGroup.IpPermissions, securityGroup.IpPermissionsEgress, client); err != nil {
+			LogError("failed to delete security group rules for %s: %s", *securityGroup.GroupId, err.Error())
+		}
+
+	}
+
+	for _, securityGroup := range sgsToDelete {
+		if !a.commit {
+			LogDebug("skipping deletion of security group %s as running in dry-mode", *securityGroup.GroupId)
+			continue
+		}
+
+		LogDebug("Sleeping for 10 seconds to allow AWS to catch up")
+		time.Sleep(10 * time.Second)
+
+		if err := a.deleteSecurityGroup(ctx, *securityGroup.GroupId, client); err != nil {
+			LogError("failed to delete security group %s: %s", *securityGroup.GroupId, err.Error())
+		}
+	}
+
+	return nil
+}
+
+func (a *action) markSecurityGroupForFutureDeletion(ctx context.Context, sgId string, client *ec2.EC2) error {
+	Log("Marking Security Group %s for future deletion", sgId)
+
+	_, err := client.CreateTagsWithContext(ctx, &ec2.CreateTagsInput{
+		Resources: []*string{&sgId}, Tags: []*ec2.Tag{
+			{Key: aws.String(DeletionTag), Value: aws.String("true")},
+		},
+	})
+
+	return err
+}
+
+func (a *action) deleteSecurityGroupRules(ctx context.Context, sgId string, sgIngress, sgEgress []*ec2.IpPermission, client *ec2.EC2) error {
+	Log("Deleting Ingress/Egress Rules from security group %s", sgId)
+
+	if len(sgIngress) != 0 {
+		if _, err := client.RevokeSecurityGroupIngressWithContext(ctx, &ec2.RevokeSecurityGroupIngressInput{GroupId: &sgId, IpPermissions: sgIngress}); err != nil {
+			return fmt.Errorf("failed to revoke ingress rules from security group %s: %w", sgId, err)
+		}
+	}
+
+	if len(sgEgress) != 0 {
+		if _, err := client.RevokeSecurityGroupEgressWithContext(ctx, &ec2.RevokeSecurityGroupEgressInput{GroupId: &sgId, IpPermissions: sgEgress}); err != nil {
+			return fmt.Errorf("failed to revoke egress rules from security group %s: %w", sgId, err)
+		}
+	}
+
+	return nil
+}
+
+func (a *action) deleteSecurityGroup(ctx context.Context, sgId string, client *ec2.EC2) error {
+	Log("Deleting Security Group %s", sgId)
+
+	if _, err := client.DeleteSecurityGroupWithContext(ctx, &ec2.DeleteSecurityGroupInput{GroupId: &sgId}); err != nil {
+		return fmt.Errorf("failed to delete security group %s: %w", sgId, err)
+	}
+
+	return nil
+}

From c7d8849884a4df08aa92e4e67024d27947e36001 Mon Sep 17 00:00:00 2001
From: Carlos Salas
Date: Mon, 4 Mar 2024 19:16:52 +0100
Subject: [PATCH 3/3] docs: update readme

Signed-off-by: Carlos Salas
---
 README.md | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index d614acf..fea80e9 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,14 @@
 # AWS Janitor
 
-A GitHub Action to cleanup AWS resources that have exceeded a TTL.
+A GitHub Action to clean up AWS resources.
+
+It uses a mark-and-delete approach:
+- On the first run, it describes resources and marks them for deletion.
+- On the next run, it deletes the previously marked resources.
+
+The tag `aws-janitor/marked-for-deletion` is used as the deletion marker.
+
+**Any resource that includes the tag key defined by `ignore-tag` will never be deleted.**
 
 > By default the action will not perform the delete (i.e. it will be a dry-run). You need to explicitly set commit to `true`.
@@ -8,15 +16,20 @@ It supports cleaning up the following services:
 
 - EKS Clusters
 - Auto Scaling Groups
+- Load Balancers
+- Security Groups
+- CloudFormation Stacks
+
+The services are cleaned in this strict order to avoid failures caused by inter-resource dependencies. Intermittent failures may still occur, but they should resolve on subsequent runs.
 
 ## Inputs
 
-| Name              | Required | Description                                                                             |
-| ----------------- | -------- | --------------------------------------------------------------------------------------- |
-| regions           | Y        | A comma seperated list of regions to clean resources in. You can use * for all regions |
-| allow-all-regions | N        | Set to true if use * from regions.                                                      |
-| ttl               | Y        | The duration that a resource can live for. For example, use 24h for 1 day.             |
-| commit            | N        | Whether to perform the delete. Defaults to `false` which is a dry run                  |
+| Name              | Required | Description                                                                                         |
+| ----------------- | -------- | --------------------------------------------------------------------------------------------------- |
+| regions           | Y        | A comma separated list of regions to clean resources in. You can use * for all regions             |
+| allow-all-regions | N        | Set to `true` to allow `*` to be used for regions.                                                  |
+| commit            | N        | Whether to perform the delete. Defaults to `false`, which is a dry run                             |
+| ignore-tag        | N        | The name of the tag that indicates a resource should not be deleted. Defaults to `janitor-ignore`. |
 
 ## Example Usage
 
@@ -30,7 +43,7 @@ jobs:
         uses: rancher-sandbox/aws-janitor@v0.1.0
         with:
           regions: eu-west-1
-          ttl: 168h
+          ignore-tag: janitor-ignore
         env:
           AWS_ACCESS_KEY_ID: {{secrets.AWS_ACCESS_KEY_ID}}
           AWS_SECRET_ACCESS_KEY: {{secrets.AWS_SECRET_ACCESS_KEY}}
@@ -38,4 +51,4 @@
 
 ## Implementation Notes
 
-It currently assumes that an instance of a service will have some form of creation date. This means that the implementation can be simpler as it doesn't need to adopt a "mark & sweep" pattern that requires saving state between runs of the action.
+The original implementation of the janitor avoided the mark-and-delete approach for simplicity, but a TTL-based cleanup is not viable for resources that do not expose a creation date.