From 444811eda11ccaa559ece40bc4aae73b8060bdb4 Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Wed, 30 Oct 2024 11:24:55 +0000 Subject: [PATCH 01/10] add message on queue age alarm --- RELEASE.md | 15 +++++++++++++++ queue/alarms.tf | 34 ++++++++++++++++++++++++++++++++-- queue/variables.tf | 26 ++++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 2 deletions(-) create mode 100644 RELEASE.md diff --git a/RELEASE.md b/RELEASE.md new file mode 100644 index 0000000..3df04b4 --- /dev/null +++ b/RELEASE.md @@ -0,0 +1,15 @@ +RELEASE_TYPE: minor + +This change adds a main queue alarm that triggers when the main queue has messages older than a certain age. + +This is useful for monitoring the health of the queue and ensuring that messages are being processed in a timely manner. + +Adds the variables: + +- `dlq_alarm_topic_arn` - The ARN of the SNS topic to send DLQ alarm notifications to +- `main_q_age_alarm_topic_arn` - The ARN of the SNS topic to send main queue age alarm notifications to +- `max_age_in_hours` - The maximum age of a message in the main queue before the alarm triggers +- `queue_age_alarm_name_suffix` - The suffix to append to the age alarm name, used to allow EventBridge to filter on the alarm name +- `dlq_not_empty_alarm_name_suffix` - The suffix to append to the dlq not empty alarm name, used to allow EventBridge to filter on the alarm name + +We deprecate the `alarm_topic_arn` variable in favour of the new `dlq_alarm_topic_arn` and `main_q_age_alarm_topic_arn` variables. \ No newline at end of file diff --git a/queue/alarms.tf b/queue/alarms.tf index 4845538..7b1e51b 100644 --- a/queue/alarms.tf +++ b/queue/alarms.tf @@ -1,5 +1,17 @@ +locals { + max_age_in_seconds = var.max_age_in_hours * 3600 + + # Allows for deprecation of alarm_topic_arn in favor of dlq_alarm_topic_arn + dlq_alarm_topic_arn = var.dlq_alarm_topic_arn != null ? var.dlq_alarm_topic_arn : var.alarm_topic_arn + + # Name suffix allows for EventBridge rules to pick up alarms using wildcard + queue_age_alarm_name_suffix = var.queue_age_alarm_name_suffix != null ? "_${var.queue_age_alarm_name_suffix}" : "" + dlq_not_empty_alarm_name_suffix = var.dlq_not_empty_alarm_name_suffix != null ? "_${var.dlq_not_empty_alarm_name_suffix}" : "" +} + + resource "aws_cloudwatch_metric_alarm" "dlq_not_empty" { - count = var.alarm_topic_arn != null ? 1 : 0 + count = local.dlq_alarm_topic_arn != null ? 1 : 0 alarm_name = "${aws_sqs_queue.dlq.name}_not_empty" comparison_operator = "GreaterThanThreshold" @@ -14,6 +26,24 @@ resource "aws_cloudwatch_metric_alarm" "dlq_not_empty" { QueueName = aws_sqs_queue.dlq.name } - alarm_actions = [var.alarm_topic_arn] + alarm_actions = [local.dlq_alarm_topic_arn] } +resource "aws_cloudwatch_metric_alarm" "queue_age" { + count = var.main_q_age_alarm_topic_arn != null ? 1 : 0 + + alarm_name = "${aws_sqs_queue.q.name}_age${local.queue_age_alarm_name_suffix}" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = 1 + metric_name = "ApproximateAgeOfOldestMessage" + namespace = "AWS/SQS" + period = 60 + threshold = local.max_age_in_seconds + statistic = "Maximum" + + dimensions = { + QueueName = aws_sqs_queue.q.name + } + + alarm_actions = [var.main_q_age_alarm_topic_arn] +} \ No newline at end of file diff --git a/queue/variables.tf b/queue/variables.tf index 7e0150b..3d97006 100644 --- a/queue/variables.tf +++ b/queue/variables.tf @@ -38,10 +38,36 @@ variable "max_receive_count" { } variable "alarm_topic_arn" { + description = "DEPRECATED, use dlq_alarm_topic_arn: ARN of the topic where to send notification for DLQs not being empty. If null, no alarm will be created." + default = null +} + +variable "dlq_alarm_topic_arn" { description = "ARN of the topic where to send notification for DLQs not being empty. If null, no alarm will be created." default = null } +variable "main_q_age_alarm_topic_arn" { + description = "ARN of the topic where to send notification for messages exceeding max_age_in_hours If null, no alarm will be created." + default = null +} + +variable "max_age_in_hours" { + description = "The maximum age of a message in hours" + type = number + default = 6 +} + +variable "queue_age_alarm_name_suffix" { + description = "Suffix to append to the queue name for the age alarm" + default = null +} + +variable "dlq_not_empty_alarm_name_suffix" { + description = "Suffix to append to the DLQ name for the not empty alarm" + default = null +} + variable "fifo_queue" { description = "Boolean designating a FIFO queue" default = false From de4f07eff12c54592414ed3dbcae635e4ed53565 Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Wed, 30 Oct 2024 12:02:39 +0000 Subject: [PATCH 02/10] add enable switch instead of relying on resource arns --- RELEASE.md | 8 +++++--- queue/alarms.tf | 16 +++++++++------- queue/variables.tf | 22 ++++++++++++++++------ 3 files changed, 30 insertions(+), 16 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index 3df04b4..df0343c 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -6,10 +6,12 @@ This is useful for monitoring the health of the queue and ensuring that messages Adds the variables: -- `dlq_alarm_topic_arn` - The ARN of the SNS topic to send DLQ alarm notifications to -- `main_q_age_alarm_topic_arn` - The ARN of the SNS topic to send main queue age alarm notifications to +- `dlq_alarm_action_arns` - The ARNs of the resources to send DLQ alarm notifications to +- `main_q_age_alarm_action_arns` - The ARN of the resources to send main queue age alarm notifications to - `max_age_in_hours` - The maximum age of a message in the main queue before the alarm triggers - `queue_age_alarm_name_suffix` - The suffix to append to the age alarm name, used to allow EventBridge to filter on the alarm name - `dlq_not_empty_alarm_name_suffix` - The suffix to append to the dlq not empty alarm name, used to allow EventBridge to filter on the alarm name +- `enable_dlq_not_empty_alarm` - Whether to enable the DLQ not empty alarm (default: `false`), overridden if `dlq_alarm_action_arns` is not empty +- `enable_main_q_age_alarm` - Whether to enable the main queue age alarm (default: `false`), overridden if `main_q_age_alarm_action_arns` is not empty -We deprecate the `alarm_topic_arn` variable in favour of the new `dlq_alarm_topic_arn` and `main_q_age_alarm_topic_arn` variables. \ No newline at end of file +We deprecate the `alarm_topic_arn` variable in favour of the new `dlq_alarm_action_arns` and `main_q_age_alarm_action_arns` variables. diff --git a/queue/alarms.tf b/queue/alarms.tf index 7b1e51b..5e810f0 100644 --- a/queue/alarms.tf +++ b/queue/alarms.tf @@ -2,16 +2,18 @@ locals { max_age_in_seconds = var.max_age_in_hours * 3600 # Allows for deprecation of alarm_topic_arn in favor of dlq_alarm_topic_arn - dlq_alarm_topic_arn = var.dlq_alarm_topic_arn != null ? var.dlq_alarm_topic_arn : var.alarm_topic_arn + dlq_alarm_action_arns = var.dlq_alarm_action_arns != [] ? var.dlq_alarm_action_arns : [var.alarm_topic_arn] # Name suffix allows for EventBridge rules to pick up alarms using wildcard - queue_age_alarm_name_suffix = var.queue_age_alarm_name_suffix != null ? "_${var.queue_age_alarm_name_suffix}" : "" + queue_age_alarm_name_suffix = var.queue_age_alarm_name_suffix != null ? "_${var.queue_age_alarm_name_suffix}" : "" dlq_not_empty_alarm_name_suffix = var.dlq_not_empty_alarm_name_suffix != null ? "_${var.dlq_not_empty_alarm_name_suffix}" : "" -} + enable_dlq_not_empty_alarm = var.enable_dlq_not_empty_alarm || local.dlq_alarm_action_arns != [] + enable_queue_age_alarm = var.enable_queue_age_alarm && var.main_q_age_alarm_action_arns != [] +} resource "aws_cloudwatch_metric_alarm" "dlq_not_empty" { - count = local.dlq_alarm_topic_arn != null ? 1 : 0 + count = local.enable_dlq_not_empty_alarm ? 1 : 0 alarm_name = "${aws_sqs_queue.dlq.name}_not_empty" comparison_operator = "GreaterThanThreshold" @@ -26,11 +28,11 @@ resource "aws_cloudwatch_metric_alarm" "dlq_not_empty" { QueueName = aws_sqs_queue.dlq.name } - alarm_actions = [local.dlq_alarm_topic_arn] + alarm_actions = local.dlq_alarm_action_arns } resource "aws_cloudwatch_metric_alarm" "queue_age" { - count = var.main_q_age_alarm_topic_arn != null ? 1 : 0 + count = local.enable_queue_age_alarm ? 1 : 0 alarm_name = "${aws_sqs_queue.q.name}_age${local.queue_age_alarm_name_suffix}" comparison_operator = "GreaterThanThreshold" @@ -45,5 +47,5 @@ resource "aws_cloudwatch_metric_alarm" "queue_age" { QueueName = aws_sqs_queue.q.name } - alarm_actions = [var.main_q_age_alarm_topic_arn] + alarm_actions = var.main_q_age_alarm_action_arns } \ No newline at end of file diff --git a/queue/variables.tf b/queue/variables.tf index 3d97006..b9fbdb1 100644 --- a/queue/variables.tf +++ b/queue/variables.tf @@ -42,14 +42,24 @@ variable "alarm_topic_arn" { default = null } -variable "dlq_alarm_topic_arn" { - description = "ARN of the topic where to send notification for DLQs not being empty. If null, no alarm will be created." - default = null +variable "enable_dlq_not_empty_alarm" { + description = "DEPRECATED, use dlq_alarm_topic_arn: Enable alarm for DLQ not being empty" + default = false } -variable "main_q_age_alarm_topic_arn" { - description = "ARN of the topic where to send notification for messages exceeding max_age_in_hours If null, no alarm will be created." - default = null +variable "enable_queue_age_alarm" { + description = "Enable alarm for messages exceeding max_age_in_hours" + default = false +} + +variable "dlq_alarm_action_arns" { + description = "ARNs for the topics where to send notification for DLQs not being empty. If null, no alarm will be created." + default = [] +} + +variable "main_q_age_alarm_action_arns" { + description = "ARN for the topics where to send notification for messages exceeding max_age_in_hours If null, no alarm will be created." + default = [] } variable "max_age_in_hours" { From 2a755f80f499f4c0d87c7e484d6f2902b00a2fa9 Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Wed, 30 Oct 2024 15:32:38 +0000 Subject: [PATCH 03/10] account for potential null --- queue/alarms.tf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/queue/alarms.tf b/queue/alarms.tf index 5e810f0..ee91f17 100644 --- a/queue/alarms.tf +++ b/queue/alarms.tf @@ -2,7 +2,8 @@ locals { max_age_in_seconds = var.max_age_in_hours * 3600 # Allows for deprecation of alarm_topic_arn in favor of dlq_alarm_topic_arn - dlq_alarm_action_arns = var.dlq_alarm_action_arns != [] ? var.dlq_alarm_action_arns : [var.alarm_topic_arn] + alarm_topic_arn_safe = var.alarm_topic_arn != null ? [var.alarm_topic_arn] : [] + dlq_alarm_action_arns = var.dlq_alarm_action_arns != [] ? var.dlq_alarm_action_arns : local.alarm_topic_arn_safe # Name suffix allows for EventBridge rules to pick up alarms using wildcard queue_age_alarm_name_suffix = var.queue_age_alarm_name_suffix != null ? "_${var.queue_age_alarm_name_suffix}" : "" From 71ce3348850f069bfa89457c28d1bc79caddc1b0 Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Wed, 30 Oct 2024 16:25:51 +0000 Subject: [PATCH 04/10] bad logic --- queue/alarms.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/queue/alarms.tf b/queue/alarms.tf index ee91f17..37298ed 100644 --- a/queue/alarms.tf +++ b/queue/alarms.tf @@ -10,7 +10,7 @@ locals { dlq_not_empty_alarm_name_suffix = var.dlq_not_empty_alarm_name_suffix != null ? "_${var.dlq_not_empty_alarm_name_suffix}" : "" enable_dlq_not_empty_alarm = var.enable_dlq_not_empty_alarm || local.dlq_alarm_action_arns != [] - enable_queue_age_alarm = var.enable_queue_age_alarm && var.main_q_age_alarm_action_arns != [] + enable_queue_age_alarm = var.enable_queue_age_alarm || var.main_q_age_alarm_action_arns != [] } resource "aws_cloudwatch_metric_alarm" "dlq_not_empty" { From bf85c7ebd79ff44431eb6c773c89cd88eb22dedd Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Wed, 30 Oct 2024 16:36:29 +0000 Subject: [PATCH 05/10] worse logic --- queue/alarms.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/queue/alarms.tf b/queue/alarms.tf index 37298ed..dc5632f 100644 --- a/queue/alarms.tf +++ b/queue/alarms.tf @@ -14,7 +14,7 @@ locals { } resource "aws_cloudwatch_metric_alarm" "dlq_not_empty" { - count = local.enable_dlq_not_empty_alarm ? 1 : 0 + count = local.enable_dlq_not_empty_alarm == true ? 1 : 0 alarm_name = "${aws_sqs_queue.dlq.name}_not_empty" comparison_operator = "GreaterThanThreshold" @@ -33,7 +33,7 @@ resource "aws_cloudwatch_metric_alarm" "dlq_not_empty" { } resource "aws_cloudwatch_metric_alarm" "queue_age" { - count = local.enable_queue_age_alarm ? 1 : 0 + count = local.enable_queue_age_alarm == true ? 1 : 0 alarm_name = "${aws_sqs_queue.q.name}_age${local.queue_age_alarm_name_suffix}" comparison_operator = "GreaterThanThreshold" From 900eff3be5cf45e4d39ea8a43578c5089c4fe6ac Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Wed, 30 Oct 2024 16:41:25 +0000 Subject: [PATCH 06/10] better alarm descriptions --- queue/alarms.tf | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/queue/alarms.tf b/queue/alarms.tf index dc5632f..58da8be 100644 --- a/queue/alarms.tf +++ b/queue/alarms.tf @@ -2,7 +2,7 @@ locals { max_age_in_seconds = var.max_age_in_hours * 3600 # Allows for deprecation of alarm_topic_arn in favor of dlq_alarm_topic_arn - alarm_topic_arn_safe = var.alarm_topic_arn != null ? [var.alarm_topic_arn] : [] + alarm_topic_arn_safe = var.alarm_topic_arn != null ? [var.alarm_topic_arn] : [] dlq_alarm_action_arns = var.dlq_alarm_action_arns != [] ? var.dlq_alarm_action_arns : local.alarm_topic_arn_safe # Name suffix allows for EventBridge rules to pick up alarms using wildcard @@ -25,6 +25,8 @@ resource "aws_cloudwatch_metric_alarm" "dlq_not_empty" { threshold = 0 statistic = "Average" + alarm_description = "Alarm if the DLQ is not empty" + dimensions = { QueueName = aws_sqs_queue.dlq.name } @@ -44,6 +46,8 @@ resource "aws_cloudwatch_metric_alarm" "queue_age" { threshold = local.max_age_in_seconds statistic = "Maximum" + alarm_description = "Alarm if the age of the oldest message in the queue exceeds ${var.max_age_in_hours} hours" + dimensions = { QueueName = aws_sqs_queue.q.name } From aa8c1eeaf36f045643954aad56c0462067d66824 Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Wed, 30 Oct 2024 16:55:23 +0000 Subject: [PATCH 07/10] reduce alarm name size --- queue/alarms.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/queue/alarms.tf b/queue/alarms.tf index 58da8be..41c0ea9 100644 --- a/queue/alarms.tf +++ b/queue/alarms.tf @@ -46,7 +46,7 @@ resource "aws_cloudwatch_metric_alarm" "queue_age" { threshold = local.max_age_in_seconds statistic = "Maximum" - alarm_description = "Alarm if the age of the oldest message in the queue exceeds ${var.max_age_in_hours} hours" + alarm_description = "Message age exceeds ${var.max_age_in_hours} hours" dimensions = { QueueName = aws_sqs_queue.q.name From da06558c941fea4b03b50f77da24eb469d136935 Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Thu, 31 Oct 2024 09:33:21 +0000 Subject: [PATCH 08/10] remove suffix --- queue/alarms.tf | 6 +----- queue/variables.tf | 10 ---------- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/queue/alarms.tf b/queue/alarms.tf index 41c0ea9..135b037 100644 --- a/queue/alarms.tf +++ b/queue/alarms.tf @@ -5,10 +5,6 @@ locals { alarm_topic_arn_safe = var.alarm_topic_arn != null ? [var.alarm_topic_arn] : [] dlq_alarm_action_arns = var.dlq_alarm_action_arns != [] ? var.dlq_alarm_action_arns : local.alarm_topic_arn_safe - # Name suffix allows for EventBridge rules to pick up alarms using wildcard - queue_age_alarm_name_suffix = var.queue_age_alarm_name_suffix != null ? "_${var.queue_age_alarm_name_suffix}" : "" - dlq_not_empty_alarm_name_suffix = var.dlq_not_empty_alarm_name_suffix != null ? "_${var.dlq_not_empty_alarm_name_suffix}" : "" - enable_dlq_not_empty_alarm = var.enable_dlq_not_empty_alarm || local.dlq_alarm_action_arns != [] enable_queue_age_alarm = var.enable_queue_age_alarm || var.main_q_age_alarm_action_arns != [] } @@ -37,7 +33,7 @@ resource "aws_cloudwatch_metric_alarm" "dlq_not_empty" { resource "aws_cloudwatch_metric_alarm" "queue_age" { count = local.enable_queue_age_alarm == true ? 1 : 0 - alarm_name = "${aws_sqs_queue.q.name}_age${local.queue_age_alarm_name_suffix}" + alarm_name = "${aws_sqs_queue.q.name}_age" comparison_operator = "GreaterThanThreshold" evaluation_periods = 1 metric_name = "ApproximateAgeOfOldestMessage" diff --git a/queue/variables.tf b/queue/variables.tf index b9fbdb1..e462234 100644 --- a/queue/variables.tf +++ b/queue/variables.tf @@ -68,16 +68,6 @@ variable "max_age_in_hours" { default = 6 } -variable "queue_age_alarm_name_suffix" { - description = "Suffix to append to the queue name for the age alarm" - default = null -} - -variable "dlq_not_empty_alarm_name_suffix" { - description = "Suffix to append to the DLQ name for the not empty alarm" - default = null -} - variable "fifo_queue" { description = "Boolean designating a FIFO queue" default = false From 89014cfb9a305d9fad2fbff02a7923c384f508a8 Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Thu, 31 Oct 2024 09:33:50 +0000 Subject: [PATCH 09/10] remove params --- RELEASE.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index df0343c..2ad6466 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -9,8 +9,6 @@ Adds the variables: - `dlq_alarm_action_arns` - The ARNs of the resources to send DLQ alarm notifications to - `main_q_age_alarm_action_arns` - The ARN of the resources to send main queue age alarm notifications to - `max_age_in_hours` - The maximum age of a message in the main queue before the alarm triggers -- `queue_age_alarm_name_suffix` - The suffix to append to the age alarm name, used to allow EventBridge to filter on the alarm name -- `dlq_not_empty_alarm_name_suffix` - The suffix to append to the dlq not empty alarm name, used to allow EventBridge to filter on the alarm name - `enable_dlq_not_empty_alarm` - Whether to enable the DLQ not empty alarm (default: `false`), overridden if `dlq_alarm_action_arns` is not empty - `enable_main_q_age_alarm` - Whether to enable the main queue age alarm (default: `false`), overridden if `main_q_age_alarm_action_arns` is not empty From 0d206735454c411a474e3a0ed35b8fdb5a323a88 Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Fri, 1 Nov 2024 11:29:42 +0000 Subject: [PATCH 10/10] fix alarm descriptions --- queue/variables.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/queue/variables.tf b/queue/variables.tf index e462234..73cbf0b 100644 --- a/queue/variables.tf +++ b/queue/variables.tf @@ -43,7 +43,7 @@ variable "alarm_topic_arn" { } variable "enable_dlq_not_empty_alarm" { - description = "DEPRECATED, use dlq_alarm_topic_arn: Enable alarm for DLQ not being empty" + description = "Enable alarm for DLQs not being empty" default = false } @@ -53,12 +53,12 @@ variable "enable_queue_age_alarm" { } variable "dlq_alarm_action_arns" { - description = "ARNs for the topics where to send notification for DLQs not being empty. If null, no alarm will be created." + description = "ARNs for the topics where to send notification for DLQs not being empty, if not empty overrides alarm_topic_arn." default = [] } variable "main_q_age_alarm_action_arns" { - description = "ARN for the topics where to send notification for messages exceeding max_age_in_hours If null, no alarm will be created." + description = "ARN for the topics where to send notification for messages exceeding max_age_in_hours, if not empty overrides enable_queue_age_alarm." default = [] }