From 212dc2868d7ac72d3a6ca54d0665f2f6806e6ae5 Mon Sep 17 00:00:00 2001 From: viktoryathegreat Date: Mon, 26 Jun 2023 11:49:52 +0400 Subject: [PATCH 1/2] feat(DMVP-2489): Changed alert rules expression structure. --- README.md | 10 +++++ modules/alerts/main.tf | 2 +- .../alerts/tests/mixed-metrics/1-example.tf | 10 +++++ .../alerts/tests/node-autoscaling/0-setup.tf | 15 +++++++ .../tests/node-autoscaling/1-example.tf | 36 +++++++++++++++ .../alerts/tests/node-autoscaling/2-assert.tf | 9 ++++ .../alerts/tests/node-autoscaling/README.md | 44 +++++++++++++++++++ 7 files changed, 125 insertions(+), 1 deletion(-) create mode 100644 modules/alerts/tests/node-autoscaling/0-setup.tf create mode 100644 modules/alerts/tests/node-autoscaling/1-example.tf create mode 100644 modules/alerts/tests/node-autoscaling/2-assert.tf create mode 100644 modules/alerts/tests/node-autoscaling/README.md diff --git a/README.md b/README.md index bd1b308..01b526f 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,16 @@ module "grafana_alerts" { } function = "last" condition = "$B < 1" + }, + { + name = "Insufficient nodes in cluster" + summary = "Cluster is using fewer nodes than the required count" + folder_name = "Node Autoscaling" + datasource = "prometheus" + filters = null + metric_name = "sum(kube_node_info)" + function = "mean" + condition = "$B < 2" } ] } diff --git a/modules/alerts/main.tf b/modules/alerts/main.tf index 03a0438..81929d4 100644 --- a/modules/alerts/main.tf +++ b/modules/alerts/main.tf @@ -39,7 +39,7 @@ resource "grafana_rule_group" "alert_rule" { model = < 0) ? 
format("{%s}", replace(join(", ", [for k, v in rule.value.filters : "${k}=\"${v}\""]), "\"", "\\\"")) : ""}", "hide": false, "intervalMs": "1000", "legendFormat": "__auto", diff --git a/modules/alerts/tests/mixed-metrics/1-example.tf b/modules/alerts/tests/mixed-metrics/1-example.tf index 3624c04..0cc31c8 100644 --- a/modules/alerts/tests/mixed-metrics/1-example.tf +++ b/modules/alerts/tests/mixed-metrics/1-example.tf @@ -47,6 +47,16 @@ module "this" { } function = "mean" condition = "$B < 1" + }, + { + name = "Maximum node utilization in cluster" + summary = "Cluster is using 8 available nodes" + folder_name = "Node Autoscaling" + datasource = "prometheus" + filters = null + metric_name = "sum(kube_node_info)" + function = "mean" + condition = "$B >= 8" } ] } diff --git a/modules/alerts/tests/node-autoscaling/0-setup.tf b/modules/alerts/tests/node-autoscaling/0-setup.tf new file mode 100644 index 0000000..857db77 --- /dev/null +++ b/modules/alerts/tests/node-autoscaling/0-setup.tf @@ -0,0 +1,15 @@ +terraform { + required_providers { + test = { + source = "terraform.io/builtin/test" + } + grafana = { + source = "grafana/grafana" + } + } +} + +provider "grafana" { + url = "https://grafana.example.com/" + auth = "xxxxxxxxxxx" +} diff --git a/modules/alerts/tests/node-autoscaling/1-example.tf b/modules/alerts/tests/node-autoscaling/1-example.tf new file mode 100644 index 0000000..4786ea7 --- /dev/null +++ b/modules/alerts/tests/node-autoscaling/1-example.tf @@ -0,0 +1,36 @@ +module "this" { + source = "../../" + + alert_rules = [ + { + name = "Maximum node utilization in cluster" + summary = "Cluster is using 8 available nodes" + folder_name = "Node Autoscaling" + datasource = "prometheus" + filters = null + metric_name = "sum(kube_node_info)" + function = "mean" + condition = "$B >= 8" + }, + { + name = "High node utilization in cluster" + summary = "Cluster is using 6 of the available 8 nodes" + folder_name = "Node Autoscaling" + datasource = "prometheus" + filters 
= null + metric_name = "sum(kube_node_info)" + function = "mean" + condition = "$B >= 6" + }, + { + name = "Insufficient nodes in cluster" + summary = "Cluster is using fewer nodes than the required count" + folder_name = "Node Autoscaling" + datasource = "prometheus" + filters = null + metric_name = "sum(kube_node_info)" + function = "mean" + condition = "$B < 2" + } + ] +} diff --git a/modules/alerts/tests/node-autoscaling/2-assert.tf b/modules/alerts/tests/node-autoscaling/2-assert.tf new file mode 100644 index 0000000..302130e --- /dev/null +++ b/modules/alerts/tests/node-autoscaling/2-assert.tf @@ -0,0 +1,9 @@ +resource "test_assertions" "dummy" { + component = "grafana-modules-alerts" + + equal "scheme" { + description = "As module does not have any output and data just make sure the case runs. Probably can be thrown away." + got = "all good" + want = "all good" + } +} diff --git a/modules/alerts/tests/node-autoscaling/README.md b/modules/alerts/tests/node-autoscaling/README.md new file mode 100644 index 0000000..dca473c --- /dev/null +++ b/modules/alerts/tests/node-autoscaling/README.md @@ -0,0 +1,44 @@ +# Node Autoscaling +This test case demonstrates how to configure Grafana alerts for monitoring Node count in the cluster. + +Basically it notifies you when node autoscaling reaches +- to its maximum, in our case: `$B >= 8`, +- to more count than the half of maximum: `$B >= 6`, +- to its minimum, in our case: `$B <= 2`. + +Replace the values in the conditions with your real numbers. + +## Usage +Please, note that we pass `null` value to `filters` variable. It's needed when we use such Prometheus metrics which don't get any filters when querying. + + +## Requirements + +No requirements. 
+ +## Providers + +| Name | Version | +|------|---------| +| [test](#provider\_test) | n/a | + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [this](#module\_this) | ../../ | n/a | + +## Resources + +| Name | Type | +|------|------| +| test_assertions.dummy | resource | + +## Inputs + +No inputs. + +## Outputs + +No outputs. + From ee6edde9a8657864e28f80b3f7b91219f20cebf6 Mon Sep 17 00:00:00 2001 From: viktoryathegreat Date: Mon, 26 Jun 2023 12:31:48 +0400 Subject: [PATCH 2/2] feat(DMVP-2489): Changed alert rules condition structure. --- README.md | 26 +++++++++++++++---- modules/alerts/README.md | 14 +++++++++- modules/alerts/main.tf | 9 ++++++- .../tests/autoscaling-max-usage/1-example.tf | 6 +++-- .../tests/autoscaling-max-usage/README.md | 2 +- .../available-replica-count/1-example.tf | 6 +++-- .../tests/available-replica-count/README.md | 2 +- .../tests/container-restarts/1-example.tf | 6 +++-- .../alerts/tests/container-restarts/README.md | 2 +- .../alerts/tests/mixed-metrics/1-example.tf | 15 +++++++---- .../tests/node-autoscaling/1-example.tf | 9 ++++--- modules/alerts/variables.tf | 3 ++- 12 files changed, 75 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 01b526f..91d809f 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,17 @@ At this moment we support managing More parts are coming soon. +## Tips +1. Alert conditions are formed based on $B blocks and `equation`, `threshold` parameters users pass to the module. +`equation` parameter can only get these values: +- `lt` corresponds to `<` +- `gt` corresponds to `>` +- `e` corresponds to `=` +- `lte` corresponds to `<=` +- `gte` corresponds to `>=` +And `threshold` parameter is the number value against which B blocks are compared in the math expression. +2. We pass `null` value to `filters` variable. It's needed when we use such Prometheus metrics which don't get any filters when querying. 
+ ## Example for Alert Rules ``` module "grafana_alerts" { @@ -22,7 +33,8 @@ module "grafana_alerts" { deployment = "app-1-microservice" } function = "last" - condition = "$B < 1" + equation = "lt" + threshold = 1 }, { name = "App_2 has 0 available replicas" @@ -33,7 +45,8 @@ module "grafana_alerts" { deployment = "app-2-microservice" } function = "last" - condition = "$B < 1" + equation = "lt" + threshold = 1 }, { name = "Insufficient nodes in cluster" @@ -43,7 +56,8 @@ module "grafana_alerts" { filters = null metric_name = "sum(kube_node_info)" function = "mean" - condition = "$B < 2" + equation = "lte" + threshold = 2 } ] } @@ -89,7 +103,8 @@ module "grafana_alerts" { deployment = "app-1-microservice" } function = "last" - condition = "$B < 1" + equation = "lt" + threshold = 1 }, { name = "App_2 has 0 available replicas" @@ -100,7 +115,8 @@ module "grafana_alerts" { deployment = "app-2-microservice" } function = "last" - condition = "$B < 1" + equation = "lt" + threshold = 1 } ] opsgenie_endpoints = [ diff --git a/modules/alerts/README.md b/modules/alerts/README.md index 118f7e4..fad7e9b 100644 --- a/modules/alerts/README.md +++ b/modules/alerts/README.md @@ -1,5 +1,17 @@ ## Usage To enable some of these alerts for your applications, you just need to replace `App_1`, `App_2` and `App_3` with the actual names of your applications. You can refer to the Prometheus metrics to identify the available filters that can be used for each application. Additionally, modify the values in the conditions to reflect the real cases of your applications. These adjustments will ensure that the alerts accurately monitor your specific applications and their scaling needs. + +## Tips +Alert conditions are formed based on $B blocks and `equation`, `threshold` parameters users pass to the module. 
+`equation` parameter can only get these values: +- `lt` corresponds to `<` +- `gt` corresponds to `>` +- `e` corresponds to `=` +- `lte` corresponds to `<=` +- `gte` corresponds to `>=` + +And `threshold` parameter is the number value against which B blocks are compared in the math expression. + ## Requirements @@ -30,7 +42,7 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [alert\_interval\_seconds](#input\_alert\_interval\_seconds) | The interval, in seconds, at which all rules in the group are evaluated. If a group contains many rules, the rules are evaluated sequentially. | `number` | `10` | no | -| [alert\_rules](#input\_alert\_rules) | This varibale describes alert folders, groups and rules. |
list(object({
name = string # The name of the alert rule
summary = optional(string, "") # Rule annotation as a summary
folder_name = optional(string, "Main Alerts") # Grafana folder name in which the rule will be created
datasource = string # Name of the datasource used for the alert
metric_name = string # Prometheus metric name which queries the data for the alert
filters = optional(any, {}) # Filters object to identify each service for alerting
function = optional(string, "mean") # One of Reduce functions which will be used in B block for alerting
condition = string # Math expression which compares B blocks value with a number and generates an alert if needed
}))
| `[]` | no | +| [alert\_rules](#input\_alert\_rules) | This varibale describes alert folders, groups and rules. |
list(object({
name = string # The name of the alert rule
summary = optional(string, "") # Rule annotation as a summary
folder_name = optional(string, "Main Alerts") # Grafana folder name in which the rule will be created
datasource = string # Name of the datasource used for the alert
metric_name = string # Prometheus metric name which queries the data for the alert
filters = optional(any, {}) # Filters object to identify each service for alerting
function = optional(string, "mean") # One of Reduce functions which will be used in B block for alerting
equation = string # The equation in the math expression which compares B blocks value with a number and generates an alert if needed. Possible values: gt, lt, gte, lte, e.
threshold = number # The value against which B blocks are compared in the math expression
}))
| `[]` | no | ## Outputs diff --git a/modules/alerts/main.tf b/modules/alerts/main.tf index 81929d4..8437a93 100644 --- a/modules/alerts/main.tf +++ b/modules/alerts/main.tf @@ -1,6 +1,13 @@ locals { folders = toset(distinct([for rule in var.alert_rules : rule.folder_name])) alerts = { for member in local.folders : member => [for rule in var.alert_rules : rule if rule.folder_name == member] } + comparison_operators = { + gte : ">=", + gt : ">", + lt : "<", + lte : "<=", + e : "==" /* Grafana math expressions use == for equality */ + } } resource "grafana_folder" "rule_folder" { @@ -132,7 +139,7 @@ EOT "type": "__expr__", "uid": "__expr__" }, - "expression": "${rule.value.condition}", + "expression": "$B ${local.comparison_operators[rule.value.equation]} ${rule.value.threshold}", "hide": false, "intervalMs": 1000, "maxDataPoints": 43200, diff --git a/modules/alerts/tests/autoscaling-max-usage/1-example.tf b/modules/alerts/tests/autoscaling-max-usage/1-example.tf index 7298015..c90d703 100644 --- a/modules/alerts/tests/autoscaling-max-usage/1-example.tf +++ b/modules/alerts/tests/autoscaling-max-usage/1-example.tf @@ -12,7 +12,8 @@ module "this" { deployment = "app-1-microservice" } function = "mean" - condition = "$B >= 20" + equation = "gte" + threshold = 20 }, { name = "App_2 max autoscaling" @@ -24,7 +25,8 @@ module "this" { deployment = "app-2-microservice" } function = "mean" - condition = "$B >= 20" + equation = "gte" + threshold = 20 } ] } diff --git a/modules/alerts/tests/autoscaling-max-usage/README.md b/modules/alerts/tests/autoscaling-max-usage/README.md index 0db26d6..6df7ea3 100644 --- a/modules/alerts/tests/autoscaling-max-usage/README.md +++ b/modules/alerts/tests/autoscaling-max-usage/README.md @@ -3,7 +3,7 @@ This test case demonstrates how to configure Grafana alerts for an application r In this test, we have set up two alert rules for different microservices, `App_1` and `App_2`, within the `Autoscaling Test` folder.
The alerts are triggered based on the Prometheus datasource and the metric `kube_deployment_status_replicas_available`. -For each microservice, we have specified a filter to match the deployment name (`app-1-microservice` and `app-2-microservice`). The `mean` function is applied to aggregate the metric values, and the condition `$B >= 20` is used to check if the replicas available are equal to or greater than 20. +For each microservice, we have specified a filter to match the deployment name (`app-1-microservice` and `app-2-microservice`). The `mean` function is applied to aggregate the metric values, and the `equation` and `threshold` parameters are used to check if the replicas available are equal to or greater than 20. ## Requirements diff --git a/modules/alerts/tests/available-replica-count/1-example.tf b/modules/alerts/tests/available-replica-count/1-example.tf index 5dd31ea..27ab06d 100644 --- a/modules/alerts/tests/available-replica-count/1-example.tf +++ b/modules/alerts/tests/available-replica-count/1-example.tf @@ -11,7 +11,8 @@ module "this" { deployment = "app-1-microservice" } function = "last" - condition = "$B < 1" + equation = "lt" + threshold = 1 }, { name = "App_2 has 0 available replicas" @@ -22,7 +23,8 @@ module "this" { deployment = "app-2-microservice" } function = "last" - condition = "$B < 1" + equation = "lt" + threshold = 1 } ] } diff --git a/modules/alerts/tests/available-replica-count/README.md b/modules/alerts/tests/available-replica-count/README.md index 91ada7f..7af3675 100644 --- a/modules/alerts/tests/available-replica-count/README.md +++ b/modules/alerts/tests/available-replica-count/README.md @@ -5,7 +5,7 @@ In this test, we have set up two alert rules to detect when the available replic For each microservice, we have specified a filter to match the deployment name (`app-1-microservice` and `app-2-microservice`). The `last` function is used to process the metric values, respectively.
-The condition `$B < 1` is used to check if the available replicas fall below 1, indicating that the application doesn't have any replicas. +The `equation` and `threshold` parameters are used to check if the available replicas fall below 1, indicating that the application doesn't have any replicas. ## Requirements diff --git a/modules/alerts/tests/container-restarts/1-example.tf b/modules/alerts/tests/container-restarts/1-example.tf index e589545..22fae6d 100644 --- a/modules/alerts/tests/container-restarts/1-example.tf +++ b/modules/alerts/tests/container-restarts/1-example.tf @@ -12,7 +12,8 @@ module "this" { container = "app-1-container" } function = "mean" - condition = "$B > 2" + equation = "gt" + threshold = 2 }, { name = "App_2 has too many restarts" @@ -23,7 +24,8 @@ module "this" { container = "app-2-container" } function = "mean" - condition = "$B >= 4" + equation = "gte" + threshold = 4 } ] } diff --git a/modules/alerts/tests/container-restarts/README.md b/modules/alerts/tests/container-restarts/README.md index 5b4eee3..ee63cfa 100644 --- a/modules/alerts/tests/container-restarts/README.md +++ b/modules/alerts/tests/container-restarts/README.md @@ -5,7 +5,7 @@ In this test, we have set up two alert rules to monitor the restart count of mic For each microservice, we have specified a filter to match the container name (`app-1-container` and `app-2-container`). The `mean` function is used to aggregate the restart count values. -The conditions `$B > 2` and `$B >= 4` are employed to check if the restart count exceeds the thresholds for each microservice. When the conditions are met, indicating a high restart count, the alerts will be triggered. +The `equation` and `threshold` parameters are employed to check if the restart count exceeds the thresholds for each microservice. When the conditions are met, indicating a high restart count, the alerts will be triggered.
## Requirements diff --git a/modules/alerts/tests/mixed-metrics/1-example.tf b/modules/alerts/tests/mixed-metrics/1-example.tf index 0cc31c8..616df97 100644 --- a/modules/alerts/tests/mixed-metrics/1-example.tf +++ b/modules/alerts/tests/mixed-metrics/1-example.tf @@ -12,7 +12,8 @@ module "this" { container = "app-1-container" } function = "mean" - condition = "$B > 2" + equation = "gt" + threshold = 2 }, { name = "App_2 max autoscaling" @@ -24,7 +25,8 @@ module "this" { deployment = "app-2-microservice" } function = "mean" - condition = "$B >= 20" + equation = "gte" + threshold = 20 }, { name = "App_1 has 0 available replicas" @@ -35,7 +37,8 @@ module "this" { deployment = "app-1-microservice" } function = "mean" - condition = "$B < 1" + equation = "lt" + threshold = 1 }, { name = "App_3 has 0 available replicas" @@ -46,7 +49,8 @@ module "this" { deployment = "app-3-microservice" } function = "mean" - condition = "$B < 1" + equation = "lt" + threshold = 1 }, { name = "Maximum node utilization in cluster" @@ -56,7 +60,8 @@ module "this" { filters = null metric_name = "sum(kube_node_info)" function = "mean" - condition = "$B >= 8" + equation = "gte" + threshold = 8 } ] } diff --git a/modules/alerts/tests/node-autoscaling/1-example.tf b/modules/alerts/tests/node-autoscaling/1-example.tf index 4786ea7..321d25f 100644 --- a/modules/alerts/tests/node-autoscaling/1-example.tf +++ b/modules/alerts/tests/node-autoscaling/1-example.tf @@ -10,7 +10,8 @@ module "this" { filters = null metric_name = "sum(kube_node_info)" function = "mean" - condition = "$B >= 8" + equation = "gte" + threshold = 8 }, { name = "High node utilization in cluster" @@ -20,7 +21,8 @@ module "this" { filters = null metric_name = "sum(kube_node_info)" function = "mean" - condition = "$B >= 6" + equation = "gte" + threshold = 6 }, { name = "Insufficient nodes in cluster" @@ -30,7 +32,8 @@ module "this" { filters = null metric_name = "sum(kube_node_info)" function = "mean" - condition = "$B < 2" +
equation = "lt" + threshold = "2" } ] } diff --git a/modules/alerts/variables.tf b/modules/alerts/variables.tf index e3b6b61..29dc284 100644 --- a/modules/alerts/variables.tf +++ b/modules/alerts/variables.tf @@ -13,7 +13,8 @@ variable "alert_rules" { metric_name = string # Prometheus metric name which queries the data for the alert filters = optional(any, {}) # Filters object to identify each service for alerting function = optional(string, "mean") # One of Reduce functions which will be used in B block for alerting - condition = string # Math expression which compares B blocks value with a number and generates an alert if needed + equation = string # The equation in the math expression which compares B blocks value with a number and generates an alert if needed. Possible values: gt, lt, gte, lte, e. + threshold = number # The value against which B blocks are compared in the math expression })) default = [] description = "This varibale describes alert folders, groups and rules."