From 3f31e7a9a7c883792c4bfd090f7db73d6562f6ef Mon Sep 17 00:00:00 2001 From: Pinglei Guo Date: Wed, 16 Jun 2021 15:44:11 -0700 Subject: [PATCH 1/7] ecs: Add ECS ContainerInsight Prometheus tf files - Run extra apps as service, i.e., no longer run collector as sidecar - Add templates for extra sample apps - NOT tested, just porting previous unmerged PRs --- terraform/basic_components/outputs.tf | 4 + terraform/ecs/extra_apps.tf | 46 +++++ terraform/ecs/main.tf | 22 ++- terraform/ecs/mocked_server_lb.tf | 7 +- terraform/ecs/variables.tf | 25 +++ .../defaults/cloudwatch_context.json | 2 + .../containerinsight_ecs_prometheus/README.md | 49 ++++++ .../cloudwatch_context.json | 12 ++ .../ecs_taskdef.tpl | 29 +++ .../containerinsight_ecs_prometheus/jmx.json | 25 +++ .../nginx.json | 66 +++++++ .../otconfig.tpl | 165 ++++++++++++++++++ .../parameters.tfvars | 29 +++ 13 files changed, 474 insertions(+), 7 deletions(-) create mode 100644 terraform/ecs/extra_apps.tf create mode 100644 terraform/templates/defaults/cloudwatch_context.json create mode 100644 terraform/testcases/containerinsight_ecs_prometheus/README.md create mode 100644 terraform/testcases/containerinsight_ecs_prometheus/cloudwatch_context.json create mode 100644 terraform/testcases/containerinsight_ecs_prometheus/ecs_taskdef.tpl create mode 100644 terraform/testcases/containerinsight_ecs_prometheus/jmx.json create mode 100644 terraform/testcases/containerinsight_ecs_prometheus/nginx.json create mode 100644 terraform/testcases/containerinsight_ecs_prometheus/otconfig.tpl create mode 100644 terraform/testcases/containerinsight_ecs_prometheus/parameters.tfvars diff --git a/terraform/basic_components/outputs.tf b/terraform/basic_components/outputs.tf index a605892af..f9d4df7c1 100644 --- a/terraform/basic_components/outputs.tf +++ b/terraform/basic_components/outputs.tf @@ -41,6 +41,10 @@ output "mocked_server_cert_content" { value = data.template_file.mocked_server_cert.rendered } +output "sample_app_image_repo" 
{ + value = data.aws_ecr_repository.sample_apps.repository_url +} + output "sample_app_image" { value = "${data.aws_ecr_repository.sample_apps.repository_url}:${var.sample_app}-latest" } diff --git a/terraform/ecs/extra_apps.tf b/terraform/ecs/extra_apps.tf new file mode 100644 index 000000000..2486a50be --- /dev/null +++ b/terraform/ecs/extra_apps.tf @@ -0,0 +1,46 @@ +data "template_file" "extra_apps_defs" { + for_each = var.ecs_extra_apps + template = file("${var.testcase}/${each.value.definition}") + vars = { + region = var.region + image_repo = local.extra_app_image_repo + } +} + +resource "aws_ecs_task_definition" "extra_apps" { + for_each = var.ecs_extra_apps + family = "taskdef-${module.common.testing_id}-${each.value.service_name}" + container_definitions = data.template_file.extra_apps_defs[each.key].rendered + network_mode = each.value.network_mode + cpu = each.value.cpu + memory = each.value.memory + task_role_arn = module.basic_components.aoc_iam_role_arn + execution_role_arn = module.basic_components.aoc_iam_role_arn +} + +resource "aws_ecs_service" "extra_apps" { + for_each = var.ecs_extra_apps + name = "aocservice-${module.common.testing_id}-${each.value.service_name}" + cluster = module.ecs_cluster.cluster_id + task_definition = "${aws_ecs_task_definition.extra_apps[each.key].family}:1" + desired_count = each.value.replicas + launch_type = each.value.launch_type + platform_version = each.value.launch_type == "FARGATE" ? "1.4.0" : null + + // NOTE: network configuration is only allowed for awsvpc + // a hack for optional block https://github.com/hashicorp/terraform/issues/19898 + dynamic "network_configuration" { + for_each = each.value.network_mode == "awsvpc" ? 
list(each.value.network_mode) : [] + content { + subnets = module.basic_components.aoc_private_subnet_ids + security_groups = [ + module.basic_components.aoc_security_group_id] + } + } +} + +output "extra_apps_defs_rendered" { + value = { + for k, v in data.template_file.extra_apps_defs : k => v.rendered + } +} \ No newline at end of file diff --git a/terraform/ecs/main.tf b/terraform/ecs/main.tf index a1d572c22..491cf1170 100644 --- a/terraform/ecs/main.tf +++ b/terraform/ecs/main.tf @@ -43,9 +43,11 @@ module "basic_components" { } locals { - ecs_taskdef_path = "../templates/${var.ecs_taskdef_directory}/ecs_taskdef.tpl" - sample_app_image = var.sample_app_image != "" ? var.sample_app_image : module.basic_components.sample_app_image - mocked_server_image = var.mocked_server_image != "" ? var.mocked_server_image : module.basic_components.mocked_server_image + ecs_taskdef_path = fileexists("${var.testcase}/ecs_taskdef.tpl") ? "${var.testcase}/ecs_taskdef.tpl" : "../templates/${var.ecs_taskdef_directory}/ecs_taskdef.tpl" + sample_app_image = var.sample_app_image != "" ? var.sample_app_image : module.basic_components.sample_app_image + mocked_server_image = var.mocked_server_image != "" ? var.mocked_server_image : module.basic_components.mocked_server_image + extra_app_image_repo = var.ecs_extra_apps_image_repo != "" ? var.ecs_extra_apps_image_repo : module.basic_components.sample_app_image_repo + cloudwatch_context_path = fileexists("${var.testcase}/cloudwatch_context.json") ? 
"${var.testcase}/cloudwatch_context.json" : "../templates/${var.ecs_taskdef_directory}/cloudwatch_context.json" } provider "aws" { @@ -73,6 +75,7 @@ resource "aws_ssm_parameter" "otconfig" { name = "otconfig-${module.common.testing_id}" type = "String" value = module.basic_components.otconfig_content + tier = "Advanced" // need advanced for a long list of prometheus relabel config } ## create task def @@ -261,13 +264,22 @@ module "validator_without_sample_app" { ecs_taskdef_family = aws_ecs_task_definition.aoc.family ecs_taskdef_version = aws_ecs_task_definition.aoc.revision + cloudwatch_context_json = data.template_file.cloudwatch_context.rendered + aws_access_key_id = var.aws_access_key_id aws_secret_access_key = var.aws_secret_access_key depends_on = [aws_ecs_service.aoc_without_sample_app] } - - +data "template_file" "cloudwatch_context" { + # default is just empty json, each test case can set its own override under its own folder. + # See containerinsight_ecs_prometheus as example. + template = file(local.cloudwatch_context_path) + vars = { + testing_id = module.common.testing_id + cluster_name = module.ecs_cluster.cluster_name + } +} diff --git a/terraform/ecs/mocked_server_lb.tf b/terraform/ecs/mocked_server_lb.tf index e0d9025ac..9e5abd273 100644 --- a/terraform/ecs/mocked_server_lb.tf +++ b/terraform/ecs/mocked_server_lb.tf @@ -14,6 +14,7 @@ # ------------------------------------------------------------------------- resource "aws_lb" "mocked_server_lb" { + count = var.disable_mocked_server ? 0 : 1 # use public subnet to make the lb accessible from public internet subnets = module.basic_components.aoc_public_subnet_ids security_groups = [module.basic_components.aoc_security_group_id] @@ -21,6 +22,7 @@ resource "aws_lb" "mocked_server_lb" { } resource "aws_lb_target_group" "mocked_server_lb_tg" { + count = var.disable_mocked_server ? 
0 : 1 name = "ms-lbtg-${module.common.testing_id}" port = module.common.mocked_server_http_port protocol = "HTTP" @@ -37,12 +39,13 @@ resource "aws_lb_target_group" "mocked_server_lb_tg" { } resource "aws_lb_listener" "mocked_server_lb_listener" { - load_balancer_arn = aws_lb.mocked_server_lb.arn + count = var.disable_mocked_server ? 0 : 1 + load_balancer_arn = aws_lb.mocked_server_lb[0].arn port = module.common.mocked_server_lb_port protocol = "HTTP" default_action { type = "forward" - target_group_arn = aws_lb_target_group.mocked_server_lb_tg.arn + target_group_arn = aws_lb_target_group.mocked_server_lb_tg[0].arn } } \ No newline at end of file diff --git a/terraform/ecs/variables.tf b/terraform/ecs/variables.tf index 15c85a855..1e717f4f3 100644 --- a/terraform/ecs/variables.tf +++ b/terraform/ecs/variables.tf @@ -21,6 +21,11 @@ variable "sample_app_callable" { default = true } +# prometheus does not need mocked server +variable "disable_mocked_server" { + default = false +} + variable "mock_endpoint" { default = "localhost/put-data" } @@ -28,3 +33,23 @@ variable "mock_endpoint" { variable "ecs_taskdef_directory" { default = "defaults" } + +variable "ecs_extra_apps_image_repo" { + # When empty will use sample image repo + default = "" +} + +variable "ecs_extra_apps" { + type = map(object({ + definition = string + service_name = string + service_type = string + replicas = number + network_mode = string + launch_type = string + cpu = number + memory = number + })) + default = {} +} + diff --git a/terraform/templates/defaults/cloudwatch_context.json b/terraform/templates/defaults/cloudwatch_context.json new file mode 100644 index 000000000..7a73a41bf --- /dev/null +++ b/terraform/templates/defaults/cloudwatch_context.json @@ -0,0 +1,2 @@ +{ +} \ No newline at end of file diff --git a/terraform/testcases/containerinsight_ecs_prometheus/README.md b/terraform/testcases/containerinsight_ecs_prometheus/README.md new file mode 100644 index 000000000..de1e904bd --- 
/dev/null +++ b/terraform/testcases/containerinsight_ecs_prometheus/README.md @@ -0,0 +1,49 @@ +# ContainerInsight ECS Prometheus + +## Overview + +This is an e2e test +for [extension/ecsobserver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/extension/observer/ecsobserver) + +## TODO + +- [ ] [cloudwatch_context.json](cloudwatch_context.json) + - [ ] we need to update java code to include `taskDefinitionFamily` and `serviceName` +- [x] app image repo is hardcoded and not used +- [ ] log group name for sample applications' container log +- [ ] why I created a new `ecs_taskdef.tpl` is it used? + +## Usage + +Non-default image `123456.dkr.ecr.us-west-2.amazonaws.com/aoc:myfeature-0.2` + +- `ecs_launch_type` is the launch type for aoc itself; the launch type for each extra app is set by `launch_type` in `ecs_extra_apps` (see [parameters.tfvars](parameters.tfvars)) +- `aoc_image_repo` is the repo for aoc without tag +- `aoc_version` is the image tag for aoc +- `ecs_extra_apps_image_repo` is the repo for all the extra apps, remaining part of image name and version are defined + as tag in templates like [jmx.json](jmx.json) + e.g. `123456.dkr.ecr.us-west-2.amazonaws.com/prometheus-samples:tomcat-jmx-latest` + +```bash +terraform apply \ + -var="ecs_launch_type=FARGATE" \ + -var="disable_efs=true" \ + -var="disable_mocked_server=true" \ + -var="aoc_version=myfeature-0.2" \ + -var="aoc_image_repo=123456.dkr.ecr.us-west-2.amazonaws.com/aoc" \ + -var="ecs_extra_apps_image_repo=123456.dkr.ecr.us-west-2.amazonaws.com/prometheus-samples" \ + -var="testcase=../testcases/containerinsight_ecs_prometheus" \ + -var-file="../testcases/containerinsight_ecs_prometheus/parameters.tfvars" +``` + +## Development + +### Files + +- [ecs_taskdef.tpl](ecs_taskdef.tpl) is an override for [default](../../templates/defaults/ecs_taskdef.tpl) because we + don't need mock server and sample app, i.e., not deploying collector as sidecar. 
+ - log group is `/ecs/ecs-adot-collector-service` + +### Build and push sample app image + +There are multiple sample applications diff --git a/terraform/testcases/containerinsight_ecs_prometheus/cloudwatch_context.json b/terraform/testcases/containerinsight_ecs_prometheus/cloudwatch_context.json new file mode 100644 index 000000000..d5ae0f55c --- /dev/null +++ b/terraform/testcases/containerinsight_ecs_prometheus/cloudwatch_context.json @@ -0,0 +1,12 @@ +{ + "clusterName": "${cluster_name}", + "jmx": { + "job": "ecssd", + "taskDefinitionFamily": "taskdef-${testing_id}-jmx" + }, + "nginx": { + "job": "ecssd", + "taskDefinitionFamily": "taskdef-${testing_id}-nginx-service", + "serviceName": "aocservice-${testing_id}-nginx-service" + } +} \ No newline at end of file diff --git a/terraform/testcases/containerinsight_ecs_prometheus/ecs_taskdef.tpl b/terraform/testcases/containerinsight_ecs_prometheus/ecs_taskdef.tpl new file mode 100644 index 000000000..04ec7b148 --- /dev/null +++ b/terraform/testcases/containerinsight_ecs_prometheus/ecs_taskdef.tpl @@ -0,0 +1,29 @@ +[ + { + "name": "aoc-collector", + "image": "${aoc_image}", + "cpu": 10, + "memory": 256, + "secrets": [ + { + "name": "AOT_CONFIG_CONTENT", + "valueFrom": "${ssm_parameter_arn}" + } + ], + "essential": true, + "entryPoint": [], + "command": [], + "environment": [], + "environmentFiles": [], + "dependsOn": [], + "logConfiguration": { + "logDriver": "awslogs", + "options": { + "awslogs-group": "/ecs/ecs-adot-collector-service", + "awslogs-region": "${region}", + "awslogs-stream-prefix": "ecs", + "awslogs-create-group": "True" + } + } + } +] diff --git a/terraform/testcases/containerinsight_ecs_prometheus/jmx.json b/terraform/testcases/containerinsight_ecs_prometheus/jmx.json new file mode 100644 index 000000000..99154795e --- /dev/null +++ b/terraform/testcases/containerinsight_ecs_prometheus/jmx.json @@ -0,0 +1,25 @@ +[ + { + "name": "tomcat-prometheus-workload-java-ec2-bridge-dynamic-port", + "image": 
"${ecs_extra_apps_image_repo}:tomcat-jmx-0.1", + "portMappings": [ + { + "protocol": "tcp", + "containerPort": 9404 + } + ], + "dockerLabels": { + "ECS_PROMETHEUS_EXPORTER_PORT": "9404", + "Java_EMF_Metrics": "true" + }, + "logConfiguration": { + "logDriver": "awslogs", + "options": { + "awslogs-create-group": "True", + "awslogs-group": "/aoc/ecssd", + "awslogs-region": "${region}", + "awslogs-stream-prefix": "prometheus-tomcat-jmx" + } + } + } +] \ No newline at end of file diff --git a/terraform/testcases/containerinsight_ecs_prometheus/nginx.json b/terraform/testcases/containerinsight_ecs_prometheus/nginx.json new file mode 100644 index 000000000..8fcb40881 --- /dev/null +++ b/terraform/testcases/containerinsight_ecs_prometheus/nginx.json @@ -0,0 +1,66 @@ +[ + { + "name": "nginx", + "image": "${ecs_extra_apps_image_repo}:nginx-cwagent-latest", + "essential": true, + "portMappings": [ + { + "containerPort": 80, + "protocol": "tcp" + } + ], + "links": [ + "app" + ], + "logConfiguration": { + "logDriver": "awslogs", + "options": { + "awslogs-create-group": "True", + "awslogs-group": "/aoc/ecssd", + "awslogs-region": "${region}", + "awslogs-stream-prefix": "nginx-cwagent" + } + } + }, + { + "name": "app", + "image": "${ecs_extra_apps_image_repo}:nginx-app-latest", + "essential": true, + "logConfiguration": { + "logDriver": "awslogs", + "options": { + "awslogs-create-group": "True", + "awslogs-group": "/aoc/ecssdd", + "awslogs-region": "${region}", + "awslogs-stream-prefix": "nginx-app" + } + } + }, + { + "name": "nginx-prometheus-exporter", + "image": "${ecs_extra_apps_image_repo}:nginx-prometheus-exporter-0.8.0", + "essential": true, + "command": [ + "-nginx.scrape-uri", + "http://nginx:8080/stub_status" + ], + "links": [ + "nginx" + ], + "portMappings": [ + { + "containerPort": 9113, + "protocol": "tcp" + } + ], + "logConfiguration": { + "logDriver": "awslogs", + "options": { + "awslogs-create-group": "True", + "awslogs-group": "/aoc/ecssd", + "awslogs-region": 
"${region}", + "awslogs-stream-prefix": "nginx-prometheus-exporter" + } + } + } +] \ No newline at end of file diff --git a/terraform/testcases/containerinsight_ecs_prometheus/otconfig.tpl b/terraform/testcases/containerinsight_ecs_prometheus/otconfig.tpl new file mode 100644 index 000000000..2465af2f7 --- /dev/null +++ b/terraform/testcases/containerinsight_ecs_prometheus/otconfig.tpl @@ -0,0 +1,165 @@ +# NOTE: cluster name for extension and emf exporter is configured using template var testing_id + +extensions: + ecs_observer: + cluster_name: 'aoc-testing-${testing_id}' + cluster_region: 'us-west-2' + result_file: '/etc/ecs_sd_targets.yaml' + refresh_interval: 15s + job_label_name: prometheus_job + # nginx https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/ContainerInsights-Prometheus-Setup-nginx-ecs.html + services: + - name_pattern: '^.*nginx-service$' + metrics_ports: + - 9113 + job_name: nginx-prometheus-exporter + # jmx + docker_labels: + - port_label: 'ECS_PROMETHEUS_EXPORTER_PORT' + # appmesh, port and metrics are from envoy sidecar + task_definitions: + - arn_pattern: '.*:task-definition/.*-ColorTeller-(white):[0-9]+' + metrics_path: '/stats/prometheus' + metrics_ports: + - 9901 + job_name: ecs-appmesh-color + - arn_pattern: '.*:task-definition/.*-ColorGateway:[0-9]+' + metrics_path: '/stats/prometheus' + metrics_ports: + - 9901 + job_name: ecs-appmesh-color + +receivers: + prometheus: + config: + scrape_configs: + - job_name: "ecssd" + file_sd_configs: + - files: + - '/etc/ecs_sd_targets.yaml' + relabel_configs: + - source_labels: [ __meta_ecs_cluster_name ] # ClusterName + action: replace + target_label: ClusterName + - source_labels: [ __meta_ecs_service_name ] # ServiceName + action: replace + target_label: ServiceName + - source_labels: [ __meta_ecs_task_definition_family ] # TaskDefinitionFamily + action: replace + target_label: TaskDefinitionFamily + - source_labels: [ __meta_ecs_container_name ] # container_name + action: replace + 
target_label: container_name + - action: labelmap # docker labels + regex: ^__meta_ecs_container_labels_(.+)$ + replacement: '$$1' + +exporters: + awsemf: + namespace: ECS/ContainerInsights/Prometheus + # TODO: can we inject cluster name down from the pipeline + log_group_name: "/aws/ecs/containerinsights/aoc-testing-${testing_id}/prometheus" + # FIXME: we hard code the log stream name for now + log_stream_name: 'ecssd' + dimension_rollup_option: NoDimensionRollup + metric_declarations: + # nginx + - dimensions: [ [ ClusterName, TaskDefinitionFamily, ServiceName ] ] + label_matchers: + - label_names: + - ServiceName + regex: '^.*nginx-service$' + metric_name_selectors: + - "^nginx_.*$" + - dimensions: [ [ ClusterName, TaskDefinitionFamily, ServiceName ] ] + label_matchers: + - label_names: + - ServiceName + regex: '^.*nginx-plus-service$' + metric_name_selectors: + - "^nginxplus_connections_accepted$" + - "^nginxplus_connections_active$" + - "^nginxplus_connections_dropped$" + - "^nginxplus_connections_idle$" + - "^nginxplus_http_requests_total$" + - "^nginxplus_ssl_handshakes$" + - "^nginxplus_ssl_handshakes_failed$" + - "^nginxplus_up$" + - "^nginxplus_upstream_server_health_checks_fails$" + - dimensions: [ [ ClusterName, TaskDefinitionFamily, ServiceName, upstream ] ] + label_matchers: + - label_names: + - ServiceName + regex: '^.*nginx-plus-service$' + metric_name_selectors: + - "^nginxplus_upstream_server_response_time$" + - dimensions: [ [ ClusterName, TaskDefinitionFamily, ServiceName, code ] ] + label_matchers: + - label_names: + - ServiceName + regex: '^.*nginx-plus-service$' + metric_name_selectors: + - "^nginxplus_upstream_server_responses$" + - "^nginxplus_server_zone_responses$" + # jmx + - dimensions: [ [ ClusterName, TaskDefinitionFamily, area ] ] + label_matchers: + - label_names: + - Java_EMF_Metrics + regex: ^true$ + metric_name_selectors: + - "^jvm_memory_bytes_used$" + - dimensions: [ [ ClusterName, TaskDefinitionFamily, pool ] ] + 
label_matchers: + - label_names: + - Java_EMF_Metrics + regex: ^true$ + metric_name_selectors: + - "^jvm_memory_pool_bytes_used$" + - dimensions: [ [ ClusterName, TaskDefinitionFamily ] ] + label_matchers: + - label_names: + - Java_EMF_Metrics + regex: ^true$ + metric_name_selectors: + - "^jvm_threads_(current|daemon)$" + - "^jvm_classes_loaded$" + - "^java_lang_operatingsystem_(freephysicalmemorysize|totalphysicalmemorysize|freeswapspacesize|totalswapspacesize|systemcpuload|processcpuload|availableprocessors|openfiledescriptorcount)$" + - "^catalina_manager_(rejectedsessions|activesessions)$" + - "^jvm_gc_collection_seconds_(count|sum)$" + - "^catalina_globalrequestprocessor_(bytesreceived|bytessent|requestcount|errorcount|processingtime)$" + # AppMesh envoy + - dimensions: [ [ "ClusterName","TaskDefinitionFamily" ] ] + label_matchers: + - label_names: + - container_name + regex: ^envoy$ + metric_name_selectors: + - "^envoy_http_downstream_rq_(total|xx)$" + - "^envoy_cluster_upstream_cx_(r|t)x_bytes_total$" + - "^envoy_cluster_membership_(healthy|total)$" + - "^envoy_server_memory_(allocated|heap_size)$" + - "^envoy_cluster_upstream_cx_(connect_timeout|destroy_local_with_active_rq)$" + - "^envoy_cluster_upstream_rq_(pending_failure_eject|pending_overflow|timeout|per_try_timeout|rx_reset|maintenance_mode)$" + - "^envoy_http_downstream_cx_destroy_remote_active_rq$" + - "^envoy_cluster_upstream_flow_control_(paused_reading_total|resumed_reading_total|backed_up_total|drained_total)$" + - "^envoy_cluster_upstream_rq_retry$" + - "^envoy_cluster_upstream_rq_retry_(success|overflow)$" + - "^envoy_server_(version|uptime|live)$" + - dimensions: [ [ "ClusterName","TaskDefinitionFamily","envoy_http_conn_manager_prefix","envoy_response_code_class" ] ] + label_matchers: + - label_names: + - container_name + regex: ^envoy$ + metric_name_selectors: + - "^envoy_http_downstream_rq_xx$" + + logging: + loglevel: debug + +service: + extensions: [ ecs_observer ] + pipelines: + metrics: 
+ receivers: [ prometheus ] + exporters: [ awsemf ] \ No newline at end of file diff --git a/terraform/testcases/containerinsight_ecs_prometheus/parameters.tfvars b/terraform/testcases/containerinsight_ecs_prometheus/parameters.tfvars new file mode 100644 index 000000000..8e3eb551f --- /dev/null +++ b/terraform/testcases/containerinsight_ecs_prometheus/parameters.tfvars @@ -0,0 +1,29 @@ +# this file is defined in validator/src/main/resources/validations +validation_config = "ecs-container-insight.yml" +# no need for any lb +sample_app_callable = false +# sample apps that emit ecs metrics +ecs_extra_apps = { + # TODO: need both host network, awsvpc and fargate + jmx = { + definition = "jmx.tpl" + service_name = "jmx" + service_type = "replica" + replicas = 1 + network_mode = "bridge" + launch_type = "EC2" + cpu = 256 + memory = 256 + } + + nginx = { + definition = "nginx.tpl" + service_name = "nginx-service" + service_type = "replica" + replicas = 1 + network_mode = "bridge" + launch_type = "EC2" + cpu = 384 + memory = 384 + } +} \ No newline at end of file From c19d36960edd9da3cab1641243cd5583d185f1ed Mon Sep 17 00:00:00 2001 From: Pinglei Guo Date: Wed, 16 Jun 2021 19:19:24 -0700 Subject: [PATCH 2/7] ecs: Add ECS Prometheus Log and Metrics validator For cloudwatch container insights --- .../containerinsight_ecs_prometheus/README.md | 21 ++++- .../parameters.tfvars | 2 +- .../PredefinedExpectedTemplate.java | 5 +- .../amazon/aoc/models/CloudWatchContext.java | 3 + ...htECSPrometheusStructuredLogValidator.java | 70 ++++++++++++++++ ...rInsightECSPrometheusMetricsValidator.java | 53 ++++++++++++ .../aoc/validators/ValidatorFactory.java | 8 ++ .../container-insight/ecs/prometheus/jmx.json | 82 +++++++++++++++++++ .../ecs/prometheus/jmx_metrics.mustache | 25 ++++++ .../ecs/prometheus/nginx.json | 40 +++++++++ .../ecs/prometheus/nginx_metrics.mustache | 9 ++ .../ecs-container-insight-prometheus.yml | 7 ++ 12 files changed, 322 insertions(+), 3 deletions(-) create mode 
100644 validator/src/main/java/com/amazon/aoc/validators/ContainerIInsightECSPrometheusStructuredLogValidator.java create mode 100644 validator/src/main/java/com/amazon/aoc/validators/ContainerInsightECSPrometheusMetricsValidator.java create mode 100644 validator/src/main/resources/expected-data-template/container-insight/ecs/prometheus/jmx.json create mode 100644 validator/src/main/resources/expected-data-template/container-insight/ecs/prometheus/jmx_metrics.mustache create mode 100644 validator/src/main/resources/expected-data-template/container-insight/ecs/prometheus/nginx.json create mode 100644 validator/src/main/resources/expected-data-template/container-insight/ecs/prometheus/nginx_metrics.mustache create mode 100644 validator/src/main/resources/validations/ecs-container-insight-prometheus.yml diff --git a/terraform/testcases/containerinsight_ecs_prometheus/README.md b/terraform/testcases/containerinsight_ecs_prometheus/README.md index de1e904bd..ff9bac644 100644 --- a/terraform/testcases/containerinsight_ecs_prometheus/README.md +++ b/terraform/testcases/containerinsight_ecs_prometheus/README.md @@ -9,9 +9,10 @@ for [extension/ecsobserver](https://github.com/open-telemetry/opentelemetry-coll - [ ] [cloudwatch_context.json](cloudwatch_context.json) - [ ] we need to update java code to include `taskDefinitionFamily` and `serviceName` + - [ ] cluster name is used by other tests, need to pass it when rendering the template, could do this for the + default template, it's empty - [x] app image repo is hardcoded and not used - [ ] log group name for sample applications' container log -- [ ] why I created a new `ecs_taskdef.tpl` is it used? 
## Usage @@ -47,3 +48,21 @@ terraform apply \ ### Build and push sample app image There are multiple sample applications + +### Validation + +```bash +# Run at project root to make sure the validator code pass style check and compiles +./gradlew :validator:build +``` + +- validation config file name is specified + in [parameters.tfvars](parameters.tfvars) `ecs-container-insight-prometheus.yml` +- the actual validation config is located + in [validator/src/main/resources/validations](../../../validator/src/main/resources/validations/ecs-container-insight-prometheus.yml) +- path to log and metrics validation templates is + in [PredefinedExpectedTemplate](../../../validator/src/main/java/com/amazon/aoc/fileconfigs/PredefinedExpectedTemplate.java) + while the actual files are + in [expected-data-template/container-insight/ecs/prometheus](../../../validator/src/main/resources/expected-data-template/container-insight/ecs/prometheus) +- `validationType: "container-insight-ecs-prometheus-logs"` in config + triggers [ValidatorFactory](../../../validator/src/main/java/com/amazon/aoc/validators/ValidatorFactory.java) \ No newline at end of file diff --git a/terraform/testcases/containerinsight_ecs_prometheus/parameters.tfvars b/terraform/testcases/containerinsight_ecs_prometheus/parameters.tfvars index 8e3eb551f..9f012ba48 100644 --- a/terraform/testcases/containerinsight_ecs_prometheus/parameters.tfvars +++ b/terraform/testcases/containerinsight_ecs_prometheus/parameters.tfvars @@ -1,5 +1,5 @@ # this file is defined in validator/src/main/resources/validations -validation_config = "ecs-container-insight.yml" +validation_config = "ecs-container-insight-prometheus.yml" # no need for any lb sample_app_callable = false # sample apps that emit ecs metrics diff --git a/validator/src/main/java/com/amazon/aoc/fileconfigs/PredefinedExpectedTemplate.java b/validator/src/main/java/com/amazon/aoc/fileconfigs/PredefinedExpectedTemplate.java index 61eaa4f48..aea964569 100644 --- 
a/validator/src/main/java/com/amazon/aoc/fileconfigs/PredefinedExpectedTemplate.java +++ b/validator/src/main/java/com/amazon/aoc/fileconfigs/PredefinedExpectedTemplate.java @@ -31,6 +31,8 @@ public enum PredefinedExpectedTemplate implements FileConfig { ECS_CONTAINER_EXPECTED_METRIC("/expected-data-template/ecsContainerExpectedMetric.mustache"), CONTAINER_INSIGHT_EKS_PROMETHEUS_METRIC( "/expected-data-template/container-insight/eks/prometheus"), + CONTAINER_INSIGHT_ECS_PROMETHEUS_METRIC( + "/expected-data-template/container-insight/ecs/prometheus"), /** * trace template, defined in resources. @@ -69,7 +71,8 @@ public enum PredefinedExpectedTemplate implements FileConfig { "/expected-data-template/container-insight/eks/prometheus"), CONTAINER_INSIGHT_EKS_LOG( "/expected-data-template/container-insight/eks/infrastructure"), - + CONTAINER_INSIGHT_ECS_PROMETHEUS_LOG( + "/expected-data-template/container-insight/ecs/prometheus"), ; private String path; diff --git a/validator/src/main/java/com/amazon/aoc/models/CloudWatchContext.java b/validator/src/main/java/com/amazon/aoc/models/CloudWatchContext.java index f14c1e109..1c909c6f1 100644 --- a/validator/src/main/java/com/amazon/aoc/models/CloudWatchContext.java +++ b/validator/src/main/java/com/amazon/aoc/models/CloudWatchContext.java @@ -57,5 +57,8 @@ public static class App { private String name; private String namespace; private String job; + // For ECS + private String taskDefinitionFamily; + private String serviceName; } } \ No newline at end of file diff --git a/validator/src/main/java/com/amazon/aoc/validators/ContainerIInsightECSPrometheusStructuredLogValidator.java b/validator/src/main/java/com/amazon/aoc/validators/ContainerIInsightECSPrometheusStructuredLogValidator.java new file mode 100644 index 000000000..1fefd7bcd --- /dev/null +++ b/validator/src/main/java/com/amazon/aoc/validators/ContainerIInsightECSPrometheusStructuredLogValidator.java @@ -0,0 +1,70 @@ +package com.amazon.aoc.validators; + + +import 
com.amazon.aoc.fileconfigs.FileConfig; +import com.amazon.aoc.fileconfigs.LocalPathExpectedTemplate; +import com.amazon.aoc.helpers.MustacheHelper; +import com.amazon.aoc.models.CloudWatchContext; +import com.amazon.aoc.models.Context; +import com.fasterxml.jackson.databind.JsonNode; +import org.apache.commons.io.FilenameUtils; + +import java.util.ArrayList; +import java.util.List; + +/** + * Validates ECS Prometheus structured logs. + * + * @see ContainerInsightECSPrometheusMetricsValidator for ECS Prometheus Metrics + * @see ContainerInsightPrometheusStructuredLogValidator for EKS + */ +public class ContainerIInsightECSPrometheusStructuredLogValidator + extends AbstractStructuredLogValidator { + + private List validateApps; + + @Override + void init(Context context, FileConfig expectedDataTemplate) throws Exception { + // /aws/ecs/containerinsights/aoc-prometheus-dashboard-1/prometheus + logGroupName = String.format("/aws/ecs/containerinsights/%s/%s", + context.getCloudWatchContext().getClusterName(), "prometheus"); + + // TODO: it's same as eks prometheus + validateApps = getAppsToValidate(context.getCloudWatchContext()); + MustacheHelper mustacheHelper = new MustacheHelper(); + + for (CloudWatchContext.App app : validateApps) { + FileConfig fileConfig = new LocalPathExpectedTemplate(FilenameUtils.concat( + expectedDataTemplate.getPath().toString(), + app.getName() + ".json")); + String templateInput = mustacheHelper.render(fileConfig, context); + schemasToValidate.put(app.getNamespace(), parseJsonSchema(templateInput)); + logStreamNames.add(app.getJob()); + } + } + + @Override + String getJsonSchemaMappingKey(JsonNode logEventNode) { + // We use TaskDefinitionFamily to check because ServiceName is optional in EMF log. 
+ String taskFamily = logEventNode.get("TaskDefinitionFamily").asText(); + // When registering schema in schemasToValidate we + if (taskFamily.contains("jmx")) { + return "jmx"; + } + if (taskFamily.contains("nginx")) { + return "nginx"; + } + return null; + } + + private static List getAppsToValidate(CloudWatchContext cwContext) { + List apps = new ArrayList<>(); + if (cwContext.getNginx() != null) { + apps.add(cwContext.getNginx()); + } + if (cwContext.getJmx() != null) { + apps.add(cwContext.getJmx()); + } + return apps; + } +} diff --git a/validator/src/main/java/com/amazon/aoc/validators/ContainerInsightECSPrometheusMetricsValidator.java b/validator/src/main/java/com/amazon/aoc/validators/ContainerInsightECSPrometheusMetricsValidator.java new file mode 100644 index 000000000..f5cb05f98 --- /dev/null +++ b/validator/src/main/java/com/amazon/aoc/validators/ContainerInsightECSPrometheusMetricsValidator.java @@ -0,0 +1,53 @@ +package com.amazon.aoc.validators; + +import com.amazon.aoc.fileconfigs.FileConfig; +import com.amazon.aoc.fileconfigs.LocalPathExpectedTemplate; +import com.amazon.aoc.helpers.MustacheHelper; +import com.amazon.aoc.models.CloudWatchContext; +import com.amazon.aoc.models.Context; +import com.amazonaws.services.cloudwatch.model.Metric; +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; +import org.apache.commons.io.FilenameUtils; + +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; + +public class ContainerInsightECSPrometheusMetricsValidator extends AbstractCWMetricsValidator { + private final ObjectMapper mapper = new ObjectMapper(new YAMLFactory()); + + // TODO: it's same as eks prometheus ... 
+ @Override + List getExpectedMetrics( + Context context, + FileConfig expectedDataTemplate + ) throws Exception { + List expectedMetrics = new ArrayList<>(); + List validateApps = getAppsToValidate(context.getCloudWatchContext()); + MustacheHelper mustacheHelper = new MustacheHelper(); + for (CloudWatchContext.App app : validateApps) { + FileConfig fileConfig = new LocalPathExpectedTemplate(FilenameUtils.concat( + expectedDataTemplate.getPath().toString(), + app.getName() + "_metrics.mustache")); + String templateInput = mustacheHelper.render(fileConfig, context); + List appMetrics = mapper.readValue(templateInput.getBytes(StandardCharsets.UTF_8), + new TypeReference>() { + }); + expectedMetrics.addAll(appMetrics); + } + return expectedMetrics; + } + + private static List getAppsToValidate(CloudWatchContext cwContext) { + List apps = new ArrayList<>(); + if (cwContext.getNginx() != null) { + apps.add(cwContext.getNginx()); + } + if (cwContext.getJmx() != null) { + apps.add(cwContext.getJmx()); + } + return apps; + } +} diff --git a/validator/src/main/java/com/amazon/aoc/validators/ValidatorFactory.java b/validator/src/main/java/com/amazon/aoc/validators/ValidatorFactory.java index aa3e3b2cc..934d921a5 100644 --- a/validator/src/main/java/com/amazon/aoc/validators/ValidatorFactory.java +++ b/validator/src/main/java/com/amazon/aoc/validators/ValidatorFactory.java @@ -74,6 +74,14 @@ public IValidator launchValidator(ValidationConfig validationConfig) throws Exce validator = new ContainerInsightStructuredLogValidator(); expectedData = validationConfig.getExpectedLogStructureTemplate(); break; + case "container-insight-ecs-prometheus-logs": + validator = new ContainerIInsightECSPrometheusStructuredLogValidator(); + expectedData = validationConfig.getExpectedLogStructureTemplate(); + break; + case "container-insight-ecs-prometheus-metrics": + validator = new ContainerInsightECSPrometheusMetricsValidator(); + expectedData = validationConfig.getExpectedMetricTemplate(); + 
break; default: throw new BaseException(ExceptionCode.VALIDATION_TYPE_NOT_EXISTED); } diff --git a/validator/src/main/resources/expected-data-template/container-insight/ecs/prometheus/jmx.json b/validator/src/main/resources/expected-data-template/container-insight/ecs/prometheus/jmx.json new file mode 100644 index 000000000..f765faf88 --- /dev/null +++ b/validator/src/main/resources/expected-data-template/container-insight/ecs/prometheus/jmx.json @@ -0,0 +1,82 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "title": "structured log schema", + "description": "json schema for the cloudwatch agent ECS structured log", + "type": "object", + "properties": { + "ClusterName": {}, + "Timestamp": {}, + "Version": {}, + "instance": {}, + "job": {}, + "prom_metric_type": {}, + "java_lang_operatingsystem_availableprocessors": {}, + "java_lang_operatingsystem_freephysicalmemorysize": {}, + "java_lang_operatingsystem_freeswapspacesize": {}, + "java_lang_operatingsystem_openfiledescriptorcount": {}, + "java_lang_operatingsystem_processcpuload": {}, + "java_lang_operatingsystem_systemcpuload": {}, + "java_lang_operatingsystem_totalphysicalmemorysize": {}, + "java_lang_operatingsystem_totalswapspacesize": {}, + "java_lang_threading_currentthreadcputime": {}, + "java_lang_threading_currentthreadusertime": {}, + "java_lang_threading_daemonthreadcount": {}, + "java_lang_threading_peakthreadcount": {}, + "java_lang_threading_threadcount": {}, + "java_lang_threading_totalstartedthreadcount": {}, + "jmx_scrape_duration_seconds": {}, + "jmx_scrape_error": {}, + "jvm_classes_loaded": {}, + "jvm_threads_current": {}, + "jvm_threads_daemon": {}, + "jvm_threads_deadlocked": {}, + "jvm_threads_deadlocked_monitor": {}, + "jvm_threads_peak": {}, + "process_max_fds": {}, + "process_open_fds": {}, + "process_resident_memory_bytes": {}, + "process_start_time_seconds": {}, + "process_virtual_memory_bytes": {}, + "container_name": {}, + "TaskDefinitionFamily": {}, + "TaskRevision": 
{}, + "TaskGroup": {}, + "LaunchType": {}, + "InstanceType": {}, + "VpcId": {}, + "SubnetId": {} + }, + "required": [ + "ClusterName", + "java_lang_operatingsystem_availableprocessors", + "java_lang_operatingsystem_freephysicalmemorysize", + "java_lang_operatingsystem_freeswapspacesize", + "java_lang_operatingsystem_openfiledescriptorcount", + "java_lang_operatingsystem_processcpuload", + "java_lang_operatingsystem_systemcpuload", + "java_lang_operatingsystem_totalphysicalmemorysize", + "java_lang_operatingsystem_totalswapspacesize", + "java_lang_threading_currentthreadcputime", + "java_lang_threading_currentthreadusertime", + "java_lang_threading_daemonthreadcount", + "java_lang_threading_peakthreadcount", + "java_lang_threading_threadcount", + "java_lang_threading_totalstartedthreadcount", + "jmx_scrape_duration_seconds", + "jmx_scrape_error", + "jvm_classes_loaded", + "jvm_threads_current", + "jvm_threads_daemon", + "jvm_threads_deadlocked", + "jvm_threads_deadlocked_monitor", + "jvm_threads_peak", + "process_max_fds", + "process_open_fds", + "process_resident_memory_bytes", + "process_start_time_seconds", + "process_virtual_memory_bytes", + "container_name", + "TaskDefinitionFamily" + ], + "additionalProperties": true +} \ No newline at end of file diff --git a/validator/src/main/resources/expected-data-template/container-insight/ecs/prometheus/jmx_metrics.mustache b/validator/src/main/resources/expected-data-template/container-insight/ecs/prometheus/jmx_metrics.mustache new file mode 100644 index 000000000..56fef616a --- /dev/null +++ b/validator/src/main/resources/expected-data-template/container-insight/ecs/prometheus/jmx_metrics.mustache @@ -0,0 +1,25 @@ +- metricName: jvm_classes_loaded + namespace: ECS/ContainerInsights/Prometheus + dimensions: + - name: ClusterName + value: {{cloudWatchContext.clusterName}} + - name: TaskDefinitionFamily + value: {{cloudWatchContext.jmx.taskDefinitionFamily}} +- metricName: jvm_memory_pool_bytes_used + namespace: 
ECS/ContainerInsights/Prometheus + dimensions: + - name: ClusterName + value: {{cloudWatchContext.clusterName}} + - name: TaskDefinitionFamily + value: {{cloudWatchContext.jmx.taskDefinitionFamily}} + - name: pool + value: Metaspace +- metricName: jvm_memory_bytes_used + namespace: ECS/ContainerInsights/Prometheus + dimensions: + - name: ClusterName + value: {{cloudWatchContext.clusterName}} + - name: TaskDefinitionFamily + value: {{cloudWatchContext.jmx.taskDefinitionFamily}} + - name: area + value: heap \ No newline at end of file diff --git a/validator/src/main/resources/expected-data-template/container-insight/ecs/prometheus/nginx.json b/validator/src/main/resources/expected-data-template/container-insight/ecs/prometheus/nginx.json new file mode 100644 index 000000000..5dc884bad --- /dev/null +++ b/validator/src/main/resources/expected-data-template/container-insight/ecs/prometheus/nginx.json @@ -0,0 +1,40 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "title": "structured log schema", + "description": "json schema for the cloudwatch agent ECS structured log", + "type": "object", + "properties": { + "ClusterName": {}, + "Timestamp": {}, + "Version": {}, + "instance": {}, + "job": {}, + "prom_metric_type": {}, + "container_name": {}, + "TaskDefinitionFamily": {}, + "TaskRevision": {}, + "TaskGroup": {}, + "LaunchType": {}, + "ServiceName": {}, + "InstanceType": {}, + "VpcId": {}, + "SubnetId": {}, + "nginx_connections_accepted": {}, + "nginx_connections_handled": {}, + "nginx_http_requests_total": {}, + "nginx_connections_active": {}, + "nginx_connections_reading": {}, + "nginx_connections_waiting": {}, + "nginx_connections_writing": {}, + "nginx_up": {}, + "nginxexporter_build_info": {} + }, + "required": [ + "ClusterName", + "container_name", + "TaskDefinitionFamily", + "ServiceName", + "nginx_up" + ], + "additionalProperties": true +} \ No newline at end of file diff --git 
a/validator/src/main/resources/expected-data-template/container-insight/ecs/prometheus/nginx_metrics.mustache b/validator/src/main/resources/expected-data-template/container-insight/ecs/prometheus/nginx_metrics.mustache new file mode 100644 index 000000000..fb0ed91a8 --- /dev/null +++ b/validator/src/main/resources/expected-data-template/container-insight/ecs/prometheus/nginx_metrics.mustache @@ -0,0 +1,9 @@ +- metricName: nginx_up + namespace: ECS/ContainerInsights/Prometheus + dimensions: + - name: ClusterName + value: {{cloudWatchContext.clusterName}} + - name: TaskDefinitionFamily + value: {{cloudWatchContext.nginx.taskDefinitionFamily}} + - name: ServiceName + value: {{cloudWatchContext.nginx.serviceName}} \ No newline at end of file diff --git a/validator/src/main/resources/validations/ecs-container-insight-prometheus.yml b/validator/src/main/resources/validations/ecs-container-insight-prometheus.yml new file mode 100644 index 000000000..902b8fa78 --- /dev/null +++ b/validator/src/main/resources/validations/ecs-container-insight-prometheus.yml @@ -0,0 +1,7 @@ +- validationType: "container-insight-ecs-prometheus-logs" + shouldValidateMetricValue: false + # NOTE: the value here matches enum ExpectedLogStructure under fileconfigs package + expectedLogStructureTemplate: "CONTAINER_INSIGHT_ECS_PROMETHEUS_LOG" +- validationType: "container-insight-ecs-prometheus-metrics" + shouldValidateMetricValue: false + expectedMetricTemplate: "CONTAINER_INSIGHT_ECS_PROMETHEUS_METRIC" From 27abdd7b3df90c1decd7b4b2f5000d57de145903 Mon Sep 17 00:00:00 2001 From: Pinglei Guo Date: Wed, 16 Jun 2021 20:08:04 -0700 Subject: [PATCH 3/7] ecs: Upgrade infrablocks/ecs-cluster to 4.0.0 --- terraform/ecs/efs.tf | 3 ++- terraform/ecs/extra_apps.tf | 4 ++++ terraform/ecs/main.tf | 9 ++++----- terraform/templates/defaults/cloudwatch_context.json | 1 + .../containerinsight_ecs_prometheus/README.md | 10 ++++++++-- .../testcases/containerinsight_ecs_prometheus/jmx.json | 2 +-
.../containerinsight_ecs_prometheus/nginx.json | 6 +++--- .../containerinsight_ecs_prometheus/parameters.tfvars | 4 ++-- 8 files changed, 25 insertions(+), 14 deletions(-) diff --git a/terraform/ecs/efs.tf b/terraform/ecs/efs.tf index b78b5a34d..e95eaf94e 100644 --- a/terraform/ecs/efs.tf +++ b/terraform/ecs/efs.tf @@ -125,7 +125,8 @@ resource "null_resource" "scp_cert" { } output "private_key" { - value = tls_private_key.ssh_key.private_key_pem + value = tls_private_key.ssh_key.private_key_pem + sensitive = true } output "efs_ip" { diff --git a/terraform/ecs/extra_apps.tf b/terraform/ecs/extra_apps.tf index 2486a50be..63e3cde1a 100644 --- a/terraform/ecs/extra_apps.tf +++ b/terraform/ecs/extra_apps.tf @@ -1,3 +1,7 @@ +locals { + extra_app_image_repo = var.ecs_extra_apps_image_repo != "" ? var.ecs_extra_apps_image_repo : module.basic_components.sample_app_image_repo +} + data "template_file" "extra_apps_defs" { for_each = var.ecs_extra_apps template = file("${var.testcase}/${each.value.definition}") diff --git a/terraform/ecs/main.tf b/terraform/ecs/main.tf index 491cf1170..8fc63357f 100644 --- a/terraform/ecs/main.tf +++ b/terraform/ecs/main.tf @@ -46,7 +46,6 @@ locals { ecs_taskdef_path = fileexists("${var.testcase}/ecs_taskdef.tpl") ? "${var.testcase}/ecs_taskdef.tpl" : "../templates/${var.ecs_taskdef_directory}/ecs_taskdef.tpl" sample_app_image = var.sample_app_image != "" ? var.sample_app_image : module.basic_components.sample_app_image mocked_server_image = var.mocked_server_image != "" ? var.mocked_server_image : module.basic_components.mocked_server_image - extra_app_image_repo = var.ecs_extra_apps_image_repo != "" ? var.ecs_extra_apps_image_repo : module.basic_components.sample_app_image_repo cloudwatch_context_path = fileexists("${var.testcase}/cloudwatch_context.json") ? 
"${var.testcase}/cloudwatch_context.json" : "../templates/${var.ecs_taskdef_directory}/cloudwatch_context.json" } @@ -56,7 +55,7 @@ provider "aws" { module "ecs_cluster" { source = "infrablocks/ecs-cluster/aws" - version = "3.0.0" + version = "4.0.0" cluster_name = module.common.testing_id component = "aoc" @@ -200,7 +199,7 @@ resource "aws_ecs_service" "aoc" { } load_balancer { - target_group_arn = aws_lb_target_group.mocked_server_lb_tg.arn + target_group_arn = aws_lb_target_group.mocked_server_lb_tg[0].arn container_name = "mocked-server" container_port = module.common.mocked_server_http_port } @@ -240,7 +239,7 @@ module "validator" { testing_id = module.common.testing_id metric_namespace = "${module.common.otel_service_namespace}/${module.common.otel_service_name}" sample_app_endpoint = "http://${aws_lb.aoc_lb[0].dns_name}:${module.common.sample_app_lb_port}" - mocked_server_validating_url = "http://${aws_lb.mocked_server_lb.dns_name}:${module.common.mocked_server_lb_port}/check-data" + mocked_server_validating_url = "http://${aws_lb.mocked_server_lb[0].dns_name}:${module.common.mocked_server_lb_port}/check-data" cortex_instance_endpoint = var.cortex_instance_endpoint aws_access_key_id = var.aws_access_key_id @@ -257,7 +256,7 @@ module "validator_without_sample_app" { region = var.region testing_id = module.common.testing_id metric_namespace = "${module.common.otel_service_namespace}/${module.common.otel_service_name}" - mocked_server_validating_url = "http://${aws_lb.mocked_server_lb.dns_name}:${module.common.mocked_server_lb_port}/check-data" + mocked_server_validating_url = var.disable_mocked_server ? 
"" : "http://${aws_lb.mocked_server_lb[0].dns_name}:${module.common.mocked_server_lb_port}/check-data" ecs_cluster_name = module.ecs_cluster.cluster_name ecs_task_arn = aws_ecs_task_definition.aoc.arn diff --git a/terraform/templates/defaults/cloudwatch_context.json b/terraform/templates/defaults/cloudwatch_context.json index 7a73a41bf..d90546345 100644 --- a/terraform/templates/defaults/cloudwatch_context.json +++ b/terraform/templates/defaults/cloudwatch_context.json @@ -1,2 +1,3 @@ { + "clusterName": "${cluster_name}" } \ No newline at end of file diff --git a/terraform/testcases/containerinsight_ecs_prometheus/README.md b/terraform/testcases/containerinsight_ecs_prometheus/README.md index ff9bac644..8895a57c4 100644 --- a/terraform/testcases/containerinsight_ecs_prometheus/README.md +++ b/terraform/testcases/containerinsight_ecs_prometheus/README.md @@ -26,9 +26,9 @@ Non default image `123456.dkr.ecr.us-west-2.amazonaws.com/aoc:myfeature-0.2` e.g. `123456.dkr.ecr.us-west-2.amazonaws.com/prometheus-samples:tomcat-jmx-latest` ```bash +cd $PROJECT/terraform/ecs terraform apply \ -var="ecs_launch_type=FARGATE" \ - -var="disable_efs=true" \ -var="disable_mocked_server=true" \ -var="aoc_version=myfeature-0.2" \ -var="aoc_image_repo=123456.dkr.ecr.us-west-2.amazonaws.com/aoc" \ @@ -65,4 +65,10 @@ There are multiple sample applications while the actual files are in [expected-data-template/container-insight/ecs/prometheus](../../../validator/src/main/resources/expected-data-template/container-insight/ecs/prometheus) - `validationType: "container-insight-ecs-prometheus-logs"` in config - triggers [ValidatorFactory](../../../validator/src/main/java/com/amazon/aoc/validators/ValidatorFactory.java) \ No newline at end of file + triggers [ValidatorFactory](../../../validator/src/main/java/com/amazon/aoc/validators/ValidatorFactory.java) + +## Problems + +`Unknown variable; There is no variable named` + +- used wrong variable name in template file \ No newline at end of file 
diff --git a/terraform/testcases/containerinsight_ecs_prometheus/jmx.json b/terraform/testcases/containerinsight_ecs_prometheus/jmx.json index 99154795e..fae38052f 100644 --- a/terraform/testcases/containerinsight_ecs_prometheus/jmx.json +++ b/terraform/testcases/containerinsight_ecs_prometheus/jmx.json @@ -1,7 +1,7 @@ [ { "name": "tomcat-prometheus-workload-java-ec2-bridge-dynamic-port", - "image": "${ecs_extra_apps_image_repo}:tomcat-jmx-0.1", + "image": "${image_repo}:tomcat-jmx-0.1", "portMappings": [ { "protocol": "tcp", diff --git a/terraform/testcases/containerinsight_ecs_prometheus/nginx.json b/terraform/testcases/containerinsight_ecs_prometheus/nginx.json index 8fcb40881..0f033da91 100644 --- a/terraform/testcases/containerinsight_ecs_prometheus/nginx.json +++ b/terraform/testcases/containerinsight_ecs_prometheus/nginx.json @@ -1,7 +1,7 @@ [ { "name": "nginx", - "image": "${ecs_extra_apps_image_repo}:nginx-cwagent-latest", + "image": "${image_repo}:nginx-cwagent-latest", "essential": true, "portMappings": [ { @@ -24,7 +24,7 @@ }, { "name": "app", - "image": "${ecs_extra_apps_image_repo}:nginx-app-latest", + "image": "${image_repo}:nginx-app-latest", "essential": true, "logConfiguration": { "logDriver": "awslogs", @@ -38,7 +38,7 @@ }, { "name": "nginx-prometheus-exporter", - "image": "${ecs_extra_apps_image_repo}:nginx-prometheus-exporter-0.8.0", + "image": "${image_repo}:nginx-prometheus-exporter-0.8.0", "essential": true, "command": [ "-nginx.scrape-uri", diff --git a/terraform/testcases/containerinsight_ecs_prometheus/parameters.tfvars b/terraform/testcases/containerinsight_ecs_prometheus/parameters.tfvars index 9f012ba48..85c67dc35 100644 --- a/terraform/testcases/containerinsight_ecs_prometheus/parameters.tfvars +++ b/terraform/testcases/containerinsight_ecs_prometheus/parameters.tfvars @@ -6,7 +6,7 @@ sample_app_callable = false ecs_extra_apps = { # TODO: need both host network, awspvc and fargate jmx = { - definition = "jmx.tpl" + definition = 
"jmx.json" service_name = "jmx" service_type = "replica" replicas = 1 @@ -17,7 +17,7 @@ ecs_extra_apps = { } nginx = { - definition = "nginx.tpl" + definition = "nginx.json" service_name = "nginx-service" service_type = "replica" replicas = 1 From c788a706fdebb95f4b7dfcc9ff8d3794e7bdbb3d Mon Sep 17 00:00:00 2001 From: Pinglei Guo Date: Wed, 16 Jun 2021 21:10:06 -0700 Subject: [PATCH 4/7] ecs: Fix json schema mapping key in log validator Should use taskDefinitionFamily directly, copied namespace from eks ... --- terraform/ecs/Makefile | 12 ++++++++++++ .../nginx.json | 2 +- ...htECSPrometheusStructuredLogValidator.java | 19 ++++++++----------- 3 files changed, 21 insertions(+), 12 deletions(-) create mode 100644 terraform/ecs/Makefile diff --git a/terraform/ecs/Makefile b/terraform/ecs/Makefile new file mode 100644 index 000000000..beadc71c8 --- /dev/null +++ b/terraform/ecs/Makefile @@ -0,0 +1,12 @@ +# If you are debugging validator but don't want to rebuild the infra, just run +# make build && make validate +# terraform apply generates validator_docker_compose.yml with right parameters + +validate: +# You can modify this to whatever temp credential helper you have if your ~/.aws is using credential_process +# The docker compose file mount ~/.aws into the container for getting AWS credential +# awsl isengard get > creds.env + docker-compose -f validator_docker_compose.yml up + +build: + docker-compose -f validator_docker_compose.yml build diff --git a/terraform/testcases/containerinsight_ecs_prometheus/nginx.json b/terraform/testcases/containerinsight_ecs_prometheus/nginx.json index 0f033da91..44d85fb0a 100644 --- a/terraform/testcases/containerinsight_ecs_prometheus/nginx.json +++ b/terraform/testcases/containerinsight_ecs_prometheus/nginx.json @@ -30,7 +30,7 @@ "logDriver": "awslogs", "options": { "awslogs-create-group": "True", - "awslogs-group": "/aoc/ecssdd", + "awslogs-group": "/aoc/ecssd", "awslogs-region": "${region}", "awslogs-stream-prefix": "nginx-app" } 
diff --git a/validator/src/main/java/com/amazon/aoc/validators/ContainerIInsightECSPrometheusStructuredLogValidator.java b/validator/src/main/java/com/amazon/aoc/validators/ContainerIInsightECSPrometheusStructuredLogValidator.java index 1fefd7bcd..024cfae41 100644 --- a/validator/src/main/java/com/amazon/aoc/validators/ContainerIInsightECSPrometheusStructuredLogValidator.java +++ b/validator/src/main/java/com/amazon/aoc/validators/ContainerIInsightECSPrometheusStructuredLogValidator.java @@ -7,6 +7,7 @@ import com.amazon.aoc.models.CloudWatchContext; import com.amazon.aoc.models.Context; import com.fasterxml.jackson.databind.JsonNode; +import lombok.extern.log4j.Log4j2; import org.apache.commons.io.FilenameUtils; import java.util.ArrayList; @@ -18,6 +19,7 @@ * @see ContainerInsightPrometheusMetricsValidator for ECS Proemtheus Metrics * @see ContainerInsightPrometheusStructuredLogValidator for EKS */ +@Log4j2 public class ContainerIInsightECSPrometheusStructuredLogValidator extends AbstractStructuredLogValidator { @@ -28,8 +30,9 @@ void init(Context context, FileConfig expectedDataTemplate) throws Exception { // /aws/ecs/containerinsights/aoc-prometheus-dashboard-1/prometheus logGroupName = String.format("/aws/ecs/containerinsights/%s/%s", context.getCloudWatchContext().getClusterName(), "prometheus"); + log.info("log group name is {}", logGroupName); - // TODO: it's same as eks prometheus + // It's almost same as EKS prometheus but we use different key to find schema. 
validateApps = getAppsToValidate(context.getCloudWatchContext()); MustacheHelper mustacheHelper = new MustacheHelper(); @@ -38,23 +41,17 @@ void init(Context context, FileConfig expectedDataTemplate) throws Exception { expectedDataTemplate.getPath().toString(), app.getName() + ".json")); String templateInput = mustacheHelper.render(fileConfig, context); - schemasToValidate.put(app.getNamespace(), parseJsonSchema(templateInput)); + // NOTE: EKS use namespace, we use task family for matching log event to schema. + schemasToValidate.put(app.getTaskDefinitionFamily(), parseJsonSchema(templateInput)); logStreamNames.add(app.getJob()); } + log.info("apps to validate {}", validateApps.size()); } @Override String getJsonSchemaMappingKey(JsonNode logEventNode) { // We use TaskDefinitionFamily to check because ServiceName is optional in EMF log. - String taskFamily = logEventNode.get("TaskDefinitionFamily").asText(); - // When registering schema in schemasToValidate we - if (taskFamily.contains("jmx")) { - return "jmx"; - } - if (taskFamily.contains("nginx")) { - return "nginx"; - } - return null; + return logEventNode.get("TaskDefinitionFamily").asText(); } private static List getAppsToValidate(CloudWatchContext cwContext) { From eae09c2e6aa8d4fa7b258665498463b9cb645a59 Mon Sep 17 00:00:00 2001 From: Pinglei Guo Date: Thu, 17 Jun 2021 21:02:16 -0700 Subject: [PATCH 5/7] ecs: Scale down asg before deleting cluster in tf Fix https://github.com/aws-observability/aws-otel-test-framework/issues/307 --- terraform/ecs/main.tf | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/terraform/ecs/main.tf b/terraform/ecs/main.tf index 8fc63357f..10ba6b304 100644 --- a/terraform/ecs/main.tf +++ b/terraform/ecs/main.tf @@ -70,6 +70,24 @@ module "ecs_cluster" { // TODO(pingleig): pass patch tag for canary and soaking (if any) } +# This is a hack for known issue https://github.com/hashicorp/terraform-provider-aws/issues/4852 +# We always create ECS cluster with active 
EC2 instances, so when destroy we need to scale down +# the asg so the cluster can be destroyed. +resource "null_resource" "scale_down_asg" { + # https://discuss.hashicorp.com/t/how-to-rewrite-null-resource-with-local-exec-provisioner-when-destroy-to-prepare-for-deprecation-after-0-12-8/4580/2 + triggers = { + asg_name = module.ecs_cluster.autoscaling_group_name + } + + # Only run during destroy, do nothing for apply. + provisioner "local-exec" { + when = destroy + command = <<-EOT + aws autoscaling update-auto-scaling-group --auto-scaling-group-name "${self.triggers.asg_name}" --min-size 0 --desired-capacity 0 +EOT + } +} + resource "aws_ssm_parameter" "otconfig" { name = "otconfig-${module.common.testing_id}" type = "String" @@ -208,6 +226,8 @@ resource "aws_ecs_service" "aoc" { subnets = module.basic_components.aoc_private_subnet_ids security_groups = [module.basic_components.aoc_security_group_id] } + + depends_on = [null_resource.scale_down_asg] } # remove lb since there's no callable sample app, some test cases will drop in here, for example, ecsmetadata receiver test @@ -225,6 +245,7 @@ resource "aws_ecs_service" "aoc_without_sample_app" { security_groups = [module.basic_components.aoc_security_group_id] } + depends_on = [null_resource.scale_down_asg] } ########################################## From 2086ad3ccaf402444215ea450bf9e469caace6c2 Mon Sep 17 00:00:00 2001 From: Pinglei Guo Date: Thu, 17 Jun 2021 21:19:02 -0700 Subject: [PATCH 6/7] ecs: Share Prometheus CWMetrics validator between ECS and EKS --- ...htECSPrometheusStructuredLogValidator.java | 16 +++++- ...rInsightECSPrometheusMetricsValidator.java | 53 ------------------- .../aoc/validators/ValidatorFactory.java | 5 +- 3 files changed, 16 insertions(+), 58 deletions(-) delete mode 100644 validator/src/main/java/com/amazon/aoc/validators/ContainerInsightECSPrometheusMetricsValidator.java diff --git
a/validator/src/main/java/com/amazon/aoc/validators/ContainerIInsightECSPrometheusStructuredLogValidator.java b/validator/src/main/java/com/amazon/aoc/validators/ContainerIInsightECSPrometheusStructuredLogValidator.java index 024cfae41..1e9dc027f 100644 --- a/validator/src/main/java/com/amazon/aoc/validators/ContainerIInsightECSPrometheusStructuredLogValidator.java +++ b/validator/src/main/java/com/amazon/aoc/validators/ContainerIInsightECSPrometheusStructuredLogValidator.java @@ -1,5 +1,19 @@ -package com.amazon.aoc.validators; +/* + * Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). + * You may not use this file except in compliance with the License. + * A copy of the License is located at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * or in the "license" file accompanying this file. This file is distributed + * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. 
+ */ +package com.amazon.aoc.validators; import com.amazon.aoc.fileconfigs.FileConfig; import com.amazon.aoc.fileconfigs.LocalPathExpectedTemplate; diff --git a/validator/src/main/java/com/amazon/aoc/validators/ContainerInsightECSPrometheusMetricsValidator.java b/validator/src/main/java/com/amazon/aoc/validators/ContainerInsightECSPrometheusMetricsValidator.java deleted file mode 100644 index f5cb05f98..000000000 --- a/validator/src/main/java/com/amazon/aoc/validators/ContainerInsightECSPrometheusMetricsValidator.java +++ /dev/null @@ -1,53 +0,0 @@ -package com.amazon.aoc.validators; - -import com.amazon.aoc.fileconfigs.FileConfig; -import com.amazon.aoc.fileconfigs.LocalPathExpectedTemplate; -import com.amazon.aoc.helpers.MustacheHelper; -import com.amazon.aoc.models.CloudWatchContext; -import com.amazon.aoc.models.Context; -import com.amazonaws.services.cloudwatch.model.Metric; -import com.fasterxml.jackson.core.type.TypeReference; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; -import org.apache.commons.io.FilenameUtils; - -import java.nio.charset.StandardCharsets; -import java.util.ArrayList; -import java.util.List; - -public class ContainerInsightECSPrometheusMetricsValidator extends AbstractCWMetricsValidator { - private final ObjectMapper mapper = new ObjectMapper(new YAMLFactory()); - - // TODO: it's same as eks prometheus ... 
- @Override - List getExpectedMetrics( - Context context, - FileConfig expectedDataTemplate - ) throws Exception { - List expectedMetrics = new ArrayList<>(); - List validateApps = getAppsToValidate(context.getCloudWatchContext()); - MustacheHelper mustacheHelper = new MustacheHelper(); - for (CloudWatchContext.App app : validateApps) { - FileConfig fileConfig = new LocalPathExpectedTemplate(FilenameUtils.concat( - expectedDataTemplate.getPath().toString(), - app.getName() + "_metrics.mustache")); - String templateInput = mustacheHelper.render(fileConfig, context); - List appMetrics = mapper.readValue(templateInput.getBytes(StandardCharsets.UTF_8), - new TypeReference>() { - }); - expectedMetrics.addAll(appMetrics); - } - return expectedMetrics; - } - - private static List getAppsToValidate(CloudWatchContext cwContext) { - List apps = new ArrayList<>(); - if (cwContext.getNginx() != null) { - apps.add(cwContext.getNginx()); - } - if (cwContext.getJmx() != null) { - apps.add(cwContext.getJmx()); - } - return apps; - } -} diff --git a/validator/src/main/java/com/amazon/aoc/validators/ValidatorFactory.java b/validator/src/main/java/com/amazon/aoc/validators/ValidatorFactory.java index 934d921a5..532515127 100644 --- a/validator/src/main/java/com/amazon/aoc/validators/ValidatorFactory.java +++ b/validator/src/main/java/com/amazon/aoc/validators/ValidatorFactory.java @@ -63,6 +63,7 @@ public IValidator launchValidator(ValidationConfig validationConfig) throws Exce validator = new PerformanceValidator(); break; case "container-insight-eks-prometheus-metrics": + case "container-insight-ecs-prometheus-metrics": validator = new ContainerInsightPrometheusMetricsValidator(); expectedData = validationConfig.getExpectedMetricTemplate(); break; @@ -78,10 +79,6 @@ public IValidator launchValidator(ValidationConfig validationConfig) throws Exce validator = new ContainerIInsightECSPrometheusStructuredLogValidator(); expectedData = validationConfig.getExpectedLogStructureTemplate(); 
break; - case "container-insight-ecs-prometheus-metrics": - validator = new ContainerInsightECSPrometheusMetricsValidator(); - expectedData = validationConfig.getExpectedMetricTemplate(); - break; default: throw new BaseException(ExceptionCode.VALIDATION_TYPE_NOT_EXISTED); } From 2c95cb76e080125f955bdf6734936a6a3979e0b3 Mon Sep 17 00:00:00 2001 From: Pinglei Guo Date: Thu, 17 Jun 2021 22:18:54 -0700 Subject: [PATCH 7/7] ecs: Add test for fargate and EC2 awsvpc --- terraform/ecs/extra_apps.tf | 20 ++++++--- terraform/ecs/main.tf | 2 +- terraform/setup/setup.tf | 9 ++++ .../containerinsight_ecs_prometheus/README.md | 29 +++++++------ .../cloudwatch_context.json | 10 ++++- .../parameters.tfvars | 27 +++++++++++- .../amazon/aoc/models/CloudWatchContext.java | 2 +- ...htECSPrometheusStructuredLogValidator.java | 9 +++- ...inerInsightPrometheusMetricsValidator.java | 3 ++ .../ecs/prometheus/jmx_metrics.mustache | 12 ++++-- .../ecs/prometheus/nginx_metrics.mustache | 6 ++- ...InsightPrometheusMetricsValidatorTest.java | 43 ++++++++++++++----- 12 files changed, 129 insertions(+), 43 deletions(-) diff --git a/terraform/ecs/extra_apps.tf b/terraform/ecs/extra_apps.tf index 63e3cde1a..1d560027b 100644 --- a/terraform/ecs/extra_apps.tf +++ b/terraform/ecs/extra_apps.tf @@ -15,11 +15,14 @@ resource "aws_ecs_task_definition" "extra_apps" { for_each = var.ecs_extra_apps family = "taskdef-${module.common.testing_id}-${each.value.service_name}" container_definitions = data.template_file.extra_apps_defs[each.key].rendered - network_mode = each.value.network_mode - cpu = each.value.cpu - memory = each.value.memory - task_role_arn = module.basic_components.aoc_iam_role_arn - execution_role_arn = module.basic_components.aoc_iam_role_arn + requires_compatibilities = each.value.launch_type == "FARGATE" ? 
[ + "FARGATE"] : [ + "EC2"] + network_mode = each.value.network_mode + cpu = each.value.cpu + memory = each.value.memory + task_role_arn = module.basic_components.aoc_iam_role_arn + execution_role_arn = module.basic_components.aoc_iam_role_arn } resource "aws_ecs_service" "extra_apps" { @@ -34,7 +37,8 @@ resource "aws_ecs_service" "extra_apps" { // NOTE: network configuration is only allowed for awsvpc // a hack for optional block https://github.com/hashicorp/terraform/issues/19898 dynamic "network_configuration" { - for_each = each.value.network_mode == "awsvpc" ? list(each.value.network_mode) : [] + for_each = each.value.network_mode == "awsvpc" ? tolist([ + each.value.network_mode]) : [] content { subnets = module.basic_components.aoc_private_subnet_ids security_groups = [ @@ -47,4 +51,8 @@ output "extra_apps_defs_rendered" { value = { for k, v in data.template_file.extra_apps_defs : k => v.rendered } +} + +output "extra_app_task_defs" { + value = aws_ecs_task_definition.extra_apps } \ No newline at end of file diff --git a/terraform/ecs/main.tf b/terraform/ecs/main.tf index 10ba6b304..255e6ff38 100644 --- a/terraform/ecs/main.tf +++ b/terraform/ecs/main.tf @@ -289,7 +289,7 @@ module "validator_without_sample_app" { aws_access_key_id = var.aws_access_key_id aws_secret_access_key = var.aws_secret_access_key - depends_on = [aws_ecs_service.aoc_without_sample_app] + depends_on = [aws_ecs_service.aoc_without_sample_app, aws_ecs_service.extra_apps] } data "template_file" "cloudwatch_context" { diff --git a/terraform/setup/setup.tf b/terraform/setup/setup.tf index 3253901d8..d0f094ef0 100644 --- a/terraform/setup/setup.tf +++ b/terraform/setup/setup.tf @@ -97,6 +97,15 @@ resource "aws_security_group" "aoc_sg" { name = module.common.aoc_vpc_security_group vpc_id = module.vpc.vpc_id + # Allow all TCP ingress within the VPC so prometheus scrape can work with private IP. 
+ # https://stackoverflow.com/questions/49995417/self-reference-not-allowed-in-security-group-definition + ingress { + from_port = 0 + to_port = 65535 + protocol = "tcp" + self = true + } + ingress { from_port = 22 to_port = 22 diff --git a/terraform/testcases/containerinsight_ecs_prometheus/README.md b/terraform/testcases/containerinsight_ecs_prometheus/README.md index 8895a57c4..794f76ff0 100644 --- a/terraform/testcases/containerinsight_ecs_prometheus/README.md +++ b/terraform/testcases/containerinsight_ecs_prometheus/README.md @@ -5,20 +5,13 @@ This is e2e test for [extension/ecsobserver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/extension/observer/ecsobserver) -## TODO - -- [ ] [cloudwatch_context.json](cloudwatch_context.json) - - [ ] we need to update java code to include `taskDefinitionFamily` and `serviceName` - - [ ] cluster name is used by other tests, need to pass it when rendering the template, could do this for the - default template, it's empty -- [x] app image repo is hardcoded and not used -- [ ] log group name for sample applications' container log - ## Usage -Non default image `123456.dkr.ecr.us-west-2.amazonaws.com/aoc:myfeature-0.2` +If your AWS account is `123456` and you want to run your own image +`123456.dkr.ecr.us-west-2.amazonaws.com/aoc:myfeature-0.2`. -- `ecs_launch_type` is the launch type for aoc itself, launch type for extra apps are defined in TODO(where?) 
+- `ecs_launch_type` is the launch type for aoc itself; launch types for extra apps are defined + in [parameters.tfvars](parameters.tfvars) - `aoc_image_repo` is repo for aoc without tag - `aoc_version` is the image tag for aoc - `ecs_extra_apps_image_repo` is the repo for all the extra apps, remaining part of image name and version are defined @@ -53,7 +46,9 @@ There are multiple sample applications ```bash # Run at project root to make sure the validator code pass style check and compiles -./gradlew :validator:build +./gradlew :validator:build +# Run at terraform/ecs to run validation without spinning up new infra +make validate ``` - validation config file name is specified @@ -69,6 +64,12 @@ There are multiple sample applications ## Problems -`Unknown variable; There is no variable named` +List of common problems you may encounter. + +### Unknown variable -- used wrong variable name in template file \ No newline at end of file +You are using the wrong variable name in a template file, or you didn't define the variable when rendering the template. 
+ +``` +Unknown variable; There is no variable named +``` \ No newline at end of file diff --git a/terraform/testcases/containerinsight_ecs_prometheus/cloudwatch_context.json b/terraform/testcases/containerinsight_ecs_prometheus/cloudwatch_context.json index d5ae0f55c..9502b926f 100644 --- a/terraform/testcases/containerinsight_ecs_prometheus/cloudwatch_context.json +++ b/terraform/testcases/containerinsight_ecs_prometheus/cloudwatch_context.json @@ -2,11 +2,17 @@ "clusterName": "${cluster_name}", "jmx": { "job": "ecssd", - "taskDefinitionFamily": "taskdef-${testing_id}-jmx" + "taskDefinitionFamilies": [ + "taskdef-${testing_id}-jmx", + "taskdef-${testing_id}-jmxawsvpc", + "taskdef-${testing_id}-jmxfargate" + ] }, "nginx": { "job": "ecssd", - "taskDefinitionFamily": "taskdef-${testing_id}-nginx-service", + "taskDefinitionFamilies": [ + "taskdef-${testing_id}-nginx-service" + ], "serviceName": "aocservice-${testing_id}-nginx-service" } } \ No newline at end of file diff --git a/terraform/testcases/containerinsight_ecs_prometheus/parameters.tfvars b/terraform/testcases/containerinsight_ecs_prometheus/parameters.tfvars index 85c67dc35..bac3906cd 100644 --- a/terraform/testcases/containerinsight_ecs_prometheus/parameters.tfvars +++ b/terraform/testcases/containerinsight_ecs_prometheus/parameters.tfvars @@ -4,7 +4,6 @@ validation_config = "ecs-container-insight-prometheus.yml" sample_app_callable = false # sample apps that emit ecs metrics ecs_extra_apps = { - # TODO: need both host network, awspvc and fargate jmx = { definition = "jmx.json" service_name = "jmx" @@ -16,6 +15,32 @@ ecs_extra_apps = { memory = 256 } + # NOTE: for awsvpc to work from prometheus, we need to change the security group + # to allow all traffic within the VPC + jmxawsvpc = { + definition = "jmx.json" + service_name = "jmxawsvpc" + service_type = "replica" + replicas = 1 + network_mode = "awsvpc" + launch_type = "EC2" + cpu = 256 + memory = 256 + } + + jmxfargate = { + definition = "jmx.json" + 
service_name = "jmxfargate" + service_type = "replica" + replicas = 1 + network_mode = "awsvpc" + launch_type = "FARGATE" + cpu = 256 + # Must set cpu and memory for fargate in specific ways + # https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-ecs-taskdefinition.html#:~:text=ContainerDefinition-,If%20your%20tasks%20will,cpu%20parameter,-512 + memory = 512 + } + nginx = { definition = "nginx.json" service_name = "nginx-service" diff --git a/validator/src/main/java/com/amazon/aoc/models/CloudWatchContext.java b/validator/src/main/java/com/amazon/aoc/models/CloudWatchContext.java index 1c909c6f1..6bc036932 100644 --- a/validator/src/main/java/com/amazon/aoc/models/CloudWatchContext.java +++ b/validator/src/main/java/com/amazon/aoc/models/CloudWatchContext.java @@ -58,7 +58,7 @@ public static class App { private String namespace; private String job; // For ECS - private String taskDefinitionFamily; + private String[] taskDefinitionFamilies; private String serviceName; } } \ No newline at end of file diff --git a/validator/src/main/java/com/amazon/aoc/validators/ContainerIInsightECSPrometheusStructuredLogValidator.java b/validator/src/main/java/com/amazon/aoc/validators/ContainerIInsightECSPrometheusStructuredLogValidator.java index 1e9dc027f..802b36b35 100644 --- a/validator/src/main/java/com/amazon/aoc/validators/ContainerIInsightECSPrometheusStructuredLogValidator.java +++ b/validator/src/main/java/com/amazon/aoc/validators/ContainerIInsightECSPrometheusStructuredLogValidator.java @@ -56,10 +56,15 @@ void init(Context context, FileConfig expectedDataTemplate) throws Exception { app.getName() + ".json")); String templateInput = mustacheHelper.render(fileConfig, context); // NOTE: EKS use namespace, we use task family for matching log event to schema. 
- schemasToValidate.put(app.getTaskDefinitionFamily(), parseJsonSchema(templateInput)); + for (String taskDefinitionFamily : app.getTaskDefinitionFamilies()) { + // We can deploy one workload in different ways (EC2, fargate etc.) + // so we have a list of task definition families. + schemasToValidate.put(taskDefinitionFamily, parseJsonSchema(templateInput)); + } logStreamNames.add(app.getJob()); } - log.info("apps to validate {}", validateApps.size()); + log.info("apps to validate {} schema to validate {}", validateApps.size(), + schemasToValidate.keySet()); } @Override diff --git a/validator/src/main/java/com/amazon/aoc/validators/ContainerInsightPrometheusMetricsValidator.java b/validator/src/main/java/com/amazon/aoc/validators/ContainerInsightPrometheusMetricsValidator.java index c6ed7025e..9f0a917dc 100644 --- a/validator/src/main/java/com/amazon/aoc/validators/ContainerInsightPrometheusMetricsValidator.java +++ b/validator/src/main/java/com/amazon/aoc/validators/ContainerInsightPrometheusMetricsValidator.java @@ -9,12 +9,14 @@ import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; +import lombok.extern.log4j.Log4j2; import org.apache.commons.io.FilenameUtils; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; +@Log4j2 public class ContainerInsightPrometheusMetricsValidator extends AbstractCWMetricsValidator { private final ObjectMapper mapper = new ObjectMapper(new YAMLFactory()); @@ -31,6 +33,7 @@ List getExpectedMetrics( expectedDataTemplate.getPath().toString(), app.getName() + "_metrics.mustache")); String templateInput = mustacheHelper.render(fileConfig, context); + // log.info("Rendered template {}", templateInput); List appMetrics = mapper.readValue(templateInput.getBytes(StandardCharsets.UTF_8), new TypeReference>() { }); diff --git 
a/validator/src/main/resources/expected-data-template/container-insight/ecs/prometheus/jmx_metrics.mustache b/validator/src/main/resources/expected-data-template/container-insight/ecs/prometheus/jmx_metrics.mustache index 56fef616a..680ab7824 100644 --- a/validator/src/main/resources/expected-data-template/container-insight/ecs/prometheus/jmx_metrics.mustache +++ b/validator/src/main/resources/expected-data-template/container-insight/ecs/prometheus/jmx_metrics.mustache @@ -1,17 +1,20 @@ +# https://stackoverflow.com/questions/4067093/mustache-read-variables-from-parent-section-in-child-section +# https://stackoverflow.com/questions/3954913/iterating-over-arrays-with-mustache +{{#cloudWatchContext.jmx.taskDefinitionFamilies}} - metricName: jvm_classes_loaded namespace: ECS/ContainerInsights/Prometheus dimensions: - name: ClusterName value: {{cloudWatchContext.clusterName}} - name: TaskDefinitionFamily - value: {{cloudWatchContext.jmx.taskDefinitionFamily}} + value: {{.}} - metricName: jvm_memory_pool_bytes_used namespace: ECS/ContainerInsights/Prometheus dimensions: - name: ClusterName value: {{cloudWatchContext.clusterName}} - name: TaskDefinitionFamily - value: {{cloudWatchContext.jmx.taskDefinitionFamily}} + value: {{.}} - name: pool value: Metaspace - metricName: jvm_memory_bytes_used @@ -20,6 +23,7 @@ - name: ClusterName value: {{cloudWatchContext.clusterName}} - name: TaskDefinitionFamily - value: {{cloudWatchContext.jmx.taskDefinitionFamily}} + value: {{.}} - name: area - value: heap \ No newline at end of file + value: heap +{{/cloudWatchContext.jmx.taskDefinitionFamilies}} \ No newline at end of file diff --git a/validator/src/main/resources/expected-data-template/container-insight/ecs/prometheus/nginx_metrics.mustache b/validator/src/main/resources/expected-data-template/container-insight/ecs/prometheus/nginx_metrics.mustache index fb0ed91a8..7621a53c9 100644 --- 
a/validator/src/main/resources/expected-data-template/container-insight/ecs/prometheus/nginx_metrics.mustache +++ b/validator/src/main/resources/expected-data-template/container-insight/ecs/prometheus/nginx_metrics.mustache @@ -1,9 +1,11 @@ +{{#cloudWatchContext.nginx.taskDefinitionFamilies}} - metricName: nginx_up namespace: ECS/ContainerInsights/Prometheus dimensions: - name: ClusterName value: {{cloudWatchContext.clusterName}} - name: TaskDefinitionFamily - value: {{cloudWatchContext.nginx.taskDefinitionFamily}} + value: {{.}} - name: ServiceName - value: {{cloudWatchContext.nginx.serviceName}} \ No newline at end of file + value: {{cloudWatchContext.nginx.serviceName}} +{{/cloudWatchContext.nginx.taskDefinitionFamilies}} \ No newline at end of file diff --git a/validator/src/test/java/com/amazon/aoc/validators/ContainerInsightPrometheusMetricsValidatorTest.java b/validator/src/test/java/com/amazon/aoc/validators/ContainerInsightPrometheusMetricsValidatorTest.java index afc844291..7c3064738 100644 --- a/validator/src/test/java/com/amazon/aoc/validators/ContainerInsightPrometheusMetricsValidatorTest.java +++ b/validator/src/test/java/com/amazon/aoc/validators/ContainerInsightPrometheusMetricsValidatorTest.java @@ -7,7 +7,6 @@ import com.amazon.aoc.fileconfigs.PredefinedExpectedTemplate; import com.amazon.aoc.models.CloudWatchContext; import com.amazon.aoc.models.Context; -import com.amazon.aoc.models.ValidationConfig; import com.amazon.aoc.services.CloudWatchService; import com.amazonaws.services.cloudwatch.model.MetricDataResult; import org.junit.Test; @@ -23,12 +22,6 @@ public class ContainerInsightPrometheusMetricsValidatorTest { @Test public void testValidationSucceed() throws Exception { - // fake a validation config - ValidationConfig validationConfig = new ValidationConfig(); - validationConfig.setCallingType("http"); - validationConfig.setExpectedMetricTemplate( - PredefinedExpectedTemplate.CONTAINER_INSIGHT_EKS_PROMETHEUS_METRIC.name()); - // mock 
cloudwatch service CloudWatchService cloudWatchService = mock(CloudWatchService.class); List metricDataResults = new ArrayList<>(); @@ -39,9 +32,7 @@ public void testValidationSucceed() throws Exception { ContainerInsightPrometheusMetricsValidator validator = new ContainerInsightPrometheusMetricsValidator(); validator.init( - getContext(), - validationConfig, - null, + getContext(), null, null, PredefinedExpectedTemplate.CONTAINER_INSIGHT_EKS_PROMETHEUS_METRIC ); validator.setCloudWatchService(cloudWatchService); @@ -50,6 +41,25 @@ public void testValidationSucceed() throws Exception { validator.validate(); } + @Test + public void tesECS() throws Exception { + CloudWatchService cloudWatchService = mock(CloudWatchService.class); + List metricDataResults = new ArrayList<>(); + metricDataResults.add(new MetricDataResult().withStatusCode("200").withValues(1.0)); + when(cloudWatchService.getMetricData(any(), any(), any())).thenReturn(metricDataResults); + + ContainerInsightPrometheusMetricsValidator validator = + new ContainerInsightPrometheusMetricsValidator(); + validator.init( + getECSContext(), null, null, + PredefinedExpectedTemplate.CONTAINER_INSIGHT_ECS_PROMETHEUS_METRIC + ); + validator.setCloudWatchService(cloudWatchService); + validator.setMaxRetryCount(1); + validator.setInitialSleepTime(0); + validator.validate(); + } + private Context getContext() { String namespace = "fakednamespace"; String testingId = "fakedTesingId"; @@ -74,4 +84,17 @@ private Context getContext() { context.setCloudWatchContext(cloudWatchContext); return context; } + + private Context getECSContext() { + CloudWatchContext.App jmx = new CloudWatchContext.App(); + jmx.setJob("jmx"); + jmx.setTaskDefinitionFamilies(new String[]{"jmxawsvpc", "jmxfargate"}); + CloudWatchContext cloudWatchContext = new CloudWatchContext(); + cloudWatchContext.setJmx(jmx); + cloudWatchContext.setClusterName(this.clusterName); + + Context context = getContext(); + 
context.setCloudWatchContext(cloudWatchContext); + return context; + } }