[ecs][prometheus] Add ContainerInsight ECS Prometheus #308

Merged: 7 commits, Jun 24, 2021
4 changes: 4 additions & 0 deletions terraform/basic_components/outputs.tf
@@ -41,6 +41,10 @@ output "mocked_server_cert_content" {
value = data.template_file.mocked_server_cert.rendered
}

output "sample_app_image_repo" {
value = data.aws_ecr_repository.sample_apps.repository_url
}

output "sample_app_image" {
value = "${data.aws_ecr_repository.sample_apps.repository_url}:${var.sample_app}-latest"
}
12 changes: 12 additions & 0 deletions terraform/ecs/Makefile
@@ -0,0 +1,12 @@
# If you are debugging the validator but don't want to rebuild the infra, just run
# make build && make validate
# terraform apply generates validator_docker_compose.yml with the right parameters

validate:
# You can modify this to whatever temp credential helper you have if your ~/.aws uses credential_process
# The docker compose file mounts ~/.aws into the container to provide AWS credentials
# awsl isengard get > creds.env
docker-compose -f validator_docker_compose.yml up

build:
docker-compose -f validator_docker_compose.yml build
3 changes: 2 additions & 1 deletion terraform/ecs/efs.tf
@@ -125,7 +125,8 @@ resource "null_resource" "scp_cert" {
}

output "private_key" {
value = tls_private_key.ssh_key.private_key_pem
value = tls_private_key.ssh_key.private_key_pem
sensitive = true
}

output "efs_ip" {
58 changes: 58 additions & 0 deletions terraform/ecs/extra_apps.tf
@@ -0,0 +1,58 @@
locals {
extra_app_image_repo = var.ecs_extra_apps_image_repo != "" ? var.ecs_extra_apps_image_repo : module.basic_components.sample_app_image_repo
}

data "template_file" "extra_apps_defs" {
for_each = var.ecs_extra_apps
template = file("${var.testcase}/${each.value.definition}")
vars = {
region = var.region
image_repo = local.extra_app_image_repo
}
}

resource "aws_ecs_task_definition" "extra_apps" {
for_each = var.ecs_extra_apps
family = "taskdef-${module.common.testing_id}-${each.value.service_name}"
container_definitions = data.template_file.extra_apps_defs[each.key].rendered
requires_compatibilities = each.value.launch_type == "FARGATE" ? [
"FARGATE"] : [
"EC2"]
network_mode = each.value.network_mode
cpu = each.value.cpu
memory = each.value.memory
task_role_arn = module.basic_components.aoc_iam_role_arn
execution_role_arn = module.basic_components.aoc_iam_role_arn
}

resource "aws_ecs_service" "extra_apps" {
for_each = var.ecs_extra_apps
name = "aocservice-${module.common.testing_id}-${each.value.service_name}"
cluster = module.ecs_cluster.cluster_id
task_definition = "${aws_ecs_task_definition.extra_apps[each.key].family}:1"
desired_count = each.value.replicas
launch_type = each.value.launch_type
platform_version = each.value.launch_type == "FARGATE" ? "1.4.0" : null

// NOTE: network configuration is only allowed for awsvpc
// a hack for optional block https://github.com/hashicorp/terraform/issues/19898
dynamic "network_configuration" {
for_each = each.value.network_mode == "awsvpc" ? tolist([
each.value.network_mode]) : []
content {
subnets = module.basic_components.aoc_private_subnet_ids
security_groups = [
module.basic_components.aoc_security_group_id]
}
}
}

output "extra_apps_defs_rendered" {
value = {
for k, v in data.template_file.extra_apps_defs : k => v.rendered
}
}

output "extra_app_task_defs" {
value = aws_ecs_task_definition.extra_apps
}
52 changes: 42 additions & 10 deletions terraform/ecs/main.tf
@@ -43,9 +43,10 @@ module "basic_components" {
}

locals {
ecs_taskdef_path = "../templates/${var.ecs_taskdef_directory}/ecs_taskdef.tpl"
sample_app_image = var.sample_app_image != "" ? var.sample_app_image : module.basic_components.sample_app_image
mocked_server_image = var.mocked_server_image != "" ? var.mocked_server_image : module.basic_components.mocked_server_image
ecs_taskdef_path = fileexists("${var.testcase}/ecs_taskdef.tpl") ? "${var.testcase}/ecs_taskdef.tpl" : "../templates/${var.ecs_taskdef_directory}/ecs_taskdef.tpl"
sample_app_image = var.sample_app_image != "" ? var.sample_app_image : module.basic_components.sample_app_image
mocked_server_image = var.mocked_server_image != "" ? var.mocked_server_image : module.basic_components.mocked_server_image
cloudwatch_context_path = fileexists("${var.testcase}/cloudwatch_context.json") ? "${var.testcase}/cloudwatch_context.json" : "../templates/${var.ecs_taskdef_directory}/cloudwatch_context.json"
}

provider "aws" {
@@ -54,7 +55,7 @@ provider "aws" {

module "ecs_cluster" {
source = "infrablocks/ecs-cluster/aws"
version = "3.0.0"
version = "4.0.0"

cluster_name = module.common.testing_id
component = "aoc"
@@ -69,10 +70,29 @@ module "ecs_cluster" {
// TODO(pingleig): pass patch tag for canary and soaking (if any)
}

# This is a hack for known issue https://github.com/hashicorp/terraform-provider-aws/issues/4852
# We always create the ECS cluster with active EC2 instances, so on destroy we need to scale down
# the asg so the cluster can be destroyed.
resource "null_resource" "scale_down_asg" {
# https://discuss.hashicorp.com/t/how-to-rewrite-null-resource-with-local-exec-provisioner-when-destroy-to-prepare-for-deprecation-after-0-12-8/4580/2
triggers = {
asg_name = module.ecs_cluster.autoscaling_group_name
}

# Only run during destroy, do nothing for apply.
provisioner "local-exec" {
when = destroy
command = <<-EOT
aws autoscaling update-auto-scaling-group --auto-scaling-group-name "${self.triggers.asg_name}" --min-size 0 --desired-capacity 0
EOT
}
}

resource "aws_ssm_parameter" "otconfig" {
name = "otconfig-${module.common.testing_id}"
type = "String"
value = module.basic_components.otconfig_content
tier = "Advanced" // need the Advanced tier for a long list of prometheus relabel configs
}

## create task def
@@ -197,7 +217,7 @@ resource "aws_ecs_service" "aoc" {
}

load_balancer {
target_group_arn = aws_lb_target_group.mocked_server_lb_tg.arn
target_group_arn = aws_lb_target_group.mocked_server_lb_tg[0].arn
container_name = "mocked-server"
container_port = module.common.mocked_server_http_port
}
@@ -206,6 +226,8 @@
subnets = module.basic_components.aoc_private_subnet_ids
security_groups = [module.basic_components.aoc_security_group_id]
}

depends_on = [null_resource.scale_down_asg]
}

# remove the lb since there's no callable sample app; some test cases land here, for example the ecsmetadata receiver test
@@ -223,6 +245,7 @@ resource "aws_ecs_service" "aoc_without_sample_app" {
security_groups = [module.basic_components.aoc_security_group_id]
}

depends_on = [null_resource.scale_down_asg]
}

##########################################
@@ -237,7 +260,7 @@ module "validator" {
testing_id = module.common.testing_id
metric_namespace = "${module.common.otel_service_namespace}/${module.common.otel_service_name}"
sample_app_endpoint = "http://${aws_lb.aoc_lb[0].dns_name}:${module.common.sample_app_lb_port}"
mocked_server_validating_url = "http://${aws_lb.mocked_server_lb.dns_name}:${module.common.mocked_server_lb_port}/check-data"
mocked_server_validating_url = "http://${aws_lb.mocked_server_lb[0].dns_name}:${module.common.mocked_server_lb_port}/check-data"
cortex_instance_endpoint = var.cortex_instance_endpoint

aws_access_key_id = var.aws_access_key_id
@@ -254,20 +277,29 @@ module "validator_without_sample_app" {
region = var.region
testing_id = module.common.testing_id
metric_namespace = "${module.common.otel_service_namespace}/${module.common.otel_service_name}"
mocked_server_validating_url = "http://${aws_lb.mocked_server_lb.dns_name}:${module.common.mocked_server_lb_port}/check-data"
mocked_server_validating_url = var.disable_mocked_server ? "" : "http://${aws_lb.mocked_server_lb[0].dns_name}:${module.common.mocked_server_lb_port}/check-data"

ecs_cluster_name = module.ecs_cluster.cluster_name
ecs_task_arn = aws_ecs_task_definition.aoc.arn
ecs_taskdef_family = aws_ecs_task_definition.aoc.family
ecs_taskdef_version = aws_ecs_task_definition.aoc.revision

cloudwatch_context_json = data.template_file.cloudwatch_context.rendered

aws_access_key_id = var.aws_access_key_id
aws_secret_access_key = var.aws_secret_access_key

depends_on = [aws_ecs_service.aoc_without_sample_app]
depends_on = [aws_ecs_service.aoc_without_sample_app, aws_ecs_service.extra_apps]
}



data "template_file" "cloudwatch_context" {
# The default is a minimal json with just the cluster name; each test case can set its own override under its own folder.
# See containerinsight_ecs_prometheus as an example.
template = file(local.cloudwatch_context_path)
vars = {
testing_id = module.common.testing_id
cluster_name = module.ecs_cluster.cluster_name
}
}


7 changes: 5 additions & 2 deletions terraform/ecs/mocked_server_lb.tf
@@ -14,13 +14,15 @@
# -------------------------------------------------------------------------

resource "aws_lb" "mocked_server_lb" {
count = var.disable_mocked_server ? 0 : 1
# use public subnet to make the lb accessible from public internet
subnets = module.basic_components.aoc_public_subnet_ids
security_groups = [module.basic_components.aoc_security_group_id]
name = "aoc-lb-${module.common.testing_id}"
}

resource "aws_lb_target_group" "mocked_server_lb_tg" {
count = var.disable_mocked_server ? 0 : 1
name = "ms-lbtg-${module.common.testing_id}"
port = module.common.mocked_server_http_port
protocol = "HTTP"
@@ -37,12 +39,13 @@ resource "aws_lb_target_group" "mocked_server_lb_tg" {
}

resource "aws_lb_listener" "mocked_server_lb_listener" {
load_balancer_arn = aws_lb.mocked_server_lb.arn
count = var.disable_mocked_server ? 0 : 1
load_balancer_arn = aws_lb.mocked_server_lb[0].arn
port = module.common.mocked_server_lb_port
protocol = "HTTP"

default_action {
type = "forward"
target_group_arn = aws_lb_target_group.mocked_server_lb_tg.arn
target_group_arn = aws_lb_target_group.mocked_server_lb_tg[0].arn
}
}
25 changes: 25 additions & 0 deletions terraform/ecs/variables.tf
@@ -21,10 +21,35 @@ variable "sample_app_callable" {
default = true
}

# prometheus test cases do not need the mocked server
variable "disable_mocked_server" {
default = false
}

variable "mock_endpoint" {
default = "localhost/put-data"
}

variable "ecs_taskdef_directory" {
default = "defaults"
}

variable "ecs_extra_apps_image_repo" {
# When empty will use sample image repo
default = ""
}

variable "ecs_extra_apps" {
type = map(object({
definition = string
service_name = string
service_type = string
replicas = number
network_mode = string
launch_type = string
cpu = number
memory = number
}))
default = {}
}

9 changes: 9 additions & 0 deletions terraform/setup/setup.tf
@@ -97,6 +97,15 @@ resource "aws_security_group" "aoc_sg" {
name = module.common.aoc_vpc_security_group
vpc_id = module.vpc.vpc_id

# Allow all TCP ingress within the VPC so prometheus scraping can reach targets on private IPs.
# https://stackoverflow.com/questions/49995417/self-reference-not-allowed-in-security-group-definition
ingress {
from_port = 0
to_port = 65535
protocol = "tcp"
self = true
}

ingress {
from_port = 22
to_port = 22
3 changes: 3 additions & 0 deletions terraform/templates/defaults/cloudwatch_context.json
@@ -0,0 +1,3 @@
{
"clusterName": "${cluster_name}"
}
75 changes: 75 additions & 0 deletions terraform/testcases/containerinsight_ecs_prometheus/README.md
@@ -0,0 +1,75 @@
# ContainerInsight ECS Prometheus

## Overview

This is the e2e test
for [extension/ecsobserver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/extension/observer/ecsobserver).

## Usage

Suppose your AWS account is `123456` and you want to run your own image
`123456.dkr.ecr.us-west-2.amazonaws.com/aoc:myfeature-0.2`. Set the following variables:

- `ecs_launch_type` is the launch type for aoc itself; the launch types for the extra apps are defined
in [parameters.tfvars](parameters.tfvars) (a sketch of such an entry is shown after the command below)
- `aoc_image_repo` is the repo for aoc, without a tag
- `aoc_version` is the image tag for aoc
- `ecs_extra_apps_image_repo` is the repo for all the extra apps; the remaining parts of the image name and version are defined
as tags in templates like [jmx.json](jmx.json),
e.g. `123456.dkr.ecr.us-west-2.amazonaws.com/prometheus-samples:tomcat-jmx-latest`

```bash
cd $PROJECT/terraform/ecs
terraform apply \
-var="ecs_launch_type=FARGATE" \
-var="disable_mocked_server=true" \
-var="aoc_version=myfeature-0.2" \
-var="aoc_image_repo=123456.dkr.ecr.us-west-2.amazonaws.com/aoc" \
-var="ecs_extra_apps_image_repo=123456.dkr.ecr.us-west-2.amazonaws.com/prometheus-samples" \
-var="testcase=../testcases/containerinsight_ecs_prometheus" \
-var-file="../testcases/containerinsight_ecs_prometheus/parameters.tfvars"
```
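
For reference, entries in `ecs_extra_apps` follow the object shape declared in `terraform/ecs/variables.tf`. The
values below are only an illustrative sketch, not the actual contents of this test case's
[parameters.tfvars](parameters.tfvars):

```hcl
# Hypothetical entry, for illustration only; see parameters.tfvars for the real values.
ecs_extra_apps = {
  jmx = {
    definition   = "jmx.json" # container definition template under this testcase folder
    service_name = "jmx"
    service_type = "replica"
    replicas     = 1
    network_mode = "awsvpc" # awsvpc is the only mode that gets a network_configuration block
    launch_type  = "FARGATE"
    cpu          = 256
    memory       = 512
  }
}
```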

## Development

### Files

- [ecs_taskdef.tpl](ecs_taskdef.tpl) is an override for the [default](../../templates/defaults/ecs_taskdef.tpl) because we
don't need the mocked server and sample app, i.e., the collector is not deployed as a sidecar.
- the log group is `/ecs/ecs-adot-collector-service`

### Build and push sample app image

There are multiple sample applications; each is pushed to the same `ecs_extra_apps_image_repo` with its own tag
(e.g. `tomcat-jmx-latest`).
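
A minimal sketch of building and pushing one of them, assuming you run it from the directory containing that app's
Dockerfile and reuse the repo/tag naming from the usage example above (account id, region, repo, and tag are
placeholders):

```bash
# Placeholders only: substitute your own account id, region, repo, and tag.
aws ecr get-login-password --region us-west-2 \
  | docker login --username AWS --password-stdin 123456.dkr.ecr.us-west-2.amazonaws.com
docker build -t 123456.dkr.ecr.us-west-2.amazonaws.com/prometheus-samples:tomcat-jmx-latest .
docker push 123456.dkr.ecr.us-west-2.amazonaws.com/prometheus-samples:tomcat-jmx-latest
```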

### Validation

```bash
# Run at project root to make sure the validator code passes style checks and compiles
./gradlew :validator:build
# Run at terraform/ecs to run validation without spinning up new infra
make validate
```

- the validation config file name, `ecs-container-insight-prometheus.yml`, is specified
in [parameters.tfvars](parameters.tfvars)
- the actual validation config is located
in [validator/src/main/resources/validations](../../../validator/src/main/resources/validations/ecs-container-insight-prometheus.yml)
- the paths to the log and metric validation templates are defined
in [PredefinedExpectedTemplate](../../../validator/src/main/java/com/amazon/aoc/fileconfigs/PredefinedExpectedTemplate.java)
while the actual files are
in [expected-data-template/container-insight/ecs/prometheus](../../../validator/src/main/resources/expected-data-template/container-insight/ecs/prometheus)
- `validationType: "container-insight-ecs-prometheus-logs"` in the config
triggers [ValidatorFactory](../../../validator/src/main/java/com/amazon/aoc/validators/ValidatorFactory.java)

## Problems

List of common problems you may encounter.

### Unknown variable

You are using the wrong variable name in a template file, or you didn't define the var when rendering the template.

```
Unknown variable; There is no variable named
```
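
As a sketch of how this usually happens, using the `cloudwatch_context` template from `terraform/ecs/main.tf`: if the
template file references `${cluster_name}` but the `vars` map omits it, rendering fails with the error above.

```hcl
# Hypothetical broken example: the template contains "${cluster_name}",
# but cluster_name is not passed in vars, so terraform reports "Unknown variable".
data "template_file" "cloudwatch_context" {
  template = file(local.cloudwatch_context_path)
  vars = {
    testing_id = module.common.testing_id
    # cluster_name = module.ecs_cluster.cluster_name   # <- forgetting this line triggers the error
  }
}
```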