Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ecs: Add internal doc and cfn template for ECS Prometheus #560

Merged
merged 2 commits into from
Jul 13, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,323 @@
Parameters:
ClusterName:
Type: String
Description: Enter the name of your ECS cluster from which you want to collect prometheus metrics
# IAM
CreateIAMRoles:
Type: String
Default: 'False'
AllowedValues:
- 'True'
- 'False'
Description: Create new default IAM roles or use existing ones.
ConstraintDescription: must specify True or False.
TaskRoleArn:
Type: String
Default: Default
Description: Enter the role arn you want to use as the ecs task role
ExecutionRoleArn:
Type: String
Default: Default
Description: Enter the role arn you want to use as the ecs execution role
# Collector
CollectorImage:
Type: String
Default: 'amazon/aws-otel-collector:latest'
pingleig marked this conversation as resolved.
Show resolved Hide resolved
Conditions:
CreateRoles: !Equals
- !Ref CreateIAMRoles
- 'True'
DefaultTaskRole: !Equals
- !Ref TaskRoleArn
- Default
DefaultExecutionRole: !Equals
- !Ref ExecutionRoleArn
- Default
Resources:
ECSTaskDefinition:
Type: 'AWS::ECS::TaskDefinition'
Properties:
Family: !Sub 'adot-container-insights-prometheus-${ClusterName}'
TaskRoleArn: !If
- CreateRoles
- !GetAtt
- ECSTaskRole
- Arn
- !If
- DefaultTaskRole
- !Sub 'arn:aws:iam::${AWS::AccountId}:role/AWSOTelRole'
- !Ref TaskRoleArn
ExecutionRoleArn: !If
- CreateRoles
- !GetAtt
- ECSExecutionRole
- Arn
- !If
- DefaultExecutionRole
- !Sub 'arn:aws:iam::${AWS::AccountId}:role/AWSOTelExecutionRole'
- !Ref ExecutionRoleArn
NetworkMode: bridge
ContainerDefinitions:
- LogConfiguration:
LogDriver: awslogs
Options:
awslogs-create-group: 'True'
awslogs-group: !Sub '/ecs/aws-otel-collector/${ClusterName}'
awslogs-region: !Ref 'AWS::Region'
awslogs-stream-prefix: ecs
Image: !Ref CollectorImage
Name: aws-collector
Secrets:
- Name: AOT_CONFIG_CONTENT
ValueFrom: !Sub 'AmazonCloudWatch-AOC-ECS-Prometheus-${ClusterName}'
Memory: '512'
RequiresCompatibilities:
- EC2
Cpu: '256'
ECSReplicaService:
Type: 'AWS::ECS::Service'
Properties:
TaskDefinition: !Ref ECSTaskDefinition
Cluster: !Ref ClusterName
LaunchType: EC2
SchedulingStrategy: REPLICA
DesiredCount: 1
ServiceName: adot-container-insights-prometheus-service
ECSTaskRole:
Type: 'AWS::IAM::Role'
Condition: CreateRoles
Properties:
Description: Allows ECS tasks to call AWS services on your behalf.
AssumeRolePolicyDocument:
Version: 2012-10-17
Statement:
- Sid: ''
Effect: Allow
Principal:
Service: ecs-tasks.amazonaws.com
Action: 'sts:AssumeRole'
Policies:
- PolicyName: AWSOpenTelemetryPolicy
PolicyDocument:
Version: 2012-10-17
Statement:
- Effect: Allow
Action:
- 'logs:PutLogEvents'
- 'logs:CreateLogGroup'
- 'logs:CreateLogStream'
- 'logs:DescribeLogStreams'
- 'logs:DescribeLogGroups'
- 'xray:PutTraceSegments'
- 'xray:PutTelemetryRecords'
- 'xray:GetSamplingRules'
- 'xray:GetSamplingTargets'
- 'xray:GetSamplingStatisticSummaries'
- 'ssm:GetParameters'
Resource: '*'
- PolicyName: AWSOpenTelemetryPolicyPrometheusECSDiscovery
PolicyDocument:
Version: 2012-10-17
Statement:
- Effect: Allow
Action:
- 'ec2:DescribeInstances'
- 'ecs:ListTasks'
- 'ecs:ListServices'
- 'ecs:DescribeContainerInstances'
- 'ecs:DescribeServices'
- 'ecs:DescribeTasks'
- 'ecs:DescribeTaskDefinition'
Resource: '*'
RoleName: AWSOTelRolePrometheusECS
ECSExecutionRole:
Type: 'AWS::IAM::Role'
Condition: CreateRoles
Properties:
Description: >-
Allows ECS container agent makes calls to the Amazon ECS API on your
behalf.
AssumeRolePolicyDocument:
Version: 2012-10-17
Statement:
- Sid: ''
Effect: Allow
Principal:
Service: ecs-tasks.amazonaws.com
Action: 'sts:AssumeRole'
ManagedPolicyArns:
- 'arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy'
- 'arn:aws:iam::aws:policy/CloudWatchLogsFullAccess'
- 'arn:aws:iam::aws:policy/AmazonSSMReadOnlyAccess'
RoleName: AWSOTelExecutionRolePrometheusECS
AocConfigSSMParameter:
Type: AWS::SSM::Parameter
Properties:
Name: !Sub 'AmazonCloudWatch-AOC-ECS-Prometheus-${ClusterName}'
Type: String
Tier: Intelligent-Tiering
Description: !Sub 'CWAgent SSM Parameter with App Mesh and Java EMF Definition for ECS Cluster: ${ClusterName}'
Value: !Sub |-

extensions:
ecs_observer:
cluster_name: '${ClusterName}'
cluster_region: '${AWS::Region}'
result_file: '/etc/ecs_sd_targets.yaml'
refresh_interval: 60s
job_label_name: prometheus_job
# nginx and nginx plus
# https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/ContainerInsights-Prometheus-Setup-nginx-ecs.html
services:
- name_pattern: '^.*nginx-service$'
metrics_ports:
- 9113
job_name: nginx-prometheus-exporter
# jmx
docker_labels:
- port_label: 'ECS_PROMETHEUS_EXPORTER_PORT'
# App Mesh, port and metrics are from envoy sidecar
task_definitions:
- arn_pattern: '.*:task-definition/.*-ColorTeller-(white):[0-9]+'
metrics_path: '/stats/prometheus'
metrics_ports:
- 9901
job_name: ecs-appmesh-color
- arn_pattern: '.*:task-definition/.*-ColorGateway:[0-9]+'
metrics_path: '/stats/prometheus'
metrics_ports:
- 9901
job_name: ecs-appmesh-color

receivers:
prometheus:
config:
scrape_configs:
- job_name: "ecssd"
file_sd_configs:
- files:
- '/etc/ecs_sd_targets.yaml'
relabel_configs:
- source_labels: [ __meta_ecs_cluster_name ] # ClusterName
action: replace
target_label: ClusterName
- source_labels: [ __meta_ecs_service_name ] # ServiceName
action: replace
target_label: ServiceName
- source_labels: [ __meta_ecs_task_definition_family ] # TaskDefinitionFamily
action: replace
target_label: TaskDefinitionFamily
- source_labels: [ __meta_ecs_container_name ] # container_name
action: replace
target_label: container_name
- action: labelmap # docker labels
regex: ^__meta_ecs_container_labels_(.+)$
replacement: '$$1'

exporters:
awsemf:
region: '${AWS::Region}'
namespace: ECS/ContainerInsights/Prometheus
log_group_name: "/aws/ecs/containerinsights/${ClusterName}/prometheus"
dimension_rollup_option: NoDimensionRollup
metric_declarations:
# nginx
- dimensions: [ [ ClusterName, TaskDefinitionFamily, ServiceName ] ]
label_matchers:
- label_names:
- ServiceName
regex: '^.*nginx-service$'
metric_name_selectors:
- "^nginx_.*$"
# nginx plus
- dimensions: [ [ ClusterName, TaskDefinitionFamily, ServiceName ] ]
label_matchers:
- label_names:
- ServiceName
regex: '^.*nginx-plus-service$'
metric_name_selectors:
- "^nginxplus_connections_accepted$"
- "^nginxplus_connections_active$"
- "^nginxplus_connections_dropped$"
- "^nginxplus_connections_idle$"
- "^nginxplus_http_requests_total$"
- "^nginxplus_ssl_handshakes$"
- "^nginxplus_ssl_handshakes_failed$"
- "^nginxplus_up$"
- "^nginxplus_upstream_server_health_checks_fails$"
- dimensions: [ [ ClusterName, TaskDefinitionFamily, ServiceName, upstream ] ]
label_matchers:
- label_names:
- ServiceName
regex: '^.*nginx-plus-service$'
metric_name_selectors:
- "^nginxplus_upstream_server_response_time$"
- dimensions: [ [ ClusterName, TaskDefinitionFamily, ServiceName, code ] ]
label_matchers:
- label_names:
- ServiceName
regex: '^.*nginx-plus-service$'
metric_name_selectors:
- "^nginxplus_upstream_server_responses$"
- "^nginxplus_server_zone_responses$"
# jmx
- dimensions: [ [ ClusterName, TaskDefinitionFamily, area ] ]
label_matchers:
- label_names:
- Java_EMF_Metrics
regex: ^true$
metric_name_selectors:
- "^jvm_memory_bytes_used$"
- dimensions: [ [ ClusterName, TaskDefinitionFamily, pool ] ]
label_matchers:
- label_names:
- Java_EMF_Metrics
regex: ^true$
metric_name_selectors:
- "^jvm_memory_pool_bytes_used$"
- dimensions: [ [ ClusterName, TaskDefinitionFamily ] ]
label_matchers:
- label_names:
- Java_EMF_Metrics
regex: ^true$
metric_name_selectors:
- "^jvm_threads_(current|daemon)$"
- "^jvm_classes_loaded$"
- "^java_lang_operatingsystem_(freephysicalmemorysize|totalphysicalmemorysize|freeswapspacesize|totalswapspacesize|systemcpuload|processcpuload|availableprocessors|openfiledescriptorcount)$"
- "^catalina_manager_(rejectedsessions|activesessions)$"
- "^jvm_gc_collection_seconds_(count|sum)$"
- "^catalina_globalrequestprocessor_(bytesreceived|bytessent|requestcount|errorcount|processingtime)$"
# AppMesh envoy
- dimensions: [ [ "ClusterName","TaskDefinitionFamily" ] ]
label_matchers:
- label_names:
- container_name
regex: ^envoy$
metric_name_selectors:
- "^envoy_http_downstream_rq_(total|xx)$"
- "^envoy_cluster_upstream_cx_(r|t)x_bytes_total$"
- "^envoy_cluster_membership_(healthy|total)$"
- "^envoy_server_memory_(allocated|heap_size)$"
- "^envoy_cluster_upstream_cx_(connect_timeout|destroy_local_with_active_rq)$"
- "^envoy_cluster_upstream_rq_(pending_failure_eject|pending_overflow|timeout|per_try_timeout|rx_reset|maintenance_mode)$"
- "^envoy_http_downstream_cx_destroy_remote_active_rq$"
- "^envoy_cluster_upstream_flow_control_(paused_reading_total|resumed_reading_total|backed_up_total|drained_total)$"
- "^envoy_cluster_upstream_rq_retry$"
- "^envoy_cluster_upstream_rq_retry_(success|overflow)$"
- "^envoy_server_(version|uptime|live)$"
- dimensions: [ [ "ClusterName","TaskDefinitionFamily","envoy_http_conn_manager_prefix","envoy_response_code_class" ] ]
label_matchers:
- label_names:
- container_name
regex: ^envoy$
metric_name_selectors:
- "^envoy_http_downstream_rq_xx$"


service:
extensions: [ ecs_observer ]
pipelines:
metrics:
receivers: [ prometheus ]
exporters: [ awsemf ]

Loading