From 5c8ca620f6f1f0979845dccd8e6ce1ef7af621db Mon Sep 17 00:00:00 2001 From: Ian Kerins Date: Mon, 7 Oct 2024 01:28:17 -0400 Subject: [PATCH] Overhaul all metrics - Fix names to comply with the [official guidelines](https://prometheus.io/docs/practices/naming/#metric-and-label-naming) and to better mirror the names of similar timeseries from the much-more-popular cAdvisor, when reasonable. And don't use the word "svc" to refer to tasks, as it is just not correct. - Improve `help`s. - Stop reporting per-CPU usage metrics. They're empirically only available in Fargate, but the current collector implementation assumes they're available everywhere. (They were previously available in EC2 but that stopped being the case when ecs-agent was upgraded to use cgroups v2.) Given that it's not clear why per-CPU numbers are useful in general, remove them everywhere instead of exposing disjoint metrics for Fargate and EC2. This will also prevent Fargate from potentially spontaneously breaking in the same way EC2 did. - Fix task-level memory limit to actually be in bytes (it previously said "bytes" but was in fact MiB). - Correctly report container-level memory limits in all cases - the stats `limit` is nonsense if, as in Fargate, there is no container-level limit configured in the task definition. While the right data for all cases is hiding in the stats response somewhere, I have instead opted to cut out the stats middleman and use the task metadata directly to drive this metric. I think it's substantially less likely that ECS fails to effect the configured limits upon cgroups correctly than it is that we fail to interrogate cgroups output correctly: the latter empirically happens with some frequency :^). - Add metrics concerning Fargate ephemeral storage and task image pull timestamps. - Remove the `task_arn` label on task-level metrics, as it does not distinctly identify anything within the instance - the instance is the task! 
Users who need the task ARN in their timeseries labels can add it by joining to `ecs_task_metadata_info`. I have tested these changes in both Fargate and EC2 and they look correct to me. Signed-off-by: Ian Kerins --- README.md | 241 +++++++++++++++++++++++--------------- ecscollector/collector.go | 235 +++++++++++++++++++++++-------------- 2 files changed, 295 insertions(+), 181 deletions(-) diff --git a/README.md b/README.md index 740b896..0e1731c 100644 --- a/README.md +++ b/README.md @@ -37,52 +37,79 @@ from App Runner services. ## Labels -* **container**: Container associated with a metric. -* **cpu**: Available to CPU metrics, helps to breakdown metrics by CPU. -* **device**: Network interface device associated with the metric. Only +### On task-level metrics +None. + +### On container-level metrics + +* **container_name**: Name of the container (as in the ECS task definition) associated with a metric. +* **interface**: Network interface device associated with the metric. Only available for several network metrics. ## Example output ``` -# HELP ecs_cpu_seconds_total Total CPU usage in seconds. -# TYPE ecs_cpu_seconds_total counter -ecs_cpu_seconds_total{container="ecs-metadata-proxy",cpu="0"} 1.746774278e+08 -ecs_cpu_seconds_total{container="ecs-metadata-proxy",cpu="1"} 1.7417992266e+08 -# HELP ecs_memory_bytes Memory usage in bytes. -# TYPE ecs_memory_bytes gauge -ecs_memory_bytes{container="ecs-metadata-proxy"} 4.440064e+06 -# HELP ecs_memory_limit_bytes Memory limit in bytes. -# TYPE ecs_memory_limit_bytes gauge -ecs_memory_limit_bytes{container="ecs-metadata-proxy"} 9.223372036854772e+18 -# HELP ecs_memory_max_bytes Maximum memory usage in bytes. -# TYPE ecs_memory_max_bytes gauge -ecs_memory_max_bytes{container="ecs-metadata-proxy"} 9.023488e+06 -# HELP ecs_network_receive_bytes_total Network received in bytes. 
-# TYPE ecs_network_receive_bytes_total counter -ecs_network_receive_bytes_total{container="ecs-metadata-proxy",device="eth1"} 4.2851757e+07 -# HELP ecs_network_receive_dropped_total Network packets dropped in receiving. -# TYPE ecs_network_receive_dropped_total counter -ecs_network_receive_dropped_total{container="ecs-metadata-proxy",device="eth1"} 0 -# HELP ecs_network_receive_errors_total Network errors in receiving. -# TYPE ecs_network_receive_errors_total counter -ecs_network_receive_errors_total{container="ecs-metadata-proxy",device="eth1"} 0 -# HELP ecs_network_receive_packets_total Network packets received. -# TYPE ecs_network_receive_packets_total counter -ecs_network_receive_packets_total{container="ecs-metadata-proxy",device="eth1"} 516239 -# HELP ecs_network_transmit_bytes_total Network transmitted in bytes. -# TYPE ecs_network_transmit_bytes_total counter -ecs_network_transmit_bytes_total{container="ecs-metadata-proxy",device="eth1"} 1.28412758e+08 -# HELP ecs_network_transmit_dropped_total Network packets dropped in transmit. -# TYPE ecs_network_transmit_dropped_total counter -ecs_network_transmit_dropped_total{container="ecs-metadata-proxy",device="eth1"} 0 -# HELP ecs_network_transmit_errors_total Network errors in transmit. -# TYPE ecs_network_transmit_errors_total counter -ecs_network_transmit_errors_total{container="ecs-metadata-proxy",device="eth1"} 0 -# HELP ecs_network_transmit_packets_total Network packets transmitted. -# TYPE ecs_network_transmit_packets_total counter -ecs_network_transmit_packets_total{container="ecs-metadata-proxy",device="eth1"} 429472 -# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles. +# HELP ecs_container_cpu_usage_seconds_total Cumulative total container CPU usage in seconds. 
+# TYPE ecs_container_cpu_usage_seconds_total counter +ecs_container_cpu_usage_seconds_total{container_name="ecs-exporter"} 0.027095748000000003 +# HELP ecs_container_memory_limit_bytes Configured container memory limit in bytes, set from the container-level limit in the task definition if any, otherwise the task-level limit. +# TYPE ecs_container_memory_limit_bytes gauge +ecs_container_memory_limit_bytes{container_name="ecs-exporter"} 5.36870912e+08 +# HELP ecs_container_memory_page_cache_size_bytes Current container memory page cache size in bytes. This is not a subset of used bytes. +# TYPE ecs_container_memory_page_cache_size_bytes gauge +ecs_container_memory_page_cache_size_bytes{container_name="ecs-exporter"} 0 +# HELP ecs_container_memory_usage_bytes Current container memory usage in bytes. +# TYPE ecs_container_memory_usage_bytes gauge +ecs_container_memory_usage_bytes{container_name="ecs-exporter"} 4.452352e+06 +# HELP ecs_container_network_receive_bytes_total Cumulative total size of container network packets received in bytes. +# TYPE ecs_container_network_receive_bytes_total counter +ecs_container_network_receive_bytes_total{container_name="ecs-exporter",interface="eth1"} 1.1112267e+07 +# HELP ecs_container_network_receive_errors_total Cumulative total count of container network errors in receiving. +# TYPE ecs_container_network_receive_errors_total counter +ecs_container_network_receive_errors_total{container_name="ecs-exporter",interface="eth1"} 0 +# HELP ecs_container_network_receive_packets_dropped_total Cumulative total count of container network packets dropped in receiving. +# TYPE ecs_container_network_receive_packets_dropped_total counter +ecs_container_network_receive_packets_dropped_total{container_name="ecs-exporter",interface="eth1"} 0 +# HELP ecs_container_network_receive_packets_total Cumulative total count of container network packets received. 
+# TYPE ecs_container_network_receive_packets_total counter +ecs_container_network_receive_packets_total{container_name="ecs-exporter",interface="eth1"} 8039 +# HELP ecs_container_network_transmit_bytes_total Cumulative total size of container network packets transmitted in bytes. +# TYPE ecs_container_network_transmit_bytes_total counter +ecs_container_network_transmit_bytes_total{container_name="ecs-exporter",interface="eth1"} 165338 +# HELP ecs_container_network_transmit_dropped_total Cumulative total count of container network packets dropped in transmit. +# TYPE ecs_container_network_transmit_dropped_total counter +ecs_container_network_transmit_dropped_total{container_name="ecs-exporter",interface="eth1"} 0 +# HELP ecs_container_network_transmit_errors_total Cumulative total count of container network errors in transmit. +# TYPE ecs_container_network_transmit_errors_total counter +ecs_container_network_transmit_errors_total{container_name="ecs-exporter",interface="eth1"} 0 +# HELP ecs_container_network_transmit_packets_total Cumulative total count of container network packets transmitted. +# TYPE ecs_container_network_transmit_packets_total counter +ecs_container_network_transmit_packets_total{container_name="ecs-exporter",interface="eth1"} 713 +# HELP ecs_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which ecs_exporter was built, and the goos and goarch for the build. +# TYPE ecs_exporter_build_info gauge +ecs_exporter_build_info{branch="",goarch="arm64",goos="linux",goversion="go1.23.2",revision="unknown",tags="unknown",version=""} 1 +# HELP ecs_task_cpu_limit_vcpus Configured task CPU limit in vCPUs (1 vCPU = 1024 CPU units). This is optional when running on EC2; if no limit is set, this metric has no value. +# TYPE ecs_task_cpu_limit_vcpus gauge +ecs_task_cpu_limit_vcpus 0.25 +# HELP ecs_task_ephemeral_storage_allocated_bytes Configured Fargate task ephemeral storage allocated size in bytes. 
+# TYPE ecs_task_ephemeral_storage_allocated_bytes gauge +ecs_task_ephemeral_storage_allocated_bytes 2.1491613696e+10 +# HELP ecs_task_ephemeral_storage_used_bytes Current Fargate task ephemeral storage usage in bytes. +# TYPE ecs_task_ephemeral_storage_used_bytes gauge +ecs_task_ephemeral_storage_used_bytes 3.7748736e+07 +# HELP ecs_task_image_pull_start_timestamp_seconds The time at which the task started pulling docker images for its containers. +# TYPE ecs_task_image_pull_start_timestamp_seconds gauge +ecs_task_image_pull_start_timestamp_seconds 1.7291179014941156e+09 +# HELP ecs_task_image_pull_stop_timestamp_seconds The time at which the task stopped (i.e. completed) pulling docker images for its containers. +# TYPE ecs_task_image_pull_stop_timestamp_seconds gauge +ecs_task_image_pull_stop_timestamp_seconds 1.7291179144469e+09 +# HELP ecs_task_memory_limit_bytes Configured task memory limit in bytes. This is optional when running on EC2; if no limit is set, this metric has no value. +# TYPE ecs_task_memory_limit_bytes gauge +ecs_task_memory_limit_bytes 5.36870912e+08 +# HELP ecs_task_metadata_info ECS task metadata, sourced from the task metadata endpoint version 4. +# TYPE ecs_task_metadata_info gauge +ecs_task_metadata_info{availability_zone="us-east-1a",cluster="arn:aws:ecs:us-east-1:829490980523:cluster/prom-ecs-exporter-sandbox",desired_status="RUNNING",family="prom-ecs-exporter-sandbox-isker-fargate",known_status="RUNNING",launch_type="FARGATE",pull_started_at="2024-10-16T22:31:41.494115693Z",pull_stopped_at="2024-10-16T22:31:54.446899683Z",revision="11",task_arn="arn:aws:ecs:us-east-1:829490980523:task/prom-ecs-exporter-sandbox/0c7f6b0414dc47d0a15019a099cd919b"} 1 +# HELP go_gc_duration_seconds A summary of the wall-time pause (stop-the-world) duration in garbage collection cycles. 
# TYPE go_gc_duration_seconds summary go_gc_duration_seconds{quantile="0"} 0 go_gc_duration_seconds{quantile="0.25"} 0 @@ -91,87 +118,117 @@ go_gc_duration_seconds{quantile="0.75"} 0 go_gc_duration_seconds{quantile="1"} 0 go_gc_duration_seconds_sum 0 go_gc_duration_seconds_count 0 +# HELP go_gc_gogc_percent Heap size target percentage configured by the user, otherwise 100. This value is set by the GOGC environment variable, and the runtime/debug.SetGCPercent function. Sourced from /gc/gogc:percent +# TYPE go_gc_gogc_percent gauge +go_gc_gogc_percent 100 +# HELP go_gc_gomemlimit_bytes Go runtime memory limit configured by the user, otherwise math.MaxInt64. This value is set by the GOMEMLIMIT environment variable, and the runtime/debug.SetMemoryLimit function. Sourced from /gc/gomemlimit:bytes +# TYPE go_gc_gomemlimit_bytes gauge +go_gc_gomemlimit_bytes 9.223372036854776e+18 # HELP go_goroutines Number of goroutines that currently exist. # TYPE go_goroutines gauge -go_goroutines 8 +go_goroutines 9 # HELP go_info Information about the Go environment. # TYPE go_info gauge -go_info{version="go1.16.3"} 1 -# HELP go_memstats_alloc_bytes Number of bytes allocated and still in use. +go_info{version="go1.23.2"} 1 +# HELP go_memstats_alloc_bytes Number of bytes allocated in heap and currently in use. Equals to /memory/classes/heap/objects:bytes. # TYPE go_memstats_alloc_bytes gauge -go_memstats_alloc_bytes 595760 -# HELP go_memstats_alloc_bytes_total Total number of bytes allocated, even if freed. +go_memstats_alloc_bytes 2.38768e+06 +# HELP go_memstats_alloc_bytes_total Total number of bytes allocated in heap until now, even if released already. Equals to /gc/heap/allocs:bytes. # TYPE go_memstats_alloc_bytes_total counter -go_memstats_alloc_bytes_total 595760 -# HELP go_memstats_buck_hash_sys_bytes Number of bytes used by the profiling bucket hash table. 
+go_memstats_alloc_bytes_total 2.38768e+06 +# HELP go_memstats_buck_hash_sys_bytes Number of bytes used by the profiling bucket hash table. Equals to /memory/classes/profiling/buckets:bytes. # TYPE go_memstats_buck_hash_sys_bytes gauge -go_memstats_buck_hash_sys_bytes 4092 -# HELP go_memstats_frees_total Total number of frees. +go_memstats_buck_hash_sys_bytes 4772 +# HELP go_memstats_frees_total Total number of heap objects frees. Equals to /gc/heap/frees:objects + /gc/heap/tiny/allocs:objects. # TYPE go_memstats_frees_total counter -go_memstats_frees_total 123 -# HELP go_memstats_gc_cpu_fraction The fraction of this program's available CPU time used by the GC since the program started. -# TYPE go_memstats_gc_cpu_fraction gauge -go_memstats_gc_cpu_fraction 0 -# HELP go_memstats_gc_sys_bytes Number of bytes used for garbage collection system metadata. +go_memstats_frees_total 237 +# HELP go_memstats_gc_sys_bytes Number of bytes used for garbage collection system metadata. Equals to /memory/classes/metadata/other:bytes. # TYPE go_memstats_gc_sys_bytes gauge -go_memstats_gc_sys_bytes 3.97448e+06 -# HELP go_memstats_heap_alloc_bytes Number of heap bytes allocated and still in use. +go_memstats_gc_sys_bytes 1.595176e+06 +# HELP go_memstats_heap_alloc_bytes Number of heap bytes allocated and currently in use, same as go_memstats_alloc_bytes. Equals to /memory/classes/heap/objects:bytes. # TYPE go_memstats_heap_alloc_bytes gauge -go_memstats_heap_alloc_bytes 595760 -# HELP go_memstats_heap_idle_bytes Number of heap bytes waiting to be used. +go_memstats_heap_alloc_bytes 2.38768e+06 +# HELP go_memstats_heap_idle_bytes Number of heap bytes waiting to be used. Equals to /memory/classes/heap/released:bytes + /memory/classes/heap/free:bytes. # TYPE go_memstats_heap_idle_bytes gauge -go_memstats_heap_idle_bytes 6.508544e+07 -# HELP go_memstats_heap_inuse_bytes Number of heap bytes that are in use. 
+go_memstats_heap_idle_bytes 3.801088e+06 +# HELP go_memstats_heap_inuse_bytes Number of heap bytes that are in use. Equals to /memory/classes/heap/objects:bytes + /memory/classes/heap/unused:bytes # TYPE go_memstats_heap_inuse_bytes gauge -go_memstats_heap_inuse_bytes 1.59744e+06 -# HELP go_memstats_heap_objects Number of allocated objects. +go_memstats_heap_inuse_bytes 4.030464e+06 +# HELP go_memstats_heap_objects Number of currently allocated objects. Equals to /gc/heap/objects:objects. # TYPE go_memstats_heap_objects gauge -go_memstats_heap_objects 2439 -# HELP go_memstats_heap_released_bytes Number of heap bytes released to OS. +go_memstats_heap_objects 13702 +# HELP go_memstats_heap_released_bytes Number of heap bytes released to OS. Equals to /memory/classes/heap/released:bytes. # TYPE go_memstats_heap_released_bytes gauge -go_memstats_heap_released_bytes 6.508544e+07 -# HELP go_memstats_heap_sys_bytes Number of heap bytes obtained from system. +go_memstats_heap_released_bytes 3.801088e+06 +# HELP go_memstats_heap_sys_bytes Number of heap bytes obtained from system. Equals to /memory/classes/heap/objects:bytes + /memory/classes/heap/unused:bytes + /memory/classes/heap/released:bytes + /memory/classes/heap/free:bytes. # TYPE go_memstats_heap_sys_bytes gauge -go_memstats_heap_sys_bytes 6.668288e+07 +go_memstats_heap_sys_bytes 7.831552e+06 # HELP go_memstats_last_gc_time_seconds Number of seconds since 1970 of last garbage collection. # TYPE go_memstats_last_gc_time_seconds gauge go_memstats_last_gc_time_seconds 0 -# HELP go_memstats_lookups_total Total number of pointer lookups. -# TYPE go_memstats_lookups_total counter -go_memstats_lookups_total 0 -# HELP go_memstats_mallocs_total Total number of mallocs. +# HELP go_memstats_mallocs_total Total number of heap objects allocated, both live and gc-ed. Semantically a counter version for go_memstats_heap_objects gauge. Equals to /gc/heap/allocs:objects + /gc/heap/tiny/allocs:objects. 
# TYPE go_memstats_mallocs_total counter -go_memstats_mallocs_total 2562 -# HELP go_memstats_mcache_inuse_bytes Number of bytes in use by mcache structures. +go_memstats_mallocs_total 13939 +# HELP go_memstats_mcache_inuse_bytes Number of bytes in use by mcache structures. Equals to /memory/classes/metadata/mcache/inuse:bytes. # TYPE go_memstats_mcache_inuse_bytes gauge -go_memstats_mcache_inuse_bytes 9600 -# HELP go_memstats_mcache_sys_bytes Number of bytes used for mcache structures obtained from system. +go_memstats_mcache_inuse_bytes 2400 +# HELP go_memstats_mcache_sys_bytes Number of bytes used for mcache structures obtained from system. Equals to /memory/classes/metadata/mcache/inuse:bytes + /memory/classes/metadata/mcache/free:bytes. # TYPE go_memstats_mcache_sys_bytes gauge -go_memstats_mcache_sys_bytes 16384 -# HELP go_memstats_mspan_inuse_bytes Number of bytes in use by mspan structures. +go_memstats_mcache_sys_bytes 15600 +# HELP go_memstats_mspan_inuse_bytes Number of bytes in use by mspan structures. Equals to /memory/classes/metadata/mspan/inuse:bytes. # TYPE go_memstats_mspan_inuse_bytes gauge -go_memstats_mspan_inuse_bytes 37400 -# HELP go_memstats_mspan_sys_bytes Number of bytes used for mspan structures obtained from system. +go_memstats_mspan_inuse_bytes 74720 +# HELP go_memstats_mspan_sys_bytes Number of bytes used for mspan structures obtained from system. Equals to /memory/classes/metadata/mspan/inuse:bytes + /memory/classes/metadata/mspan/free:bytes. # TYPE go_memstats_mspan_sys_bytes gauge -go_memstats_mspan_sys_bytes 49152 -# HELP go_memstats_next_gc_bytes Number of heap bytes when next garbage collection will take place. +go_memstats_mspan_sys_bytes 81600 +# HELP go_memstats_next_gc_bytes Number of heap bytes when next garbage collection will take place. Equals to /gc/heap/goal:bytes. 
# TYPE go_memstats_next_gc_bytes gauge -go_memstats_next_gc_bytes 4.473924e+06 -# HELP go_memstats_other_sys_bytes Number of bytes used for other system allocations. +go_memstats_next_gc_bytes 4.194304e+06 +# HELP go_memstats_other_sys_bytes Number of bytes used for other system allocations. Equals to /memory/classes/other:bytes. # TYPE go_memstats_other_sys_bytes gauge -go_memstats_other_sys_bytes 497348 -# HELP go_memstats_stack_inuse_bytes Number of bytes in use by the stack allocator. +go_memstats_other_sys_bytes 587412 +# HELP go_memstats_stack_inuse_bytes Number of bytes obtained from system for stack allocator in non-CGO environments. Equals to /memory/classes/heap/stacks:bytes. # TYPE go_memstats_stack_inuse_bytes gauge -go_memstats_stack_inuse_bytes 425984 -# HELP go_memstats_stack_sys_bytes Number of bytes obtained from system for stack allocator. +go_memstats_stack_inuse_bytes 524288 +# HELP go_memstats_stack_sys_bytes Number of bytes obtained from system for stack allocator. Equals to /memory/classes/heap/stacks:bytes + /memory/classes/os-stacks:bytes. # TYPE go_memstats_stack_sys_bytes gauge -go_memstats_stack_sys_bytes 425984 -# HELP go_memstats_sys_bytes Number of bytes obtained from system. +go_memstats_stack_sys_bytes 524288 +# HELP go_memstats_sys_bytes Number of bytes obtained from system. Equals to /memory/classes/total:byte. # TYPE go_memstats_sys_bytes gauge -go_memstats_sys_bytes 7.165032e+07 +go_memstats_sys_bytes 1.06404e+07 +# HELP go_sched_gomaxprocs_threads The current runtime.GOMAXPROCS setting, or the number of operating system threads that can execute user-level Go code simultaneously. Sourced from /sched/gomaxprocs:threads +# TYPE go_sched_gomaxprocs_threads gauge +go_sched_gomaxprocs_threads 2 # HELP go_threads Number of OS threads created. # TYPE go_threads gauge -go_threads 7 +go_threads 5 +# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds. 
+# TYPE process_cpu_seconds_total counter +process_cpu_seconds_total 0.02 +# HELP process_max_fds Maximum number of open file descriptors. +# TYPE process_max_fds gauge +process_max_fds 65535 +# HELP process_network_receive_bytes_total Number of bytes received by the process over the network. +# TYPE process_network_receive_bytes_total counter +process_network_receive_bytes_total 1.0833544e+07 +# HELP process_network_transmit_bytes_total Number of bytes sent by the process over the network. +# TYPE process_network_transmit_bytes_total counter +process_network_transmit_bytes_total 153323 +# HELP process_open_fds Number of open file descriptors. +# TYPE process_open_fds gauge +process_open_fds 8 +# HELP process_resident_memory_bytes Resident memory size in bytes. +# TYPE process_resident_memory_bytes gauge +process_resident_memory_bytes 1.6584704e+07 +# HELP process_start_time_seconds Start time of the process since unix epoch in seconds. +# TYPE process_start_time_seconds gauge +process_start_time_seconds 1.72911791496e+09 +# HELP process_virtual_memory_bytes Virtual memory size in bytes. +# TYPE process_virtual_memory_bytes gauge +process_virtual_memory_bytes 1.269272576e+09 +# HELP process_virtual_memory_max_bytes Maximum amount of virtual memory available in bytes. +# TYPE process_virtual_memory_max_bytes gauge +process_virtual_memory_max_bytes 1.8446744073709552e+19 # HELP promhttp_metric_handler_requests_in_flight Current number of scrapes being served. # TYPE promhttp_metric_handler_requests_in_flight gauge promhttp_metric_handler_requests_in_flight 1 diff --git a/ecscollector/collector.go b/ecscollector/collector.go index df8d9c5..aa34777 100644 --- a/ecscollector/collector.go +++ b/ecscollector/collector.go @@ -17,7 +17,6 @@ package ecscollector import ( "context" - "fmt" "log/slog" "time" @@ -27,94 +26,116 @@ import ( // ECS cpu_stats are from upstream docker/moby. These values are in nanoseconds. 
// https://github.com/moby/moby/blob/49f021ebf00a76d74f5ce158244083e2dfba26fb/api/types/stats.go#L18-L40 -const nanoSeconds = 1.0e9 +const nanoseconds = 1 / 1.0e9 -var ( - metadataDesc = prometheus.NewDesc( - "ecs_metadata_info", - "ECS service metadata.", - metadataLabels, nil) - - svcCPULimitDesc = prometheus.NewDesc( - "ecs_svc_cpu_limit", - "Total CPU Limit.", - svcLabels, nil) +// Task definition memory parameters are defined in MiB, while Prometheus +// standard metrics use bytes. +const mebibytes = 1024 * 1024 - svcMemLimitDesc = prometheus.NewDesc( - "ecs_svc_memory_limit_bytes", - "Total MEM Limit in bytes.", - svcLabels, nil) +var ( + taskMetadataDesc = prometheus.NewDesc( + "ecs_task_metadata_info", + "ECS task metadata, sourced from the task metadata endpoint version 4.", + taskMetadataLabels, nil) + + taskCpuLimitDesc = prometheus.NewDesc( + "ecs_task_cpu_limit_vcpus", + "Configured task CPU limit in vCPUs (1 vCPU = 1024 CPU units). This is optional when running on EC2; if no limit is set, this metric has no value.", + taskLabels, nil) + + taskMemLimitDesc = prometheus.NewDesc( + "ecs_task_memory_limit_bytes", + "Configured task memory limit in bytes. 
This is optional when running on EC2; if no limit is set, this metric has no value.", + taskLabels, nil) + + taskEphemeralStorageUsedDesc = prometheus.NewDesc( + "ecs_task_ephemeral_storage_used_bytes", + "Current Fargate task ephemeral storage usage in bytes.", + taskLabels, nil) + + taskEphemeralStorageAllocatedDesc = prometheus.NewDesc( + "ecs_task_ephemeral_storage_allocated_bytes", + "Configured Fargate task ephemeral storage allocated size in bytes.", + taskLabels, nil) + + taskImagePullStartDesc = prometheus.NewDesc( + "ecs_task_image_pull_start_timestamp_seconds", + "The time at which the task started pulling docker images for its containers.", + taskLabels, nil) + + taskImagePullStopDesc = prometheus.NewDesc( + "ecs_task_image_pull_stop_timestamp_seconds", + "The time at which the task stopped (i.e. completed) pulling docker images for its containers.", + taskLabels, nil) cpuTotalDesc = prometheus.NewDesc( - "ecs_cpu_seconds_total", - "Total CPU usage in seconds.", - cpuLabels, nil) + "ecs_container_cpu_usage_seconds_total", + "Cumulative total container CPU usage in seconds.", + containerLabels, nil) memUsageDesc = prometheus.NewDesc( - "ecs_memory_bytes", - "Memory usage in bytes.", - labels, nil) + "ecs_container_memory_usage_bytes", + "Current container memory usage in bytes.", + containerLabels, nil) memLimitDesc = prometheus.NewDesc( - "ecs_memory_limit_bytes", - "Memory limit in bytes.", - labels, nil) + "ecs_container_memory_limit_bytes", + "Configured container memory limit in bytes, set from the container-level limit in the task definition if any, otherwise the task-level limit.", + containerLabels, nil) - memCacheUsageDesc = prometheus.NewDesc( - "ecs_memory_cache_usage", - "Memory cache usage in bytes.", - labels, nil) + memCacheSizeDesc = prometheus.NewDesc( + "ecs_container_memory_page_cache_size_bytes", + "Current container memory page cache size in bytes. 
This is not a subset of used bytes.", + containerLabels, nil) networkRxBytesDesc = prometheus.NewDesc( - "ecs_network_receive_bytes_total", - "Network received in bytes.", - networkLabels, nil) + "ecs_container_network_receive_bytes_total", + "Cumulative total size of container network packets received in bytes.", + containerNetworkLabels, nil) networkRxPacketsDesc = prometheus.NewDesc( - "ecs_network_receive_packets_total", - "Network packets received.", - networkLabels, nil) + "ecs_container_network_receive_packets_total", + "Cumulative total count of container network packets received.", + containerNetworkLabels, nil) networkRxDroppedDesc = prometheus.NewDesc( - "ecs_network_receive_dropped_total", - "Network packets dropped in receiving.", - networkLabels, nil) + "ecs_container_network_receive_packets_dropped_total", + "Cumulative total count of container network packets dropped in receiving.", + containerNetworkLabels, nil) networkRxErrorsDesc = prometheus.NewDesc( - "ecs_network_receive_errors_total", - "Network errors in receiving.", - networkLabels, nil) + "ecs_container_network_receive_errors_total", + "Cumulative total count of container network errors in receiving.", + containerNetworkLabels, nil) networkTxBytesDesc = prometheus.NewDesc( - "ecs_network_transmit_bytes_total", - "Network transmitted in bytes.", - networkLabels, nil) + "ecs_container_network_transmit_bytes_total", + "Cumulative total size of container network packets transmitted in bytes.", + containerNetworkLabels, nil) networkTxPacketsDesc = prometheus.NewDesc( - "ecs_network_transmit_packets_total", - "Network packets transmitted.", - networkLabels, nil) + "ecs_container_network_transmit_packets_total", + "Cumulative total count of container network packets transmitted.", + containerNetworkLabels, nil) networkTxDroppedDesc = prometheus.NewDesc( - "ecs_network_transmit_dropped_total", - "Network packets dropped in transmit.", - networkLabels, nil) + 
"ecs_container_network_transmit_dropped_total", + "Cumulative total count of container network packets dropped in transmit.", + containerNetworkLabels, nil) networkTxErrorsDesc = prometheus.NewDesc( - "ecs_network_transmit_errors_total", - "Network errors in transmit.", - networkLabels, nil) + "ecs_container_network_transmit_errors_total", + "Cumulative total count of container network errors in transmit.", + containerNetworkLabels, nil) ) -var labels = []string{ - "container", +var containerLabels = []string{ + "container_name", } -var svcLabels = []string{ - "task_arn", -} +var taskLabels = []string{} -var metadataLabels = []string{ +var taskMetadataLabels = []string{ "cluster", "task_arn", "family", @@ -127,14 +148,9 @@ var metadataLabels = []string{ "launch_type", } -var cpuLabels = append( - labels, - "cpu", -) - -var networkLabels = append( - labels, - "device", +var containerNetworkLabels = append( + containerLabels, + "interface", ) // NewCollector returns a new Collector that queries ECS metadata server @@ -149,10 +165,17 @@ type collector struct { } func (c *collector) Describe(ch chan<- *prometheus.Desc) { + ch <- taskMetadataDesc + ch <- taskCpuLimitDesc + ch <- taskMemLimitDesc + ch <- taskEphemeralStorageUsedDesc + ch <- taskEphemeralStorageAllocatedDesc + ch <- taskImagePullStartDesc + ch <- taskImagePullStopDesc ch <- cpuTotalDesc ch <- memUsageDesc ch <- memLimitDesc - ch <- memCacheUsageDesc + ch <- memCacheSizeDesc ch <- networkRxBytesDesc ch <- networkRxPacketsDesc ch <- networkRxDroppedDesc @@ -173,7 +196,7 @@ func (c *collector) Collect(ch chan<- prometheus.Metric) { c.logger.Debug("Got ECS task metadata response", "stats", metadata) ch <- prometheus.MustNewConstMetric( - metadataDesc, + taskMetadataDesc, prometheus.GaugeValue, 1.0, metadata.Cluster, @@ -193,22 +216,48 @@ func (c *collector) Collect(ch chan<- prometheus.Metric) { if metadata.Limits != nil { if metadata.Limits.CPU != nil { ch <- prometheus.MustNewConstMetric( - svcCPULimitDesc, 
+ taskCpuLimitDesc, prometheus.GaugeValue, *metadata.Limits.CPU, - metadata.TaskARN, ) } if metadata.Limits.Memory != nil { ch <- prometheus.MustNewConstMetric( - svcMemLimitDesc, + taskMemLimitDesc, prometheus.GaugeValue, - float64(*metadata.Limits.Memory), - metadata.TaskARN, + float64(*metadata.Limits.Memory*mebibytes), ) } } + if metadata.EphemeralStorageMetrics != nil { + ch <- prometheus.MustNewConstMetric( + taskEphemeralStorageUsedDesc, + prometheus.GaugeValue, + float64(metadata.EphemeralStorageMetrics.UtilizedMiBs*mebibytes), + ) + ch <- prometheus.MustNewConstMetric( + taskEphemeralStorageAllocatedDesc, + prometheus.GaugeValue, + float64(metadata.EphemeralStorageMetrics.ReservedMiBs*mebibytes), + ) + } + + if metadata.PullStartedAt != nil { + ch <- prometheus.MustNewConstMetric( + taskImagePullStartDesc, + prometheus.GaugeValue, + float64(metadata.PullStartedAt.UnixNano())*nanoseconds, + ) + } + if metadata.PullStoppedAt != nil { + ch <- prometheus.MustNewConstMetric( + taskImagePullStopDesc, + prometheus.GaugeValue, + float64(metadata.PullStoppedAt.UnixNano())*nanoseconds, + ) + } + stats, err := c.client.RetrieveTaskStats(ctx) if err != nil { c.logger.Debug("Failed to retrieve container stats", "error", err) @@ -223,41 +272,49 @@ func (c *collector) Collect(ch chan<- prometheus.Metric) { continue } - labelVals := []string{ + containerLabelVals := []string{ container.Name, } - for i, cpuUsage := range s.CPUStats.CPUUsage.PercpuUsage { - cpu := fmt.Sprintf("%d", i) - ch <- prometheus.MustNewConstMetric( - cpuTotalDesc, - prometheus.CounterValue, - float64(cpuUsage)/nanoSeconds, - append(labelVals, cpu)..., - ) - } + ch <- prometheus.MustNewConstMetric( + cpuTotalDesc, + prometheus.CounterValue, + float64(s.CPUStats.CPUUsage.TotalUsage)*nanoseconds, + containerLabelVals..., + ) cacheValue := 0.0 if val, ok := s.MemoryStats.Stats["cache"]; ok { cacheValue = float64(val) } + // Report the container's memory limit as its own, if any, otherwise the + // 
task's limit. This is correct in that this is the precise logic used + // to configure the cgroups limit for the container. + var containerMemoryLimitMib int64 + if container.Limits.Memory != nil { + containerMemoryLimitMib = *container.Limits.Memory + } else { + // This must be set if the container limit is not set, and thus is + // safe to dereference. + containerMemoryLimitMib = *metadata.Limits.Memory + } for desc, value := range map[*prometheus.Desc]float64{ - memUsageDesc: float64(s.MemoryStats.Usage), - memLimitDesc: float64(s.MemoryStats.Limit), - memCacheUsageDesc: cacheValue, + memUsageDesc: float64(s.MemoryStats.Usage), + memLimitDesc: float64(containerMemoryLimitMib * mebibytes), + memCacheSizeDesc: cacheValue, } { ch <- prometheus.MustNewConstMetric( desc, prometheus.GaugeValue, value, - labelVals..., + containerLabelVals..., ) } // Network metrics per interface. for iface, netStats := range s.Networks { - networkLabelVals := append(labelVals, iface) + networkLabelVals := append(containerLabelVals, iface) for desc, value := range map[*prometheus.Desc]float64{ networkRxBytesDesc: float64(netStats.RxBytes),