diff --git a/README.md b/README.md index 740b896..0e1731c 100644 --- a/README.md +++ b/README.md @@ -37,52 +37,79 @@ from App Runner services. ## Labels -* **container**: Container associated with a metric. -* **cpu**: Available to CPU metrics, helps to breakdown metrics by CPU. -* **device**: Network interface device associated with the metric. Only +### On task-level metrics +None. + +### On container-level metrics + +* **container_name**: Name of the container (as in the ECS task definition) associated with a metric. +* **interface**: Network interface device associated with the metric. Only available for several network metrics. ## Example output ``` -# HELP ecs_cpu_seconds_total Total CPU usage in seconds. -# TYPE ecs_cpu_seconds_total counter -ecs_cpu_seconds_total{container="ecs-metadata-proxy",cpu="0"} 1.746774278e+08 -ecs_cpu_seconds_total{container="ecs-metadata-proxy",cpu="1"} 1.7417992266e+08 -# HELP ecs_memory_bytes Memory usage in bytes. -# TYPE ecs_memory_bytes gauge -ecs_memory_bytes{container="ecs-metadata-proxy"} 4.440064e+06 -# HELP ecs_memory_limit_bytes Memory limit in bytes. -# TYPE ecs_memory_limit_bytes gauge -ecs_memory_limit_bytes{container="ecs-metadata-proxy"} 9.223372036854772e+18 -# HELP ecs_memory_max_bytes Maximum memory usage in bytes. -# TYPE ecs_memory_max_bytes gauge -ecs_memory_max_bytes{container="ecs-metadata-proxy"} 9.023488e+06 -# HELP ecs_network_receive_bytes_total Network received in bytes. -# TYPE ecs_network_receive_bytes_total counter -ecs_network_receive_bytes_total{container="ecs-metadata-proxy",device="eth1"} 4.2851757e+07 -# HELP ecs_network_receive_dropped_total Network packets dropped in receiving. -# TYPE ecs_network_receive_dropped_total counter -ecs_network_receive_dropped_total{container="ecs-metadata-proxy",device="eth1"} 0 -# HELP ecs_network_receive_errors_total Network errors in receiving. 
-# TYPE ecs_network_receive_errors_total counter -ecs_network_receive_errors_total{container="ecs-metadata-proxy",device="eth1"} 0 -# HELP ecs_network_receive_packets_total Network packets received. -# TYPE ecs_network_receive_packets_total counter -ecs_network_receive_packets_total{container="ecs-metadata-proxy",device="eth1"} 516239 -# HELP ecs_network_transmit_bytes_total Network transmitted in bytes. -# TYPE ecs_network_transmit_bytes_total counter -ecs_network_transmit_bytes_total{container="ecs-metadata-proxy",device="eth1"} 1.28412758e+08 -# HELP ecs_network_transmit_dropped_total Network packets dropped in transmit. -# TYPE ecs_network_transmit_dropped_total counter -ecs_network_transmit_dropped_total{container="ecs-metadata-proxy",device="eth1"} 0 -# HELP ecs_network_transmit_errors_total Network errors in transmit. -# TYPE ecs_network_transmit_errors_total counter -ecs_network_transmit_errors_total{container="ecs-metadata-proxy",device="eth1"} 0 -# HELP ecs_network_transmit_packets_total Network packets transmitted. -# TYPE ecs_network_transmit_packets_total counter -ecs_network_transmit_packets_total{container="ecs-metadata-proxy",device="eth1"} 429472 -# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles. +# HELP ecs_container_cpu_usage_seconds_total Cumulative total container CPU usage in seconds. +# TYPE ecs_container_cpu_usage_seconds_total counter +ecs_container_cpu_usage_seconds_total{container_name="ecs-exporter"} 0.027095748000000003 +# HELP ecs_container_memory_limit_bytes Configured container memory limit in bytes, set from the container-level limit in the task definition if any, otherwise the task-level limit. +# TYPE ecs_container_memory_limit_bytes gauge +ecs_container_memory_limit_bytes{container_name="ecs-exporter"} 5.36870912e+08 +# HELP ecs_container_memory_page_cache_size_bytes Current container memory page cache size in bytes. This is not a subset of used bytes. 
+# TYPE ecs_container_memory_page_cache_size_bytes gauge +ecs_container_memory_page_cache_size_bytes{container_name="ecs-exporter"} 0 +# HELP ecs_container_memory_usage_bytes Current container memory usage in bytes. +# TYPE ecs_container_memory_usage_bytes gauge +ecs_container_memory_usage_bytes{container_name="ecs-exporter"} 4.452352e+06 +# HELP ecs_container_network_receive_bytes_total Cumulative total size of container network packets received in bytes. +# TYPE ecs_container_network_receive_bytes_total counter +ecs_container_network_receive_bytes_total{container_name="ecs-exporter",interface="eth1"} 1.1112267e+07 +# HELP ecs_container_network_receive_errors_total Cumulative total count of container network errors in receiving. +# TYPE ecs_container_network_receive_errors_total counter +ecs_container_network_receive_errors_total{container_name="ecs-exporter",interface="eth1"} 0 +# HELP ecs_container_network_receive_packets_dropped_total Cumulative total count of container network packets dropped in receiving. +# TYPE ecs_container_network_receive_packets_dropped_total counter +ecs_container_network_receive_packets_dropped_total{container_name="ecs-exporter",interface="eth1"} 0 +# HELP ecs_container_network_receive_packets_total Cumulative total count of container network packets received. +# TYPE ecs_container_network_receive_packets_total counter +ecs_container_network_receive_packets_total{container_name="ecs-exporter",interface="eth1"} 8039 +# HELP ecs_container_network_transmit_bytes_total Cumulative total size of container network packets transmitted in bytes. +# TYPE ecs_container_network_transmit_bytes_total counter +ecs_container_network_transmit_bytes_total{container_name="ecs-exporter",interface="eth1"} 165338 +# HELP ecs_container_network_transmit_dropped_total Cumulative total count of container network packets dropped in transmit. 
+# TYPE ecs_container_network_transmit_dropped_total counter +ecs_container_network_transmit_dropped_total{container_name="ecs-exporter",interface="eth1"} 0 +# HELP ecs_container_network_transmit_errors_total Cumulative total count of container network errors in transmit. +# TYPE ecs_container_network_transmit_errors_total counter +ecs_container_network_transmit_errors_total{container_name="ecs-exporter",interface="eth1"} 0 +# HELP ecs_container_network_transmit_packets_total Cumulative total count of container network packets transmitted. +# TYPE ecs_container_network_transmit_packets_total counter +ecs_container_network_transmit_packets_total{container_name="ecs-exporter",interface="eth1"} 713 +# HELP ecs_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which ecs_exporter was built, and the goos and goarch for the build. +# TYPE ecs_exporter_build_info gauge +ecs_exporter_build_info{branch="",goarch="arm64",goos="linux",goversion="go1.23.2",revision="unknown",tags="unknown",version=""} 1 +# HELP ecs_task_cpu_limit_vcpus Configured task CPU limit in vCPUs (1 vCPU = 1024 CPU units). This is optional when running on EC2; if no limit is set, this metric has no value. +# TYPE ecs_task_cpu_limit_vcpus gauge +ecs_task_cpu_limit_vcpus 0.25 +# HELP ecs_task_ephemeral_storage_allocated_bytes Configured Fargate task ephemeral storage allocated size in bytes. +# TYPE ecs_task_ephemeral_storage_allocated_bytes gauge +ecs_task_ephemeral_storage_allocated_bytes 2.1491613696e+10 +# HELP ecs_task_ephemeral_storage_used_bytes Current Fargate task ephemeral storage usage in bytes. +# TYPE ecs_task_ephemeral_storage_used_bytes gauge +ecs_task_ephemeral_storage_used_bytes 3.7748736e+07 +# HELP ecs_task_image_pull_start_timestamp_seconds The time at which the task started pulling docker images for its containers. 
+# TYPE ecs_task_image_pull_start_timestamp_seconds gauge +ecs_task_image_pull_start_timestamp_seconds 1.7291179014941156e+09 +# HELP ecs_task_image_pull_stop_timestamp_seconds The time at which the task stopped (i.e. completed) pulling docker images for its containers. +# TYPE ecs_task_image_pull_stop_timestamp_seconds gauge +ecs_task_image_pull_stop_timestamp_seconds 1.7291179144469e+09 +# HELP ecs_task_memory_limit_bytes Configured task memory limit in bytes. This is optional when running on EC2; if no limit is set, this metric has no value. +# TYPE ecs_task_memory_limit_bytes gauge +ecs_task_memory_limit_bytes 5.36870912e+08 +# HELP ecs_task_metadata_info ECS task metadata, sourced from the task metadata endpoint version 4. +# TYPE ecs_task_metadata_info gauge +ecs_task_metadata_info{availability_zone="us-east-1a",cluster="arn:aws:ecs:us-east-1:829490980523:cluster/prom-ecs-exporter-sandbox",desired_status="RUNNING",family="prom-ecs-exporter-sandbox-isker-fargate",known_status="RUNNING",launch_type="FARGATE",pull_started_at="2024-10-16T22:31:41.494115693Z",pull_stopped_at="2024-10-16T22:31:54.446899683Z",revision="11",task_arn="arn:aws:ecs:us-east-1:829490980523:task/prom-ecs-exporter-sandbox/0c7f6b0414dc47d0a15019a099cd919b"} 1 +# HELP go_gc_duration_seconds A summary of the wall-time pause (stop-the-world) duration in garbage collection cycles. # TYPE go_gc_duration_seconds summary go_gc_duration_seconds{quantile="0"} 0 go_gc_duration_seconds{quantile="0.25"} 0 @@ -91,87 +118,117 @@ go_gc_duration_seconds{quantile="0.75"} 0 go_gc_duration_seconds{quantile="1"} 0 go_gc_duration_seconds_sum 0 go_gc_duration_seconds_count 0 +# HELP go_gc_gogc_percent Heap size target percentage configured by the user, otherwise 100. This value is set by the GOGC environment variable, and the runtime/debug.SetGCPercent function. 
Sourced from /gc/gogc:percent +# TYPE go_gc_gogc_percent gauge +go_gc_gogc_percent 100 +# HELP go_gc_gomemlimit_bytes Go runtime memory limit configured by the user, otherwise math.MaxInt64. This value is set by the GOMEMLIMIT environment variable, and the runtime/debug.SetMemoryLimit function. Sourced from /gc/gomemlimit:bytes +# TYPE go_gc_gomemlimit_bytes gauge +go_gc_gomemlimit_bytes 9.223372036854776e+18 # HELP go_goroutines Number of goroutines that currently exist. # TYPE go_goroutines gauge -go_goroutines 8 +go_goroutines 9 # HELP go_info Information about the Go environment. # TYPE go_info gauge -go_info{version="go1.16.3"} 1 -# HELP go_memstats_alloc_bytes Number of bytes allocated and still in use. +go_info{version="go1.23.2"} 1 +# HELP go_memstats_alloc_bytes Number of bytes allocated in heap and currently in use. Equals to /memory/classes/heap/objects:bytes. # TYPE go_memstats_alloc_bytes gauge -go_memstats_alloc_bytes 595760 -# HELP go_memstats_alloc_bytes_total Total number of bytes allocated, even if freed. +go_memstats_alloc_bytes 2.38768e+06 +# HELP go_memstats_alloc_bytes_total Total number of bytes allocated in heap until now, even if released already. Equals to /gc/heap/allocs:bytes. # TYPE go_memstats_alloc_bytes_total counter -go_memstats_alloc_bytes_total 595760 -# HELP go_memstats_buck_hash_sys_bytes Number of bytes used by the profiling bucket hash table. +go_memstats_alloc_bytes_total 2.38768e+06 +# HELP go_memstats_buck_hash_sys_bytes Number of bytes used by the profiling bucket hash table. Equals to /memory/classes/profiling/buckets:bytes. # TYPE go_memstats_buck_hash_sys_bytes gauge -go_memstats_buck_hash_sys_bytes 4092 -# HELP go_memstats_frees_total Total number of frees. +go_memstats_buck_hash_sys_bytes 4772 +# HELP go_memstats_frees_total Total number of heap objects frees. Equals to /gc/heap/frees:objects + /gc/heap/tiny/allocs:objects. 
# TYPE go_memstats_frees_total counter -go_memstats_frees_total 123 -# HELP go_memstats_gc_cpu_fraction The fraction of this program's available CPU time used by the GC since the program started. -# TYPE go_memstats_gc_cpu_fraction gauge -go_memstats_gc_cpu_fraction 0 -# HELP go_memstats_gc_sys_bytes Number of bytes used for garbage collection system metadata. +go_memstats_frees_total 237 +# HELP go_memstats_gc_sys_bytes Number of bytes used for garbage collection system metadata. Equals to /memory/classes/metadata/other:bytes. # TYPE go_memstats_gc_sys_bytes gauge -go_memstats_gc_sys_bytes 3.97448e+06 -# HELP go_memstats_heap_alloc_bytes Number of heap bytes allocated and still in use. +go_memstats_gc_sys_bytes 1.595176e+06 +# HELP go_memstats_heap_alloc_bytes Number of heap bytes allocated and currently in use, same as go_memstats_alloc_bytes. Equals to /memory/classes/heap/objects:bytes. # TYPE go_memstats_heap_alloc_bytes gauge -go_memstats_heap_alloc_bytes 595760 -# HELP go_memstats_heap_idle_bytes Number of heap bytes waiting to be used. +go_memstats_heap_alloc_bytes 2.38768e+06 +# HELP go_memstats_heap_idle_bytes Number of heap bytes waiting to be used. Equals to /memory/classes/heap/released:bytes + /memory/classes/heap/free:bytes. # TYPE go_memstats_heap_idle_bytes gauge -go_memstats_heap_idle_bytes 6.508544e+07 -# HELP go_memstats_heap_inuse_bytes Number of heap bytes that are in use. +go_memstats_heap_idle_bytes 3.801088e+06 +# HELP go_memstats_heap_inuse_bytes Number of heap bytes that are in use. Equals to /memory/classes/heap/objects:bytes + /memory/classes/heap/unused:bytes # TYPE go_memstats_heap_inuse_bytes gauge -go_memstats_heap_inuse_bytes 1.59744e+06 -# HELP go_memstats_heap_objects Number of allocated objects. +go_memstats_heap_inuse_bytes 4.030464e+06 +# HELP go_memstats_heap_objects Number of currently allocated objects. Equals to /gc/heap/objects:objects. 
# TYPE go_memstats_heap_objects gauge -go_memstats_heap_objects 2439 -# HELP go_memstats_heap_released_bytes Number of heap bytes released to OS. +go_memstats_heap_objects 13702 +# HELP go_memstats_heap_released_bytes Number of heap bytes released to OS. Equals to /memory/classes/heap/released:bytes. # TYPE go_memstats_heap_released_bytes gauge -go_memstats_heap_released_bytes 6.508544e+07 -# HELP go_memstats_heap_sys_bytes Number of heap bytes obtained from system. +go_memstats_heap_released_bytes 3.801088e+06 +# HELP go_memstats_heap_sys_bytes Number of heap bytes obtained from system. Equals to /memory/classes/heap/objects:bytes + /memory/classes/heap/unused:bytes + /memory/classes/heap/released:bytes + /memory/classes/heap/free:bytes. # TYPE go_memstats_heap_sys_bytes gauge -go_memstats_heap_sys_bytes 6.668288e+07 +go_memstats_heap_sys_bytes 7.831552e+06 # HELP go_memstats_last_gc_time_seconds Number of seconds since 1970 of last garbage collection. # TYPE go_memstats_last_gc_time_seconds gauge go_memstats_last_gc_time_seconds 0 -# HELP go_memstats_lookups_total Total number of pointer lookups. -# TYPE go_memstats_lookups_total counter -go_memstats_lookups_total 0 -# HELP go_memstats_mallocs_total Total number of mallocs. +# HELP go_memstats_mallocs_total Total number of heap objects allocated, both live and gc-ed. Semantically a counter version for go_memstats_heap_objects gauge. Equals to /gc/heap/allocs:objects + /gc/heap/tiny/allocs:objects. # TYPE go_memstats_mallocs_total counter -go_memstats_mallocs_total 2562 -# HELP go_memstats_mcache_inuse_bytes Number of bytes in use by mcache structures. +go_memstats_mallocs_total 13939 +# HELP go_memstats_mcache_inuse_bytes Number of bytes in use by mcache structures. Equals to /memory/classes/metadata/mcache/inuse:bytes. # TYPE go_memstats_mcache_inuse_bytes gauge -go_memstats_mcache_inuse_bytes 9600 -# HELP go_memstats_mcache_sys_bytes Number of bytes used for mcache structures obtained from system. 
+go_memstats_mcache_inuse_bytes 2400 +# HELP go_memstats_mcache_sys_bytes Number of bytes used for mcache structures obtained from system. Equals to /memory/classes/metadata/mcache/inuse:bytes + /memory/classes/metadata/mcache/free:bytes. # TYPE go_memstats_mcache_sys_bytes gauge -go_memstats_mcache_sys_bytes 16384 -# HELP go_memstats_mspan_inuse_bytes Number of bytes in use by mspan structures. +go_memstats_mcache_sys_bytes 15600 +# HELP go_memstats_mspan_inuse_bytes Number of bytes in use by mspan structures. Equals to /memory/classes/metadata/mspan/inuse:bytes. # TYPE go_memstats_mspan_inuse_bytes gauge -go_memstats_mspan_inuse_bytes 37400 -# HELP go_memstats_mspan_sys_bytes Number of bytes used for mspan structures obtained from system. +go_memstats_mspan_inuse_bytes 74720 +# HELP go_memstats_mspan_sys_bytes Number of bytes used for mspan structures obtained from system. Equals to /memory/classes/metadata/mspan/inuse:bytes + /memory/classes/metadata/mspan/free:bytes. # TYPE go_memstats_mspan_sys_bytes gauge -go_memstats_mspan_sys_bytes 49152 -# HELP go_memstats_next_gc_bytes Number of heap bytes when next garbage collection will take place. +go_memstats_mspan_sys_bytes 81600 +# HELP go_memstats_next_gc_bytes Number of heap bytes when next garbage collection will take place. Equals to /gc/heap/goal:bytes. # TYPE go_memstats_next_gc_bytes gauge -go_memstats_next_gc_bytes 4.473924e+06 -# HELP go_memstats_other_sys_bytes Number of bytes used for other system allocations. +go_memstats_next_gc_bytes 4.194304e+06 +# HELP go_memstats_other_sys_bytes Number of bytes used for other system allocations. Equals to /memory/classes/other:bytes. # TYPE go_memstats_other_sys_bytes gauge -go_memstats_other_sys_bytes 497348 -# HELP go_memstats_stack_inuse_bytes Number of bytes in use by the stack allocator. +go_memstats_other_sys_bytes 587412 +# HELP go_memstats_stack_inuse_bytes Number of bytes obtained from system for stack allocator in non-CGO environments. 
Equals to /memory/classes/heap/stacks:bytes. # TYPE go_memstats_stack_inuse_bytes gauge -go_memstats_stack_inuse_bytes 425984 -# HELP go_memstats_stack_sys_bytes Number of bytes obtained from system for stack allocator. +go_memstats_stack_inuse_bytes 524288 +# HELP go_memstats_stack_sys_bytes Number of bytes obtained from system for stack allocator. Equals to /memory/classes/heap/stacks:bytes + /memory/classes/os-stacks:bytes. # TYPE go_memstats_stack_sys_bytes gauge -go_memstats_stack_sys_bytes 425984 -# HELP go_memstats_sys_bytes Number of bytes obtained from system. +go_memstats_stack_sys_bytes 524288 +# HELP go_memstats_sys_bytes Number of bytes obtained from system. Equals to /memory/classes/total:byte. # TYPE go_memstats_sys_bytes gauge -go_memstats_sys_bytes 7.165032e+07 +go_memstats_sys_bytes 1.06404e+07 +# HELP go_sched_gomaxprocs_threads The current runtime.GOMAXPROCS setting, or the number of operating system threads that can execute user-level Go code simultaneously. Sourced from /sched/gomaxprocs:threads +# TYPE go_sched_gomaxprocs_threads gauge +go_sched_gomaxprocs_threads 2 # HELP go_threads Number of OS threads created. # TYPE go_threads gauge -go_threads 7 +go_threads 5 +# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds. +# TYPE process_cpu_seconds_total counter +process_cpu_seconds_total 0.02 +# HELP process_max_fds Maximum number of open file descriptors. +# TYPE process_max_fds gauge +process_max_fds 65535 +# HELP process_network_receive_bytes_total Number of bytes received by the process over the network. +# TYPE process_network_receive_bytes_total counter +process_network_receive_bytes_total 1.0833544e+07 +# HELP process_network_transmit_bytes_total Number of bytes sent by the process over the network. +# TYPE process_network_transmit_bytes_total counter +process_network_transmit_bytes_total 153323 +# HELP process_open_fds Number of open file descriptors. 
+# TYPE process_open_fds gauge +process_open_fds 8 +# HELP process_resident_memory_bytes Resident memory size in bytes. +# TYPE process_resident_memory_bytes gauge +process_resident_memory_bytes 1.6584704e+07 +# HELP process_start_time_seconds Start time of the process since unix epoch in seconds. +# TYPE process_start_time_seconds gauge +process_start_time_seconds 1.72911791496e+09 +# HELP process_virtual_memory_bytes Virtual memory size in bytes. +# TYPE process_virtual_memory_bytes gauge +process_virtual_memory_bytes 1.269272576e+09 +# HELP process_virtual_memory_max_bytes Maximum amount of virtual memory available in bytes. +# TYPE process_virtual_memory_max_bytes gauge +process_virtual_memory_max_bytes 1.8446744073709552e+19 # HELP promhttp_metric_handler_requests_in_flight Current number of scrapes being served. # TYPE promhttp_metric_handler_requests_in_flight gauge promhttp_metric_handler_requests_in_flight 1 diff --git a/ecscollector/collector.go b/ecscollector/collector.go index df8d9c5..aa34777 100644 --- a/ecscollector/collector.go +++ b/ecscollector/collector.go @@ -17,7 +17,6 @@ package ecscollector import ( "context" - "fmt" "log/slog" "time" @@ -27,94 +26,116 @@ import ( // ECS cpu_stats are from upstream docker/moby. These values are in nanoseconds. // https://github.com/moby/moby/blob/49f021ebf00a76d74f5ce158244083e2dfba26fb/api/types/stats.go#L18-L40 -const nanoSeconds = 1.0e9 +const nanoseconds = 1 / 1.0e9 -var ( - metadataDesc = prometheus.NewDesc( - "ecs_metadata_info", - "ECS service metadata.", - metadataLabels, nil) - - svcCPULimitDesc = prometheus.NewDesc( - "ecs_svc_cpu_limit", - "Total CPU Limit.", - svcLabels, nil) +// Task definition memory parameters are defined in MiB, while Prometheus +// standard metrics use bytes. 
+const mebibytes = 1024 * 1024 - svcMemLimitDesc = prometheus.NewDesc( - "ecs_svc_memory_limit_bytes", - "Total MEM Limit in bytes.", - svcLabels, nil) +var ( + taskMetadataDesc = prometheus.NewDesc( + "ecs_task_metadata_info", + "ECS task metadata, sourced from the task metadata endpoint version 4.", + taskMetadataLabels, nil) + + taskCpuLimitDesc = prometheus.NewDesc( + "ecs_task_cpu_limit_vcpus", + "Configured task CPU limit in vCPUs (1 vCPU = 1024 CPU units). This is optional when running on EC2; if no limit is set, this metric has no value.", + taskLabels, nil) + + taskMemLimitDesc = prometheus.NewDesc( + "ecs_task_memory_limit_bytes", + "Configured task memory limit in bytes. This is optional when running on EC2; if no limit is set, this metric has no value.", + taskLabels, nil) + + taskEphemeralStorageUsedDesc = prometheus.NewDesc( + "ecs_task_ephemeral_storage_used_bytes", + "Current Fargate task ephemeral storage usage in bytes.", + taskLabels, nil) + + taskEphemeralStorageAllocatedDesc = prometheus.NewDesc( + "ecs_task_ephemeral_storage_allocated_bytes", + "Configured Fargate task ephemeral storage allocated size in bytes.", + taskLabels, nil) + + taskImagePullStartDesc = prometheus.NewDesc( + "ecs_task_image_pull_start_timestamp_seconds", + "The time at which the task started pulling docker images for its containers.", + taskLabels, nil) + + taskImagePullStopDesc = prometheus.NewDesc( + "ecs_task_image_pull_stop_timestamp_seconds", + "The time at which the task stopped (i.e. 
completed) pulling docker images for its containers.", + taskLabels, nil) cpuTotalDesc = prometheus.NewDesc( - "ecs_cpu_seconds_total", - "Total CPU usage in seconds.", - cpuLabels, nil) + "ecs_container_cpu_usage_seconds_total", + "Cumulative total container CPU usage in seconds.", + containerLabels, nil) memUsageDesc = prometheus.NewDesc( - "ecs_memory_bytes", - "Memory usage in bytes.", - labels, nil) + "ecs_container_memory_usage_bytes", + "Current container memory usage in bytes.", + containerLabels, nil) memLimitDesc = prometheus.NewDesc( - "ecs_memory_limit_bytes", - "Memory limit in bytes.", - labels, nil) + "ecs_container_memory_limit_bytes", + "Configured container memory limit in bytes, set from the container-level limit in the task definition if any, otherwise the task-level limit.", + containerLabels, nil) - memCacheUsageDesc = prometheus.NewDesc( - "ecs_memory_cache_usage", - "Memory cache usage in bytes.", - labels, nil) + memCacheSizeDesc = prometheus.NewDesc( + "ecs_container_memory_page_cache_size_bytes", + "Current container memory page cache size in bytes. 
This is not a subset of used bytes.", + containerLabels, nil) networkRxBytesDesc = prometheus.NewDesc( - "ecs_network_receive_bytes_total", - "Network received in bytes.", - networkLabels, nil) + "ecs_container_network_receive_bytes_total", + "Cumulative total size of container network packets received in bytes.", + containerNetworkLabels, nil) networkRxPacketsDesc = prometheus.NewDesc( - "ecs_network_receive_packets_total", - "Network packets received.", - networkLabels, nil) + "ecs_container_network_receive_packets_total", + "Cumulative total count of container network packets received.", + containerNetworkLabels, nil) networkRxDroppedDesc = prometheus.NewDesc( - "ecs_network_receive_dropped_total", - "Network packets dropped in receiving.", - networkLabels, nil) + "ecs_container_network_receive_packets_dropped_total", + "Cumulative total count of container network packets dropped in receiving.", + containerNetworkLabels, nil) networkRxErrorsDesc = prometheus.NewDesc( - "ecs_network_receive_errors_total", - "Network errors in receiving.", - networkLabels, nil) + "ecs_container_network_receive_errors_total", + "Cumulative total count of container network errors in receiving.", + containerNetworkLabels, nil) networkTxBytesDesc = prometheus.NewDesc( - "ecs_network_transmit_bytes_total", - "Network transmitted in bytes.", - networkLabels, nil) + "ecs_container_network_transmit_bytes_total", + "Cumulative total size of container network packets transmitted in bytes.", + containerNetworkLabels, nil) networkTxPacketsDesc = prometheus.NewDesc( - "ecs_network_transmit_packets_total", - "Network packets transmitted.", - networkLabels, nil) + "ecs_container_network_transmit_packets_total", + "Cumulative total count of container network packets transmitted.", + containerNetworkLabels, nil) networkTxDroppedDesc = prometheus.NewDesc( - "ecs_network_transmit_dropped_total", - "Network packets dropped in transmit.", - networkLabels, nil) + 
"ecs_container_network_transmit_dropped_total", + "Cumulative total count of container network packets dropped in transmit.", + containerNetworkLabels, nil) networkTxErrorsDesc = prometheus.NewDesc( - "ecs_network_transmit_errors_total", - "Network errors in transmit.", - networkLabels, nil) + "ecs_container_network_transmit_errors_total", + "Cumulative total count of container network errors in transmit.", + containerNetworkLabels, nil) ) -var labels = []string{ - "container", +var containerLabels = []string{ + "container_name", } -var svcLabels = []string{ - "task_arn", -} +var taskLabels = []string{} -var metadataLabels = []string{ +var taskMetadataLabels = []string{ "cluster", "task_arn", "family", @@ -127,14 +148,9 @@ var metadataLabels = []string{ "launch_type", } -var cpuLabels = append( - labels, - "cpu", -) - -var networkLabels = append( - labels, - "device", +var containerNetworkLabels = append( + containerLabels, + "interface", ) // NewCollector returns a new Collector that queries ECS metadata server @@ -149,10 +165,17 @@ type collector struct { } func (c *collector) Describe(ch chan<- *prometheus.Desc) { + ch <- taskMetadataDesc + ch <- taskCpuLimitDesc + ch <- taskMemLimitDesc + ch <- taskEphemeralStorageUsedDesc + ch <- taskEphemeralStorageAllocatedDesc + ch <- taskImagePullStartDesc + ch <- taskImagePullStopDesc ch <- cpuTotalDesc ch <- memUsageDesc ch <- memLimitDesc - ch <- memCacheUsageDesc + ch <- memCacheSizeDesc ch <- networkRxBytesDesc ch <- networkRxPacketsDesc ch <- networkRxDroppedDesc @@ -173,7 +196,7 @@ func (c *collector) Collect(ch chan<- prometheus.Metric) { c.logger.Debug("Got ECS task metadata response", "stats", metadata) ch <- prometheus.MustNewConstMetric( - metadataDesc, + taskMetadataDesc, prometheus.GaugeValue, 1.0, metadata.Cluster, @@ -193,22 +216,48 @@ func (c *collector) Collect(ch chan<- prometheus.Metric) { if metadata.Limits != nil { if metadata.Limits.CPU != nil { ch <- prometheus.MustNewConstMetric( - svcCPULimitDesc, 
+ taskCpuLimitDesc, prometheus.GaugeValue, *metadata.Limits.CPU, - metadata.TaskARN, ) } if metadata.Limits.Memory != nil { ch <- prometheus.MustNewConstMetric( - svcMemLimitDesc, + taskMemLimitDesc, prometheus.GaugeValue, - float64(*metadata.Limits.Memory), - metadata.TaskARN, + float64(*metadata.Limits.Memory*mebibytes), ) } } + if metadata.EphemeralStorageMetrics != nil { + ch <- prometheus.MustNewConstMetric( + taskEphemeralStorageUsedDesc, + prometheus.GaugeValue, + float64(metadata.EphemeralStorageMetrics.UtilizedMiBs*mebibytes), + ) + ch <- prometheus.MustNewConstMetric( + taskEphemeralStorageAllocatedDesc, + prometheus.GaugeValue, + float64(metadata.EphemeralStorageMetrics.ReservedMiBs*mebibytes), + ) + } + + if metadata.PullStartedAt != nil { + ch <- prometheus.MustNewConstMetric( + taskImagePullStartDesc, + prometheus.GaugeValue, + float64(metadata.PullStartedAt.UnixNano())*nanoseconds, + ) + } + if metadata.PullStoppedAt != nil { + ch <- prometheus.MustNewConstMetric( + taskImagePullStopDesc, + prometheus.GaugeValue, + float64(metadata.PullStoppedAt.UnixNano())*nanoseconds, + ) + } + stats, err := c.client.RetrieveTaskStats(ctx) if err != nil { c.logger.Debug("Failed to retrieve container stats", "error", err) @@ -223,41 +272,49 @@ func (c *collector) Collect(ch chan<- prometheus.Metric) { continue } - labelVals := []string{ + containerLabelVals := []string{ container.Name, } - for i, cpuUsage := range s.CPUStats.CPUUsage.PercpuUsage { - cpu := fmt.Sprintf("%d", i) - ch <- prometheus.MustNewConstMetric( - cpuTotalDesc, - prometheus.CounterValue, - float64(cpuUsage)/nanoSeconds, - append(labelVals, cpu)..., - ) - } + ch <- prometheus.MustNewConstMetric( + cpuTotalDesc, + prometheus.CounterValue, + float64(s.CPUStats.CPUUsage.TotalUsage)*nanoseconds, + containerLabelVals..., + ) cacheValue := 0.0 if val, ok := s.MemoryStats.Stats["cache"]; ok { cacheValue = float64(val) } + // Report the container's memory limit as its own, if any, otherwise the + // 
task's limit. This is correct in that this is the precise logic used + // to configure the cgroups limit for the container. + var containerMemoryLimitMib int64 + if container.Limits.Memory != nil { + containerMemoryLimitMib = *container.Limits.Memory + } else if metadata.Limits != nil && metadata.Limits.Memory != nil { + // Neither limit is guaranteed to be set (the task-level limit is optional + // on EC2), so guard the dereference; the limit stays 0 when both are absent. + containerMemoryLimitMib = *metadata.Limits.Memory + } for desc, value := range map[*prometheus.Desc]float64{ - memUsageDesc: float64(s.MemoryStats.Usage), - memLimitDesc: float64(s.MemoryStats.Limit), - memCacheUsageDesc: cacheValue, + memUsageDesc: float64(s.MemoryStats.Usage), + memLimitDesc: float64(containerMemoryLimitMib * mebibytes), + memCacheSizeDesc: cacheValue, } { ch <- prometheus.MustNewConstMetric( desc, prometheus.GaugeValue, value, - labelVals..., + containerLabelVals..., ) } // Network metrics per interface. for iface, netStats := range s.Networks { - networkLabelVals := append(labelVals, iface) + networkLabelVals := append(containerLabelVals, iface) for desc, value := range map[*prometheus.Desc]float64{ networkRxBytesDesc: float64(netStats.RxBytes),