diff --git a/README.md b/README.md
index 740b896..272197d 100644
--- a/README.md
+++ b/README.md
@@ -37,149 +37,80 @@ from App Runner services.
 
 ## Labels
 
-* **container**: Container associated with a metric.
-* **cpu**: Available to CPU metrics, helps to breakdown metrics by CPU.
-* **device**: Network interface device associated with the metric. Only
+### On task-level metrics
+None.
+
+### On container-level metrics
+
+* **container_name**: Name of the container (as in the ECS task definition) associated with a metric.
+* **interface**: Network interface device associated with the metric. Only
   available for several network metrics.
 
 ## Example output
 
+(With `--web.disable-exporter-metrics` passed, such that standard Go metrics are not included here.)
+
 ```
-# HELP ecs_cpu_seconds_total Total CPU usage in seconds.
-# TYPE ecs_cpu_seconds_total counter
-ecs_cpu_seconds_total{container="ecs-metadata-proxy",cpu="0"} 1.746774278e+08
-ecs_cpu_seconds_total{container="ecs-metadata-proxy",cpu="1"} 1.7417992266e+08
-# HELP ecs_memory_bytes Memory usage in bytes.
-# TYPE ecs_memory_bytes gauge
-ecs_memory_bytes{container="ecs-metadata-proxy"} 4.440064e+06
-# HELP ecs_memory_limit_bytes Memory limit in bytes.
-# TYPE ecs_memory_limit_bytes gauge
-ecs_memory_limit_bytes{container="ecs-metadata-proxy"} 9.223372036854772e+18
-# HELP ecs_memory_max_bytes Maximum memory usage in bytes.
-# TYPE ecs_memory_max_bytes gauge
-ecs_memory_max_bytes{container="ecs-metadata-proxy"} 9.023488e+06
-# HELP ecs_network_receive_bytes_total Network received in bytes.
-# TYPE ecs_network_receive_bytes_total counter
-ecs_network_receive_bytes_total{container="ecs-metadata-proxy",device="eth1"} 4.2851757e+07
-# HELP ecs_network_receive_dropped_total Network packets dropped in receiving.
-# TYPE ecs_network_receive_dropped_total counter
-ecs_network_receive_dropped_total{container="ecs-metadata-proxy",device="eth1"} 0
-# HELP ecs_network_receive_errors_total Network errors in receiving.
-# TYPE ecs_network_receive_errors_total counter
-ecs_network_receive_errors_total{container="ecs-metadata-proxy",device="eth1"} 0
-# HELP ecs_network_receive_packets_total Network packets received.
-# TYPE ecs_network_receive_packets_total counter
-ecs_network_receive_packets_total{container="ecs-metadata-proxy",device="eth1"} 516239
-# HELP ecs_network_transmit_bytes_total Network transmitted in bytes.
-# TYPE ecs_network_transmit_bytes_total counter
-ecs_network_transmit_bytes_total{container="ecs-metadata-proxy",device="eth1"} 1.28412758e+08
-# HELP ecs_network_transmit_dropped_total Network packets dropped in transmit.
-# TYPE ecs_network_transmit_dropped_total counter
-ecs_network_transmit_dropped_total{container="ecs-metadata-proxy",device="eth1"} 0
-# HELP ecs_network_transmit_errors_total Network errors in transmit.
-# TYPE ecs_network_transmit_errors_total counter
-ecs_network_transmit_errors_total{container="ecs-metadata-proxy",device="eth1"} 0
-# HELP ecs_network_transmit_packets_total Network packets transmitted.
-# TYPE ecs_network_transmit_packets_total counter
-ecs_network_transmit_packets_total{container="ecs-metadata-proxy",device="eth1"} 429472
-# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
-# TYPE go_gc_duration_seconds summary
-go_gc_duration_seconds{quantile="0"} 0
-go_gc_duration_seconds{quantile="0.25"} 0
-go_gc_duration_seconds{quantile="0.5"} 0
-go_gc_duration_seconds{quantile="0.75"} 0
-go_gc_duration_seconds{quantile="1"} 0
-go_gc_duration_seconds_sum 0
-go_gc_duration_seconds_count 0
-# HELP go_goroutines Number of goroutines that currently exist.
-# TYPE go_goroutines gauge
-go_goroutines 8
-# HELP go_info Information about the Go environment.
-# TYPE go_info gauge
-go_info{version="go1.16.3"} 1
-# HELP go_memstats_alloc_bytes Number of bytes allocated and still in use.
-# TYPE go_memstats_alloc_bytes gauge
-go_memstats_alloc_bytes 595760
-# HELP go_memstats_alloc_bytes_total Total number of bytes allocated, even if freed.
-# TYPE go_memstats_alloc_bytes_total counter
-go_memstats_alloc_bytes_total 595760
-# HELP go_memstats_buck_hash_sys_bytes Number of bytes used by the profiling bucket hash table.
-# TYPE go_memstats_buck_hash_sys_bytes gauge
-go_memstats_buck_hash_sys_bytes 4092
-# HELP go_memstats_frees_total Total number of frees.
-# TYPE go_memstats_frees_total counter
-go_memstats_frees_total 123
-# HELP go_memstats_gc_cpu_fraction The fraction of this program's available CPU time used by the GC since the program started.
-# TYPE go_memstats_gc_cpu_fraction gauge
-go_memstats_gc_cpu_fraction 0
-# HELP go_memstats_gc_sys_bytes Number of bytes used for garbage collection system metadata.
-# TYPE go_memstats_gc_sys_bytes gauge
-go_memstats_gc_sys_bytes 3.97448e+06
-# HELP go_memstats_heap_alloc_bytes Number of heap bytes allocated and still in use.
-# TYPE go_memstats_heap_alloc_bytes gauge
-go_memstats_heap_alloc_bytes 595760
-# HELP go_memstats_heap_idle_bytes Number of heap bytes waiting to be used.
-# TYPE go_memstats_heap_idle_bytes gauge
-go_memstats_heap_idle_bytes 6.508544e+07
-# HELP go_memstats_heap_inuse_bytes Number of heap bytes that are in use.
-# TYPE go_memstats_heap_inuse_bytes gauge
-go_memstats_heap_inuse_bytes 1.59744e+06
-# HELP go_memstats_heap_objects Number of allocated objects.
-# TYPE go_memstats_heap_objects gauge
-go_memstats_heap_objects 2439
-# HELP go_memstats_heap_released_bytes Number of heap bytes released to OS.
-# TYPE go_memstats_heap_released_bytes gauge
-go_memstats_heap_released_bytes 6.508544e+07
-# HELP go_memstats_heap_sys_bytes Number of heap bytes obtained from system.
-# TYPE go_memstats_heap_sys_bytes gauge
-go_memstats_heap_sys_bytes 6.668288e+07
-# HELP go_memstats_last_gc_time_seconds Number of seconds since 1970 of last garbage collection.
-# TYPE go_memstats_last_gc_time_seconds gauge
-go_memstats_last_gc_time_seconds 0
-# HELP go_memstats_lookups_total Total number of pointer lookups.
-# TYPE go_memstats_lookups_total counter
-go_memstats_lookups_total 0
-# HELP go_memstats_mallocs_total Total number of mallocs.
-# TYPE go_memstats_mallocs_total counter
-go_memstats_mallocs_total 2562
-# HELP go_memstats_mcache_inuse_bytes Number of bytes in use by mcache structures.
-# TYPE go_memstats_mcache_inuse_bytes gauge
-go_memstats_mcache_inuse_bytes 9600
-# HELP go_memstats_mcache_sys_bytes Number of bytes used for mcache structures obtained from system.
-# TYPE go_memstats_mcache_sys_bytes gauge
-go_memstats_mcache_sys_bytes 16384
-# HELP go_memstats_mspan_inuse_bytes Number of bytes in use by mspan structures.
-# TYPE go_memstats_mspan_inuse_bytes gauge
-go_memstats_mspan_inuse_bytes 37400
-# HELP go_memstats_mspan_sys_bytes Number of bytes used for mspan structures obtained from system.
-# TYPE go_memstats_mspan_sys_bytes gauge
-go_memstats_mspan_sys_bytes 49152
-# HELP go_memstats_next_gc_bytes Number of heap bytes when next garbage collection will take place.
-# TYPE go_memstats_next_gc_bytes gauge
-go_memstats_next_gc_bytes 4.473924e+06
-# HELP go_memstats_other_sys_bytes Number of bytes used for other system allocations.
-# TYPE go_memstats_other_sys_bytes gauge
-go_memstats_other_sys_bytes 497348
-# HELP go_memstats_stack_inuse_bytes Number of bytes in use by the stack allocator.
-# TYPE go_memstats_stack_inuse_bytes gauge
-go_memstats_stack_inuse_bytes 425984
-# HELP go_memstats_stack_sys_bytes Number of bytes obtained from system for stack allocator.
-# TYPE go_memstats_stack_sys_bytes gauge
-go_memstats_stack_sys_bytes 425984
-# HELP go_memstats_sys_bytes Number of bytes obtained from system.
-# TYPE go_memstats_sys_bytes gauge
-go_memstats_sys_bytes 7.165032e+07
-# HELP go_threads Number of OS threads created.
-# TYPE go_threads gauge
-go_threads 7
-# HELP promhttp_metric_handler_requests_in_flight Current number of scrapes being served.
-# TYPE promhttp_metric_handler_requests_in_flight gauge
-promhttp_metric_handler_requests_in_flight 1
-# HELP promhttp_metric_handler_requests_total Total number of scrapes by HTTP status code.
-# TYPE promhttp_metric_handler_requests_total counter
-promhttp_metric_handler_requests_total{code="200"} 0
-promhttp_metric_handler_requests_total{code="500"} 0
-promhttp_metric_handler_requests_total{code="503"} 0
+# HELP ecs_container_cpu_usage_seconds_total Cumulative total container CPU usage in seconds.
+# TYPE ecs_container_cpu_usage_seconds_total counter
+ecs_container_cpu_usage_seconds_total{container_name="ecs-exporter"} 0.027095748000000003
+# HELP ecs_container_memory_limit_bytes Configured container memory limit in bytes, set from the container-level limit in the task definition if any, otherwise the task-level limit.
+# TYPE ecs_container_memory_limit_bytes gauge
+ecs_container_memory_limit_bytes{container_name="ecs-exporter"} 5.36870912e+08
+# HELP ecs_container_memory_page_cache_size_bytes Current container memory page cache size in bytes. This is not a subset of used bytes.
+# TYPE ecs_container_memory_page_cache_size_bytes gauge
+ecs_container_memory_page_cache_size_bytes{container_name="ecs-exporter"} 0
+# HELP ecs_container_memory_usage_bytes Current container memory usage in bytes.
+# TYPE ecs_container_memory_usage_bytes gauge
+ecs_container_memory_usage_bytes{container_name="ecs-exporter"} 4.452352e+06
+# HELP ecs_container_network_receive_bytes_total Cumulative total size of container network packets received in bytes.
+# TYPE ecs_container_network_receive_bytes_total counter
+ecs_container_network_receive_bytes_total{container_name="ecs-exporter",interface="eth1"} 1.1112267e+07
+# HELP ecs_container_network_receive_errors_total Cumulative total count of container network errors in receiving.
+# TYPE ecs_container_network_receive_errors_total counter
+ecs_container_network_receive_errors_total{container_name="ecs-exporter",interface="eth1"} 0
+# HELP ecs_container_network_receive_packets_dropped_total Cumulative total count of container network packets dropped in receiving.
+# TYPE ecs_container_network_receive_packets_dropped_total counter
+ecs_container_network_receive_packets_dropped_total{container_name="ecs-exporter",interface="eth1"} 0
+# HELP ecs_container_network_receive_packets_total Cumulative total count of container network packets received.
+# TYPE ecs_container_network_receive_packets_total counter
+ecs_container_network_receive_packets_total{container_name="ecs-exporter",interface="eth1"} 8039
+# HELP ecs_container_network_transmit_bytes_total Cumulative total size of container network packets transmitted in bytes.
+# TYPE ecs_container_network_transmit_bytes_total counter
+ecs_container_network_transmit_bytes_total{container_name="ecs-exporter",interface="eth1"} 165338
+# HELP ecs_container_network_transmit_dropped_total Cumulative total count of container network packets dropped in transmit.
+# TYPE ecs_container_network_transmit_dropped_total counter
+ecs_container_network_transmit_dropped_total{container_name="ecs-exporter",interface="eth1"} 0
+# HELP ecs_container_network_transmit_errors_total Cumulative total count of container network errors in transmit.
+# TYPE ecs_container_network_transmit_errors_total counter
+ecs_container_network_transmit_errors_total{container_name="ecs-exporter",interface="eth1"} 0
+# HELP ecs_container_network_transmit_packets_total Cumulative total count of container network packets transmitted.
+# TYPE ecs_container_network_transmit_packets_total counter
+ecs_container_network_transmit_packets_total{container_name="ecs-exporter",interface="eth1"} 713
+# HELP ecs_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which ecs_exporter was built, and the goos and goarch for the build.
+# TYPE ecs_exporter_build_info gauge
+ecs_exporter_build_info{branch="",goarch="arm64",goos="linux",goversion="go1.23.2",revision="unknown",tags="unknown",version=""} 1
+# HELP ecs_task_cpu_limit_vcpus Configured task CPU limit in vCPUs (1 vCPU = 1024 CPU units). This is optional when running on EC2; if no limit is set, this metric has no value.
+# TYPE ecs_task_cpu_limit_vcpus gauge
+ecs_task_cpu_limit_vcpus 0.25
+# HELP ecs_task_ephemeral_storage_allocated_bytes Configured Fargate task ephemeral storage allocated size in bytes.
+# TYPE ecs_task_ephemeral_storage_allocated_bytes gauge
+ecs_task_ephemeral_storage_allocated_bytes 2.1491613696e+10
+# HELP ecs_task_ephemeral_storage_used_bytes Current Fargate task ephemeral storage usage in bytes.
+# TYPE ecs_task_ephemeral_storage_used_bytes gauge
+ecs_task_ephemeral_storage_used_bytes 3.7748736e+07
+# HELP ecs_task_image_pull_start_timestamp_seconds The time at which the task started pulling docker images for its containers.
+# TYPE ecs_task_image_pull_start_timestamp_seconds gauge
+ecs_task_image_pull_start_timestamp_seconds 1.7291179014941156e+09
+# HELP ecs_task_image_pull_stop_timestamp_seconds The time at which the task stopped (i.e. completed) pulling docker images for its containers.
+# TYPE ecs_task_image_pull_stop_timestamp_seconds gauge
+ecs_task_image_pull_stop_timestamp_seconds 1.7291179144469e+09
+# HELP ecs_task_memory_limit_bytes Configured task memory limit in bytes. This is optional when running on EC2; if no limit is set, this metric has no value.
+# TYPE ecs_task_memory_limit_bytes gauge
+ecs_task_memory_limit_bytes 5.36870912e+08
+# HELP ecs_task_metadata_info ECS task metadata, sourced from the task metadata endpoint version 4.
+# TYPE ecs_task_metadata_info gauge
+ecs_task_metadata_info{availability_zone="us-east-1a",cluster="arn:aws:ecs:us-east-1:829490980523:cluster/prom-ecs-exporter-sandbox",desired_status="RUNNING",family="prom-ecs-exporter-sandbox-isker-fargate",known_status="RUNNING",launch_type="FARGATE",revision="11",task_arn="arn:aws:ecs:us-east-1:829490980523:task/prom-ecs-exporter-sandbox/0c7f6b0414dc47d0a15019a099cd919b"} 1
 ```
 
 ## Example task definition
diff --git a/ecscollector/collector.go b/ecscollector/collector.go
index df8d9c5..c92cab7 100644
--- a/ecscollector/collector.go
+++ b/ecscollector/collector.go
@@ -17,9 +17,7 @@ package ecscollector
 
 import (
     "context"
-    "fmt"
     "log/slog"
-    "time"
 
     "github.com/prometheus-community/ecs_exporter/ecsmetadata"
     "github.com/prometheus/client_golang/prometheus"
@@ -27,114 +25,129 @@ import (
 
 // ECS cpu_stats are from upstream docker/moby. These values are in nanoseconds.
 // https://github.com/moby/moby/blob/49f021ebf00a76d74f5ce158244083e2dfba26fb/api/types/stats.go#L18-L40
-const nanoSeconds = 1.0e9
+const nanoseconds = 1 / 1.0e9
 
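An aside on the rewritten constant: the old code divided nanosecond counts by `nanoSeconds = 1.0e9`, while the new code multiplies by the reciprocal, which is arithmetically equivalent. Together with the `mebibytes` constant added a few lines below, these are the only unit conversions in the file. A standalone sketch (not part of this diff) checking both constants against sample values from the README output above:

```go
package main

import "fmt"

// Same constants as collector.go: multiplying by these converts
// nanosecond counts to seconds and MiB counts to bytes.
const nanoseconds = 1 / 1.0e9
const mebibytes = 1024 * 1024

func main() {
	// Docker reports cumulative CPU usage in nanoseconds; 27095748 ns is
	// the ~0.027095748 s shown in the README example output.
	var totalUsage uint64 = 27095748
	fmt.Println(float64(totalUsage) * nanoseconds)

	// Task definition memory limits are in MiB; 512 MiB is the
	// 5.36870912e+08 bytes shown above.
	var taskMemoryLimitMib int64 = 512
	fmt.Println(taskMemoryLimitMib * mebibytes)
}
```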
-var (
-    metadataDesc = prometheus.NewDesc(
-        "ecs_metadata_info",
-        "ECS service metadata.",
-        metadataLabels, nil)
-
-    svcCPULimitDesc = prometheus.NewDesc(
-        "ecs_svc_cpu_limit",
-        "Total CPU Limit.",
-        svcLabels, nil)
+// Task definition memory parameters are defined in MiB, while Prometheus
+// standard metrics use bytes.
+const mebibytes = 1024 * 1024
 
-    svcMemLimitDesc = prometheus.NewDesc(
-        "ecs_svc_memory_limit_bytes",
-        "Total MEM Limit in bytes.",
-        svcLabels, nil)
+var (
+    taskMetadataDesc = prometheus.NewDesc(
+        "ecs_task_metadata_info",
+        "ECS task metadata, sourced from the task metadata endpoint version 4.",
+        taskMetadataLabels, nil)
+
+    taskCpuLimitDesc = prometheus.NewDesc(
+        "ecs_task_cpu_limit_vcpus",
+        "Configured task CPU limit in vCPUs (1 vCPU = 1024 CPU units). This is optional when running on EC2; if no limit is set, this metric has no value.",
+        taskLabels, nil)
+
+    taskMemLimitDesc = prometheus.NewDesc(
+        "ecs_task_memory_limit_bytes",
+        "Configured task memory limit in bytes. This is optional when running on EC2; if no limit is set, this metric has no value.",
+        taskLabels, nil)
+
+    taskEphemeralStorageUsedDesc = prometheus.NewDesc(
+        "ecs_task_ephemeral_storage_used_bytes",
+        "Current Fargate task ephemeral storage usage in bytes.",
+        taskLabels, nil)
+
+    taskEphemeralStorageAllocatedDesc = prometheus.NewDesc(
+        "ecs_task_ephemeral_storage_allocated_bytes",
+        "Configured Fargate task ephemeral storage allocated size in bytes.",
+        taskLabels, nil)
+
+    taskImagePullStartDesc = prometheus.NewDesc(
+        "ecs_task_image_pull_start_timestamp_seconds",
+        "The time at which the task started pulling docker images for its containers.",
+        taskLabels, nil)
+
+    taskImagePullStopDesc = prometheus.NewDesc(
+        "ecs_task_image_pull_stop_timestamp_seconds",
+        "The time at which the task stopped (i.e. completed) pulling docker images for its containers.",
+        taskLabels, nil)
 
     cpuTotalDesc = prometheus.NewDesc(
-        "ecs_cpu_seconds_total",
-        "Total CPU usage in seconds.",
-        cpuLabels, nil)
+        "ecs_container_cpu_usage_seconds_total",
+        "Cumulative total container CPU usage in seconds.",
+        containerLabels, nil)
 
     memUsageDesc = prometheus.NewDesc(
-        "ecs_memory_bytes",
-        "Memory usage in bytes.",
-        labels, nil)
+        "ecs_container_memory_usage_bytes",
+        "Current container memory usage in bytes.",
+        containerLabels, nil)
 
     memLimitDesc = prometheus.NewDesc(
-        "ecs_memory_limit_bytes",
-        "Memory limit in bytes.",
-        labels, nil)
+        "ecs_container_memory_limit_bytes",
+        "Configured container memory limit in bytes, set from the container-level limit in the task definition if any, otherwise the task-level limit.",
+        containerLabels, nil)
 
-    memCacheUsageDesc = prometheus.NewDesc(
-        "ecs_memory_cache_usage",
-        "Memory cache usage in bytes.",
-        labels, nil)
+    memCacheSizeDesc = prometheus.NewDesc(
+        "ecs_container_memory_page_cache_size_bytes",
+        "Current container memory page cache size in bytes. This is not a subset of used bytes.",
+        containerLabels, nil)
 
     networkRxBytesDesc = prometheus.NewDesc(
-        "ecs_network_receive_bytes_total",
-        "Network received in bytes.",
-        networkLabels, nil)
+        "ecs_container_network_receive_bytes_total",
+        "Cumulative total size of container network packets received in bytes.",
+        containerNetworkLabels, nil)
 
     networkRxPacketsDesc = prometheus.NewDesc(
-        "ecs_network_receive_packets_total",
-        "Network packets received.",
-        networkLabels, nil)
+        "ecs_container_network_receive_packets_total",
+        "Cumulative total count of container network packets received.",
+        containerNetworkLabels, nil)
 
     networkRxDroppedDesc = prometheus.NewDesc(
-        "ecs_network_receive_dropped_total",
-        "Network packets dropped in receiving.",
-        networkLabels, nil)
+        "ecs_container_network_receive_packets_dropped_total",
+        "Cumulative total count of container network packets dropped in receiving.",
+        containerNetworkLabels, nil)
 
     networkRxErrorsDesc = prometheus.NewDesc(
-        "ecs_network_receive_errors_total",
-        "Network errors in receiving.",
-        networkLabels, nil)
+        "ecs_container_network_receive_errors_total",
+        "Cumulative total count of container network errors in receiving.",
+        containerNetworkLabels, nil)
 
     networkTxBytesDesc = prometheus.NewDesc(
-        "ecs_network_transmit_bytes_total",
-        "Network transmitted in bytes.",
-        networkLabels, nil)
+        "ecs_container_network_transmit_bytes_total",
+        "Cumulative total size of container network packets transmitted in bytes.",
+        containerNetworkLabels, nil)
 
     networkTxPacketsDesc = prometheus.NewDesc(
-        "ecs_network_transmit_packets_total",
-        "Network packets transmitted.",
-        networkLabels, nil)
+        "ecs_container_network_transmit_packets_total",
+        "Cumulative total count of container network packets transmitted.",
+        containerNetworkLabels, nil)
 
     networkTxDroppedDesc = prometheus.NewDesc(
-        "ecs_network_transmit_dropped_total",
-        "Network packets dropped in transmit.",
-        networkLabels, nil)
+        "ecs_container_network_transmit_dropped_total",
+        "Cumulative total count of container network packets dropped in transmit.",
+        containerNetworkLabels, nil)
 
     networkTxErrorsDesc = prometheus.NewDesc(
-        "ecs_network_transmit_errors_total",
-        "Network errors in transmit.",
-        networkLabels, nil)
+        "ecs_container_network_transmit_errors_total",
+        "Cumulative total count of container network errors in transmit.",
+        containerNetworkLabels, nil)
 )
 
-var labels = []string{
-    "container",
+var containerLabels = []string{
+    "container_name",
 }
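A note for reviewers on why these label slices are shared between the `NewDesc` calls above and the value slices built later in `Collect`: `prometheus.MustNewConstMetric` pairs label values with a `Desc`'s label names strictly by position, not by name. A minimal standalone sketch with a hypothetical descriptor (not from this package) illustrating the pairing:

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// Hypothetical labels and descriptor, mirroring the container/network
// label-slice pattern used in collector.go.
var exampleLabels = []string{"container_name", "interface"}

var exampleDesc = prometheus.NewDesc(
	"example_bytes_total",
	"Hypothetical metric demonstrating positional label binding.",
	exampleLabels, nil)

func main() {
	// "web" binds to container_name and "eth1" to interface because of
	// their order, so value order must always match the label slice.
	m := prometheus.MustNewConstMetric(
		exampleDesc,
		prometheus.CounterValue,
		42,
		"web", "eth1",
	)
	fmt.Println(m.Desc())
}
```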
 
-var svcLabels = []string{
-    "task_arn",
-}
+var taskLabels = []string{}
 
-var metadataLabels = []string{
+var taskMetadataLabels = []string{
     "cluster",
     "task_arn",
     "family",
     "revision",
     "desired_status",
     "known_status",
-    "pull_started_at",
-    "pull_stopped_at",
     "availability_zone",
     "launch_type",
 }
 
-var cpuLabels = append(
-    labels,
-    "cpu",
-)
-
-var networkLabels = append(
-    labels,
-    "device",
+var containerNetworkLabels = append(
+    containerLabels,
+    "interface",
 )
 
 // NewCollector returns a new Collector that queries ECS metadata server
@@ -149,10 +162,17 @@ type collector struct {
 }
 
 func (c *collector) Describe(ch chan<- *prometheus.Desc) {
+    ch <- taskMetadataDesc
+    ch <- taskCpuLimitDesc
+    ch <- taskMemLimitDesc
+    ch <- taskEphemeralStorageUsedDesc
+    ch <- taskEphemeralStorageAllocatedDesc
+    ch <- taskImagePullStartDesc
+    ch <- taskImagePullStopDesc
     ch <- cpuTotalDesc
     ch <- memUsageDesc
     ch <- memLimitDesc
-    ch <- memCacheUsageDesc
+    ch <- memCacheSizeDesc
     ch <- networkRxBytesDesc
     ch <- networkRxPacketsDesc
     ch <- networkRxDroppedDesc
@@ -173,7 +193,7 @@ func (c *collector) Collect(ch chan<- prometheus.Metric) {
     c.logger.Debug("Got ECS task metadata response", "stats", metadata)
 
     ch <- prometheus.MustNewConstMetric(
-        metadataDesc,
+        taskMetadataDesc,
         prometheus.GaugeValue,
         1.0,
         metadata.Cluster,
@@ -182,8 +202,6 @@ func (c *collector) Collect(ch chan<- prometheus.Metric) {
         metadata.Revision,
         metadata.DesiredStatus,
         metadata.KnownStatus,
-        metadata.PullStartedAt.Format(time.RFC3339Nano),
-        metadata.PullStoppedAt.Format(time.RFC3339Nano),
         metadata.AvailabilityZone,
         metadata.LaunchType,
     )
@@ -193,22 +211,48 @@ func (c *collector) Collect(ch chan<- prometheus.Metric) {
     if metadata.Limits != nil {
         if metadata.Limits.CPU != nil {
             ch <- prometheus.MustNewConstMetric(
-                svcCPULimitDesc,
+                taskCpuLimitDesc,
                 prometheus.GaugeValue,
                 *metadata.Limits.CPU,
-                metadata.TaskARN,
             )
         }
         if metadata.Limits.Memory != nil {
             ch <- prometheus.MustNewConstMetric(
-                svcMemLimitDesc,
+                taskMemLimitDesc,
                 prometheus.GaugeValue,
-                float64(*metadata.Limits.Memory),
-                metadata.TaskARN,
+                float64(*metadata.Limits.Memory*mebibytes),
             )
         }
     }
+    if metadata.EphemeralStorageMetrics != nil {
+        ch <- prometheus.MustNewConstMetric(
+            taskEphemeralStorageUsedDesc,
+            prometheus.GaugeValue,
+            float64(metadata.EphemeralStorageMetrics.UtilizedMiBs*mebibytes),
+        )
+        ch <- prometheus.MustNewConstMetric(
+            taskEphemeralStorageAllocatedDesc,
+            prometheus.GaugeValue,
+            float64(metadata.EphemeralStorageMetrics.ReservedMiBs*mebibytes),
+        )
+    }
+
+    if metadata.PullStartedAt != nil {
+        ch <- prometheus.MustNewConstMetric(
+            taskImagePullStartDesc,
+            prometheus.GaugeValue,
+            float64(metadata.PullStartedAt.UnixNano())*nanoseconds,
+        )
+    }
+    if metadata.PullStoppedAt != nil {
+        ch <- prometheus.MustNewConstMetric(
+            taskImagePullStopDesc,
+            prometheus.GaugeValue,
+            float64(metadata.PullStoppedAt.UnixNano())*nanoseconds,
+        )
+    }
+
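The pull timestamps above are now exported as fractional Unix seconds rather than RFC 3339 label strings. A standalone sketch of that conversion, using a hypothetical timestamp shaped like the pull-start sample in the README; note that a float64's 53-bit mantissa still resolves a few hundred nanoseconds at current epochs, so sub-second image pull timings survive the conversion:

```go
package main

import (
	"fmt"
	"time"
)

// Same constant as collector.go.
const nanoseconds = 1 / 1.0e9

func main() {
	// Hypothetical pull-start time, roughly matching the README sample
	// value 1.7291179014941156e+09.
	t := time.Date(2024, time.October, 16, 22, 31, 41, 494115600, time.UTC)

	// float64 cannot hold a full int64 nanosecond count exactly, but the
	// rounding error here is on the order of hundreds of nanoseconds.
	fmt.Printf("%.7f\n", float64(t.UnixNano())*nanoseconds)
}
```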
     stats, err := c.client.RetrieveTaskStats(ctx)
     if err != nil {
         c.logger.Debug("Failed to retrieve container stats", "error", err)
@@ -223,41 +267,49 @@ func (c *collector) Collect(ch chan<- prometheus.Metric) {
             continue
         }
 
-        labelVals := []string{
+        containerLabelVals := []string{
             container.Name,
         }
 
-        for i, cpuUsage := range s.CPUStats.CPUUsage.PercpuUsage {
-            cpu := fmt.Sprintf("%d", i)
-            ch <- prometheus.MustNewConstMetric(
-                cpuTotalDesc,
-                prometheus.CounterValue,
-                float64(cpuUsage)/nanoSeconds,
-                append(labelVals, cpu)...,
-            )
-        }
+        ch <- prometheus.MustNewConstMetric(
+            cpuTotalDesc,
+            prometheus.CounterValue,
+            float64(s.CPUStats.CPUUsage.TotalUsage)*nanoseconds,
+            containerLabelVals...,
+        )
 
         cacheValue := 0.0
         if val, ok := s.MemoryStats.Stats["cache"]; ok {
             cacheValue = float64(val)
         }
 
+        // Report the container's memory limit as its own, if any, otherwise the
+        // task's limit. This is correct in that this is the precise logic used
+        // to configure the cgroups limit for the container.
+        var containerMemoryLimitMib int64
+        if container.Limits.Memory != nil {
+            containerMemoryLimitMib = *container.Limits.Memory
+        } else {
+            // This must be set if the container limit is not set, and thus is
+            // safe to dereference.
+            containerMemoryLimitMib = *metadata.Limits.Memory
+        }
         for desc, value := range map[*prometheus.Desc]float64{
-            memUsageDesc:      float64(s.MemoryStats.Usage),
-            memLimitDesc:      float64(s.MemoryStats.Limit),
-            memCacheUsageDesc: cacheValue,
+            memUsageDesc:     float64(s.MemoryStats.Usage),
+            memLimitDesc:     float64(containerMemoryLimitMib * mebibytes),
+            memCacheSizeDesc: cacheValue,
         } {
             ch <- prometheus.MustNewConstMetric(
                 desc,
                 prometheus.GaugeValue,
                 value,
-                labelVals...,
+                containerLabelVals...,
             )
         }
 
         // Network metrics per interface.
         for iface, netStats := range s.Networks {
-            networkLabelVals := append(labelVals, iface)
+            networkLabelVals := append(containerLabelVals, iface)
             for desc, value := range map[*prometheus.Desc]float64{
                 networkRxBytesDesc: float64(netStats.RxBytes),
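The container memory limit fallback above is the one subtle piece of logic in this loop. Restated as a self-contained sketch, with simplified stand-in types rather than the real ecsmetadata structs:

```go
package main

import "fmt"

const mebibytes = 1024 * 1024

// limits is a simplified stand-in for the task/container limit structs in
// ecsmetadata; Memory is in MiB and nil when unset.
type limits struct {
	Memory *int64
}

// containerMemoryLimitBytes mirrors the collector's fallback: prefer the
// container-level limit, otherwise use the task-level limit, which (per the
// comment in the diff) must be set whenever the container-level one is not.
func containerMemoryLimitBytes(container, task limits) int64 {
	if container.Memory != nil {
		return *container.Memory * mebibytes
	}
	return *task.Memory * mebibytes
}

func main() {
	taskMem := int64(512)
	// No container-level limit, so the 512 MiB task limit applies:
	// prints 536870912, matching ecs_container_memory_limit_bytes above.
	fmt.Println(containerMemoryLimitBytes(limits{}, limits{Memory: &taskMem}))
}
```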