Overhaul all metrics
- Fix names to comply with the [official
  guidelines](https://prometheus.io/docs/practices/naming/#metric-and-label-naming)
  and to better mirror the names of similar timeseries from the
  much-more-popular cAdvisor, when reasonable. And don't use the word
  "svc" to refer to tasks, as it is just not correct.
- Improve `help` strings.
- Stop reporting per-CPU usage metrics. They're empirically only
  available in Fargate, but the current collector implementation assumes
  they're available everywhere. (They were previously available in EC2 but
  that stopped being the case when ecs-agent was upgraded to use cgroups
  v2.)  Given that it's not clear why per-CPU numbers are useful in
  general, remove them everywhere instead of exposing disjoint metrics for
  Fargate and EC2. This will also prevent Fargate from potentially
  spontaneously breaking in the same way EC2 did.
- Fix task-level memory limit to actually be in bytes (it previously
  said "bytes" but was in fact MiB; see the conversion sketch at the end
  of this message).
- Correctly report container-level memory limits in all cases - the
  stats `limit` is nonsense if, as in Fargate, there is no container-level
  limit configured in the task definition. While the right data for all
  cases is hiding in the stats response somewhere, I have instead opted to
  cut out the stats middleman and use the task metadata directly to drive
  this metric. I think it's substantially less likely that ECS fails to
  effect the configured limits upon cgroups correctly than it is that we
  fail to interrogate cgroups output correctly: the latter empirically
  happens with some frequency :^).
- Add metrics concerning Fargate ephemeral storage, and one for task
  image pull duration.
- Add more labels for task- and container-level metrics. While we should
  always be cautious when adding common labels to timeseries, I think
  the existing ones were insufficient for doing basic aggregations (e.g.
  "average memory usage for a given task family grouped by revision";
  see the example query below). These labels are comparable in scope to
  those used by cAdvisor for its own container timeseries.

I have tested these changes in both Fargate and EC2, and they look
correct to me.
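
For reference, below is a minimal, self-contained Go sketch of the unit
conversions the new collector code relies on. The `mebibytes` and
`nanoseconds` constants mirror the ones added in collector.go; the
sample values are invented for illustration:

```
package main

import (
	"fmt"
	"time"
)

// Constants mirroring those added in ecscollector/collector.go.
const (
	mebibytes   = 1024 * 1024 // task definition memory is configured in MiB
	nanoseconds = 1 / 1.0e9   // multiply a nanosecond count by this to get seconds
)

func main() {
	// Task-level memory limit: the metadata endpoint reports MiB, but the
	// metric is now exported in bytes.
	taskMemoryMiB := int64(512)
	fmt.Println(float64(taskMemoryMiB * mebibytes)) // 5.36870912e+08

	// Image pull duration: subtracting two timestamps yields a
	// time.Duration (an int64 nanosecond count), which is scaled to
	// seconds the same way the collector does.
	pullStarted := time.Now()
	pullStopped := pullStarted.Add(1500 * time.Millisecond)
	fmt.Println(float64(pullStopped.Sub(pullStarted)) * nanoseconds) // 1.5
}
```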

Signed-off-by: Ian Kerins <git@isk.haus>
isker committed Oct 13, 2024
1 parent 593ea5f commit 5c7aa78
Showing 2 changed files with 159 additions and 90 deletions.
14 changes: 11 additions & 3 deletions README.md
@@ -37,12 +37,20 @@ from App Runner services.

## Labels

* **container**: Container associated with a metric.
* **cpu**: Available to CPU metrics, helps to breakdown metrics by CPU.
* **device**: Network interface device associated with the metric. Only
### On task-level metrics
* **task_arn**: [ARN of the task](https://docs.aws.amazon.com/service-authorization/latest/reference/list_amazonelasticcontainerservice.html#amazonelasticcontainerservice-resources-for-iam-policies) associated with a metric.
* **family**: Task definition family associated with a metric.
* **revision**: Revision of the task definition family associated with a metric.

### On container-level metrics

* **container**: Name of the container (as in the ECS task definition) associated with a metric.
* **image**: Docker image identifier (e.g. `name:tag`, `name@digest`) of the container.
* **interface**: Network interface device associated with the metric. Only
available for several network metrics.

## Example output
TODO update

```
# HELP ecs_cpu_seconds_total Total CPU usage in seconds.
235 changes: 148 additions & 87 deletions ecscollector/collector.go
@@ -17,7 +17,6 @@ package ecscollector

import (
"context"
"fmt"
"log/slog"
"time"

@@ -27,94 +26,116 @@ import (

// ECS cpu_stats are from upstream docker/moby. These values are in nanoseconds.
// https://github.com/moby/moby/blob/49f021ebf00a76d74f5ce158244083e2dfba26fb/api/types/stats.go#L18-L40
const nanoSeconds = 1.0e9
const nanoseconds = 1 / 1.0e9

var (
metadataDesc = prometheus.NewDesc(
"ecs_metadata_info",
"ECS service metadata.",
metadataLabels, nil)

svcCPULimitDesc = prometheus.NewDesc(
"ecs_svc_cpu_limit",
"Total CPU Limit.",
svcLabels, nil)
// Task definition memory parameters are defined in MiB, while Prometheus
// standard metrics use bytes.
const mebibytes = 1024 * 1024

svcMemLimitDesc = prometheus.NewDesc(
"ecs_svc_memory_limit_bytes",
"Total MEM Limit in bytes.",
svcLabels, nil)
var (
taskMetadataDesc = prometheus.NewDesc(
"ecs_task_metadata_info",
"ECS task metadata, sourced from the task metadata endpoint version 4.",
taskMetadataLabels, nil)

taskCPULimitDesc = prometheus.NewDesc(
"ecs_task_cpu_limit_vcpus",
"Configured task CPU limit in vCPUs (1 vCPU = 1024 CPU units). This is optional when running on EC2; if no limit is set, this metric has no value.",
taskLabels, nil)

taskMemLimitDesc = prometheus.NewDesc(
"ecs_task_memory_limit_bytes",
"Configured task memory limit in bytes. This is optional when running on EC2; if no limit is set, this metric has no value.",
taskLabels, nil)

taskEphemeralStorageUsedDesc = prometheus.NewDesc(
"ecs_task_ephemeral_storage_used_bytes",
"Current Fargate task ephemeral storage usage in bytes.",
taskLabels, nil)

taskEphemeralStorageAllocatedDesc = prometheus.NewDesc(
"ecs_task_ephemeral_storage_allocated_bytes",
"Configured Fargate task ephemeral storage allocated size in bytes.",
taskLabels, nil)

taskImagePullDurationDesc = prometheus.NewDesc(
"ecs_task_image_pull_duration_seconds",
"How long container image pulling took for the task on startup.",
taskLabels, nil)

cpuTotalDesc = prometheus.NewDesc(
"ecs_cpu_seconds_total",
"Total CPU usage in seconds.",
cpuLabels, nil)
"ecs_container_cpu_usage_seconds_total",
"Cumulative total container CPU usage in seconds.",
containerLabels, nil)

memUsageDesc = prometheus.NewDesc(
"ecs_memory_bytes",
"Memory usage in bytes.",
labels, nil)
"ecs_container_memory_usage_bytes",
"Current container memory usage in bytes.",
containerLabels, nil)

memLimitDesc = prometheus.NewDesc(
"ecs_memory_limit_bytes",
"Memory limit in bytes.",
labels, nil)
"ecs_container_memory_limit_bytes",
"Configured container memory limit in bytes, set from the container-level limit in the task definition if any, otherwise the task-level limit.",
containerLabels, nil)

memCacheUsageDesc = prometheus.NewDesc(
"ecs_memory_cache_usage",
"Memory cache usage in bytes.",
labels, nil)
memCacheSizeDesc = prometheus.NewDesc(
"ecs_container_memory_page_cache_size_bytes",
"Current container memory page cache size in bytes. This is not a subset of used bytes.",
containerLabels, nil)

networkRxBytesDesc = prometheus.NewDesc(
"ecs_network_receive_bytes_total",
"Network received in bytes.",
networkLabels, nil)
"ecs_container_network_receive_bytes_total",
"Cumulative total size of container network packets received in bytes.",
containerNetworkLabels, nil)

networkRxPacketsDesc = prometheus.NewDesc(
"ecs_network_receive_packets_total",
"Network packets received.",
networkLabels, nil)
"ecs_container_network_receive_packets_total",
"Cumulative total count of container network packets received.",
containerNetworkLabels, nil)

networkRxDroppedDesc = prometheus.NewDesc(
"ecs_network_receive_dropped_total",
"Network packets dropped in receiving.",
networkLabels, nil)
"ecs_container_network_receive_packets_dropped_total",
"Cumulative total count of container network packets dropped in receiving.",
containerNetworkLabels, nil)

networkRxErrorsDesc = prometheus.NewDesc(
"ecs_network_receive_errors_total",
"Network errors in receiving.",
networkLabels, nil)
"ecs_container_network_receive_errors_total",
"Cumulative total count of container network errors in receiving.",
containerNetworkLabels, nil)

networkTxBytesDesc = prometheus.NewDesc(
"ecs_network_transmit_bytes_total",
"Network transmitted in bytes.",
networkLabels, nil)
"ecs_container_network_transmit_bytes_total",
"Cumulative total size of container network packets transmitted in bytes.",
containerNetworkLabels, nil)

networkTxPacketsDesc = prometheus.NewDesc(
"ecs_network_transmit_packets_total",
"Network packets transmitted.",
networkLabels, nil)
"ecs_container_network_transmit_packets_total",
"Cumulative total count of container network packets transmitted.",
containerNetworkLabels, nil)

networkTxDroppedDesc = prometheus.NewDesc(
"ecs_network_transmit_dropped_total",
"Network packets dropped in transmit.",
networkLabels, nil)
"ecs_container_network_transmit_dropped_total",
"Cumulative total count of container network packets dropped in transmit.",
containerNetworkLabels, nil)

networkTxErrorsDesc = prometheus.NewDesc(
"ecs_network_transmit_errors_total",
"Network errors in transmit.",
networkLabels, nil)
"ecs_container_network_transmit_errors_total",
"Cumulative total count of container network errors in transmit.",
containerNetworkLabels, nil)
)

var labels = []string{
"container",
var containerLabels = []string{
"container_name",
"image",
}

var svcLabels = []string{
var taskLabels = []string{
"task_arn",
"family",
"revision",
}

var metadataLabels = []string{
var taskMetadataLabels = []string{
"cluster",
"task_arn",
"family",
@@ -127,14 +148,9 @@ var metadataLabels = []string{
"launch_type",
}

var cpuLabels = append(
labels,
"cpu",
)

var networkLabels = append(
labels,
"device",
var containerNetworkLabels = append(
containerLabels,
"interface",
)

// NewCollector returns a new Collector that queries ECS metadata server
@@ -149,10 +165,16 @@ type collector struct {
}

func (c *collector) Describe(ch chan<- *prometheus.Desc) {
ch <- taskMetadataDesc
ch <- taskCPULimitDesc
ch <- taskMemLimitDesc
ch <- taskEphemeralStorageUsedDesc
ch <- taskEphemeralStorageAllocatedDesc
ch <- taskImagePullDurationDesc
ch <- cpuTotalDesc
ch <- memUsageDesc
ch <- memLimitDesc
ch <- memCacheUsageDesc
ch <- memCacheSizeDesc
ch <- networkRxBytesDesc
ch <- networkRxPacketsDesc
ch <- networkRxDroppedDesc
@@ -173,7 +195,7 @@ func (c *collector) Collect(ch chan<- prometheus.Metric) {
c.logger.Debug("Got ECS task metadata response", "stats", metadata)

ch <- prometheus.MustNewConstMetric(
metadataDesc,
taskMetadataDesc,
prometheus.GaugeValue,
1.0,
metadata.Cluster,
@@ -188,27 +210,57 @@ func (c *collector) Collect(ch chan<- prometheus.Metric) {
metadata.LaunchType,
)

taskLabelVals := []string{
metadata.TaskARN,
metadata.Family,
metadata.Revision,
}

// Task CPU/memory limits are optional when running on EC2 - the relevant
// limits may only exist at the container level.
if metadata.Limits != nil {
if metadata.Limits.CPU != nil {
ch <- prometheus.MustNewConstMetric(
svcCPULimitDesc,
taskCPULimitDesc,
prometheus.GaugeValue,
*metadata.Limits.CPU,
metadata.TaskARN,
taskLabelVals...,
)
}
if metadata.Limits.Memory != nil {
ch <- prometheus.MustNewConstMetric(
svcMemLimitDesc,
taskMemLimitDesc,
prometheus.GaugeValue,
float64(*metadata.Limits.Memory),
metadata.TaskARN,
float64(*metadata.Limits.Memory*mebibytes),
taskLabelVals...,
)
}
}

if metadata.EphemeralStorageMetrics != nil {
ch <- prometheus.MustNewConstMetric(
taskEphemeralStorageUsedDesc,
prometheus.GaugeValue,
float64(metadata.EphemeralStorageMetrics.UtilizedMiBs*mebibytes),
taskLabelVals...,
)
ch <- prometheus.MustNewConstMetric(
taskEphemeralStorageAllocatedDesc,
prometheus.GaugeValue,
float64(metadata.EphemeralStorageMetrics.ReservedMiBs*mebibytes),
taskLabelVals...,
)
}

if metadata.PullStartedAt != nil && metadata.PullStoppedAt != nil {
ch <- prometheus.MustNewConstMetric(
taskImagePullDurationDesc,
prometheus.GaugeValue,
float64(metadata.PullStoppedAt.Sub(*metadata.PullStartedAt))*nanoseconds,
taskLabelVals...,
)
}

stats, err := c.client.RetrieveTaskStats(ctx)
if err != nil {
c.logger.Debug("Failed to retrieve container stats", "error", err)
@@ -223,41 +275,50 @@ func (c *collector) Collect(ch chan<- prometheus.Metric) {
continue
}

labelVals := []string{
containerLabelVals := []string{
container.Name,
container.Image,
}

for i, cpuUsage := range s.CPUStats.CPUUsage.PercpuUsage {
cpu := fmt.Sprintf("%d", i)
ch <- prometheus.MustNewConstMetric(
cpuTotalDesc,
prometheus.CounterValue,
float64(cpuUsage)/nanoSeconds,
append(labelVals, cpu)...,
)
}
ch <- prometheus.MustNewConstMetric(
cpuTotalDesc,
prometheus.CounterValue,
float64(s.CPUStats.CPUUsage.TotalUsage)*nanoseconds,
containerLabelVals...,
)

cacheValue := 0.0
if val, ok := s.MemoryStats.Stats["cache"]; ok {
cacheValue = float64(val)
}

// Report the container's memory limit as its own, if any, otherwise the
// task's limit. This is correct in that this is the precise logic used
// to configure the cgroups limit for the container.
var containerMemoryLimitMib int64
if container.Limits.Memory != nil {
containerMemoryLimitMib = *container.Limits.Memory
} else {
// This must be set if the container limit is not set, and thus is
// safe to dereference.
containerMemoryLimitMib = *metadata.Limits.Memory
}
for desc, value := range map[*prometheus.Desc]float64{
memUsageDesc: float64(s.MemoryStats.Usage),
memLimitDesc: float64(s.MemoryStats.Limit),
memCacheUsageDesc: cacheValue,
memUsageDesc: float64(s.MemoryStats.Usage),
memLimitDesc: float64(containerMemoryLimitMib * mebibytes),
memCacheSizeDesc: cacheValue,
} {
ch <- prometheus.MustNewConstMetric(
desc,
prometheus.GaugeValue,
value,
labelVals...,
containerLabelVals...,
)
}

// Network metrics per interface.
for iface, netStats := range s.Networks {
networkLabelVals := append(labelVals, iface)
networkLabelVals := append(containerLabelVals, iface)

for desc, value := range map[*prometheus.Desc]float64{
networkRxBytesDesc: float64(netStats.RxBytes),
