From 29ddf8599dfcb275ac221182644ee8c140ce6eca Mon Sep 17 00:00:00 2001 From: Lee Bernick Date: Fri, 30 Jun 2023 09:10:30 -0400 Subject: [PATCH] Fix units for metric taskrun_pod_latency Prior to this commit, the taskrun_pod_latency metric was calculated using pod timestamps and directly cast to float64. This calculation resulted in incorrectly using nanoseconds, when the metric was intended to have units of milliseconds. This commit fixes the duration conversion and adds units to the metric name, in line with Prometheus best practices. --- docs/metrics.md | 2 +- pkg/taskrunmetrics/metrics.go | 4 ++-- pkg/taskrunmetrics/metrics_test.go | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/metrics.md b/docs/metrics.md index 0efd8c56b83..b0b8ab75e2f 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -22,7 +22,7 @@ We expose several kinds of exporters, including Prometheus, Google Stackdriver, | `tekton_pipelines_controller_running_taskruns_count` | Gauge | | experimental | | `tekton_pipelines_controller_running_taskruns_throttled_by_quota_count` | Gauge | | experimental | | `tekton_pipelines_controller_running_taskruns_throttled_by_node_count` | Gauge | | experimental | -| `tekton_pipelines_controller_taskruns_pod_latency` | Gauge | `namespace`=<taskruns-namespace>
`pod`= < taskrun_pod_name>
`*task`=<task_name>
`*taskrun`=<taskrun_name>
| experimental | +| `tekton_pipelines_controller_taskruns_pod_latency_milliseconds` | Gauge | `namespace`=<taskruns-namespace>
`pod`= < taskrun_pod_name>
`*task`=<task_name>
`*taskrun`=<taskrun_name>
| experimental | | `tekton_pipelines_controller_client_latency_[bucket, sum, count]` | Histogram | | experimental | The Labels/Tag marked as "*" are optional. And there's a choice between Histogram and LastValue(Gauge) for pipelinerun and taskrun duration metrics. diff --git a/pkg/taskrunmetrics/metrics.go b/pkg/taskrunmetrics/metrics.go index 0d1a4457d14..473b8f4c604 100644 --- a/pkg/taskrunmetrics/metrics.go +++ b/pkg/taskrunmetrics/metrics.go @@ -86,7 +86,7 @@ var ( "Number of taskruns executing currently, but whose underlying Pods or Containers are suspended by k8s because of Node level constraints. Such suspensions can occur as part of initial scheduling of the Pod, or scheduling of any of the subsequent Container(s) in the Pod after the first Container is started", stats.UnitDimensionless) - podLatency = stats.Float64("taskruns_pod_latency", + podLatency = stats.Float64("taskruns_pod_latency_milliseconds", "scheduling latency for the taskruns pods", stats.UnitMilliseconds) ) @@ -438,7 +438,7 @@ func (r *Recorder) RecordPodLatency(ctx context.Context, pod *corev1.Pod, tr *v1 return err } - metrics.Record(ctx, podLatency.M(float64(latency))) + metrics.Record(ctx, podLatency.M(float64(latency.Milliseconds()))) return nil } diff --git a/pkg/taskrunmetrics/metrics_test.go b/pkg/taskrunmetrics/metrics_test.go index 0e9e927cf2a..fa3f6b6de37 100644 --- a/pkg/taskrunmetrics/metrics_test.go +++ b/pkg/taskrunmetrics/metrics_test.go @@ -531,7 +531,7 @@ func TestRecordPodLatency(t *testing.T) { "taskrun": "test-taskrun", "namespace": "foo", }, - expectedValue: 4e+09, + expectedValue: 4000, }, { name: "for non scheduled pod", pod: &corev1.Pod{ @@ -559,7 +559,7 @@ func TestRecordPodLatency(t *testing.T) { if err != nil { t.Errorf("RecordPodLatency: %v", err) } - metricstest.CheckLastValueData(t, "taskruns_pod_latency", td.expectedTags, td.expectedValue) + metricstest.CheckLastValueData(t, "taskruns_pod_latency_milliseconds", td.expectedTags, td.expectedValue) } }) } @@ 
-610,7 +610,7 @@ func TestTaskRunIsOfPipelinerun(t *testing.T) { } func unregisterMetrics() { - metricstest.Unregister("taskrun_duration_seconds", "pipelinerun_taskrun_duration_seconds", "taskrun_count", "running_taskruns_count", "running_taskruns_throttled_by_quota_count", "running_taskruns_throttled_by_node_count", "taskruns_pod_latency") + metricstest.Unregister("taskrun_duration_seconds", "pipelinerun_taskrun_duration_seconds", "taskrun_count", "running_taskruns_count", "running_taskruns_throttled_by_quota_count", "running_taskruns_throttled_by_node_count", "taskruns_pod_latency_milliseconds") // Allow the recorder singleton to be recreated. once = sync.Once{}