From 29ddf8599dfcb275ac221182644ee8c140ce6eca Mon Sep 17 00:00:00 2001 From: Lee Bernick Date: Fri, 30 Jun 2023 09:10:30 -0400 Subject: [PATCH] Fix units for metric taskrun_pod_latency Prior to this commit, the taskrun_pod_latency metric was calculated using pod timestamps and directly cast to float64. This calculation resulted in incorrectly using nanoseconds, when the metric was intended to have units of milliseconds. This commit fixes the duration conversion and adds units to the metric name, in line with Prometheus best practices. --- docs/metrics.md | 2 +- pkg/taskrunmetrics/metrics.go | 4 ++-- pkg/taskrunmetrics/metrics_test.go | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/metrics.md b/docs/metrics.md index 0efd8c56b83..b0b8ab75e2f 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -22,7 +22,7 @@ We expose several kinds of exporters, including Prometheus, Google Stackdriver, | `tekton_pipelines_controller_running_taskruns_count` | Gauge | | experimental | | `tekton_pipelines_controller_running_taskruns_throttled_by_quota_count` | Gauge | | experimental | | `tekton_pipelines_controller_running_taskruns_throttled_by_node_count` | Gauge | | experimental | -| `tekton_pipelines_controller_taskruns_pod_latency` | Gauge | `namespace`=<taskruns-namespace>
`pod`= < taskrun_pod_name>
`*task`=<task_name>
`*taskrun`=<taskrun_name>
| experimental | +| `tekton_pipelines_controller_taskruns_pod_latency_milliseconds` | Gauge | `namespace`=<taskruns-namespace>
`pod`= < taskrun_pod_name>
`*task`=<task_name>
`*taskrun`=<taskrun_name>
| experimental | | `tekton_pipelines_controller_client_latency_[bucket, sum, count]` | Histogram | | experimental | The Labels/Tag marked as "*" are optional. And there's a choice between Histogram and LastValue(Gauge) for pipelinerun and taskrun duration metrics. diff --git a/pkg/taskrunmetrics/metrics.go b/pkg/taskrunmetrics/metrics.go index 0d1a4457d14..473b8f4c604 100644 --- a/pkg/taskrunmetrics/metrics.go +++ b/pkg/taskrunmetrics/metrics.go @@ -86,7 +86,7 @@ var ( "Number of taskruns executing currently, but whose underlying Pods or Containers are suspended by k8s because of Node level constraints. Such suspensions can occur as part of initial scheduling of the Pod, or scheduling of any of the subsequent Container(s) in the Pod after the first Container is started", stats.UnitDimensionless) - podLatency = stats.Float64("taskruns_pod_latency", + podLatency = stats.Float64("taskruns_pod_latency_milliseconds", "scheduling latency for the taskruns pods", stats.UnitMilliseconds) ) @@ -438,7 +438,7 @@ func (r *Recorder) RecordPodLatency(ctx context.Context, pod *corev1.Pod, tr *v1 return err } - metrics.Record(ctx, podLatency.M(float64(latency))) + metrics.Record(ctx, podLatency.M(float64(latency.Milliseconds()))) return nil } diff --git a/pkg/taskrunmetrics/metrics_test.go b/pkg/taskrunmetrics/metrics_test.go index 0e9e927cf2a..fa3f6b6de37 100644 --- a/pkg/taskrunmetrics/metrics_test.go +++ b/pkg/taskrunmetrics/metrics_test.go @@ -531,7 +531,7 @@ func TestRecordPodLatency(t *testing.T) { "taskrun": "test-taskrun", "namespace": "foo", }, - expectedValue: 4e+09, + expectedValue: 4000, }, { name: "for non scheduled pod", pod: &corev1.Pod{ @@ -559,7 +559,7 @@ func TestRecordPodLatency(t *testing.T) { if err != nil { t.Errorf("RecordPodLatency: %v", err) } - metricstest.CheckLastValueData(t, "taskruns_pod_latency", td.expectedTags, td.expectedValue) + metricstest.CheckLastValueData(t, "taskruns_pod_latency_milliseconds", td.expectedTags, td.expectedValue) } }) } @@ 
-610,7 +610,7 @@ func TestTaskRunIsOfPipelinerun(t *testing.T) { } func unregisterMetrics() { - metricstest.Unregister("taskrun_duration_seconds", "pipelinerun_taskrun_duration_seconds", "taskrun_count", "running_taskruns_count", "running_taskruns_throttled_by_quota_count", "running_taskruns_throttled_by_node_count", "taskruns_pod_latency") + metricstest.Unregister("taskrun_duration_seconds", "pipelinerun_taskrun_duration_seconds", "taskrun_count", "running_taskruns_count", "running_taskruns_throttled_by_quota_count", "running_taskruns_throttled_by_node_count", "taskruns_pod_latency_milliseconds") // Allow the recorder singleton to be recreated. once = sync.Once{}