From 35c90d35fd6def2b47c246f2085a243d3aaa7228 Mon Sep 17 00:00:00 2001
From: gabemontero <gmontero@redhat.com>
Date: Thu, 11 Apr 2024 12:14:33 -0400
Subject: [PATCH] add namespace label/tag to non-deprecated throttle metrics

Back when implementing https://github.com/tektoncd/pipeline/pull/6744
for https://github.com/tektoncd/pipeline/issues/6631, we failed to
realize that, because k8s quota policies are namespace scoped, knowing
which namespace the throttled items are in has diagnostic value. After
running with the metric for a while, that gap has become very apparent.

This change introduces the namespace tag. Also, since this code was
last touched, the original metric was deprecated and a new one with a
shorter name was added, so this change only adds the new label to the
non-deprecated metric.

rh-pre-commit.version: 2.2.0
rh-pre-commit.check-secrets: ENABLED
---
 docs/metrics.md                    | 32 +++++++++----------
 pkg/taskrunmetrics/metrics.go      | 49 +++++++++++++++++++++++++-----
 pkg/taskrunmetrics/metrics_test.go |  4 ++-
 3 files changed, 60 insertions(+), 25 deletions(-)

diff --git a/docs/metrics.md b/docs/metrics.md
index 3df1df461ce..30171b8229e 100644
--- a/docs/metrics.md
+++ b/docs/metrics.md
@@ -11,24 +11,24 @@ The following pipeline metrics are available at `controller-service` on port `90
 We expose several kinds of exporters, including Prometheus, Google Stackdriver, and many others.
 You can set them up using [observability configuration](../config/config-observability.yaml).
 
-| Name | Type | Labels/Tags | Status |
-|-----------------------------------------------------------------------------------------| ----------- | ----------- | ----------- |
+| Name | Type | Labels/Tags | Status |
+|-----------------------------------------------------------------------------------------| ----------- |-------------------------------------------------| ----------- |
 | `tekton_pipelines_controller_pipelinerun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `*pipeline`=<pipeline_name> <br> `*pipelinerun`=<pipelinerun_name> <br> `status`=<status> <br> `namespace`=<pipelinerun-namespace> | experimental |
-| `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `*pipeline`=<pipeline_name> <br> `*pipelinerun`=<pipelinerun_name> <br> `status`=<status> <br> `*task`=<task_name> <br> `*taskrun`=<taskrun_name><br> `namespace`=<pipelineruns-taskruns-namespace>| experimental |
-| `tekton_pipelines_controller_pipelinerun_count` | Counter | `status`=<status> | deprecate |
-| `tekton_pipelines_controller_pipelinerun_total` | Counter | `status`=<status> | experimental |
-| `tekton_pipelines_controller_running_pipelineruns_count` | Gauge | | deprecate |
-| `tekton_pipelines_controller_running_pipelineruns` | Gauge | | experimental |
+| `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `*pipeline`=<pipeline_name> <br> `*pipelinerun`=<pipelinerun_name> <br> `status`=<status> <br> `*task`=<task_name> <br> `*taskrun`=<taskrun_name><br> `namespace`=<pipelineruns-taskruns-namespace> | experimental |
+| `tekton_pipelines_controller_pipelinerun_count` | Counter | `status`=<status> | deprecate |
+| `tekton_pipelines_controller_pipelinerun_total` | Counter | `status`=<status> | experimental |
+| `tekton_pipelines_controller_running_pipelineruns_count` | Gauge | | deprecate |
+| `tekton_pipelines_controller_running_pipelineruns` | Gauge | | experimental |
 | `tekton_pipelines_controller_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `status`=<status> <br> `*task`=<task_name> <br> `*taskrun`=<taskrun_name><br> `namespace`=<pipelineruns-taskruns-namespace> | experimental |
-| `tekton_pipelines_controller_taskrun_count` | Counter | `status`=<status> | deprecate |
-| `tekton_pipelines_controller_taskrun_total` | Counter | `status`=<status> | experimental |
-| `tekton_pipelines_controller_running_taskruns_count` | Gauge | | deprecate |
-| `tekton_pipelines_controller_running_taskruns` | Gauge | | experimental |
-| `tekton_pipelines_controller_running_taskruns_throttled_by_quota_count` | Gauge | | deprecate |
-| `tekton_pipelines_controller_running_taskruns_throttled_by_node_count` | Gauge | | deprecate |
-| `tekton_pipelines_controller_running_taskruns_throttled_by_quota` | Gauge | | experimental |
-| `tekton_pipelines_controller_running_taskruns_throttled_by_node` | Gauge | | experimental |
-| `tekton_pipelines_controller_client_latency_[bucket, sum, count]` | Histogram | | experimental |
+| `tekton_pipelines_controller_taskrun_count` | Counter | `status`=<status> | deprecate |
+| `tekton_pipelines_controller_taskrun_total` | Counter | `status`=<status> | experimental |
+| `tekton_pipelines_controller_running_taskruns_count` | Gauge | | deprecate |
+| `tekton_pipelines_controller_running_taskruns` | Gauge | | experimental |
+| `tekton_pipelines_controller_running_taskruns_throttled_by_quota_count` | Gauge | | deprecate |
+| `tekton_pipelines_controller_running_taskruns_throttled_by_node_count` | Gauge | | deprecate |
+| `tekton_pipelines_controller_running_taskruns_throttled_by_quota` | Gauge | `namespace`=<pipelinerun-namespace> | experimental |
+| `tekton_pipelines_controller_running_taskruns_throttled_by_node` | Gauge | `namespace`=<pipelinerun-namespace> | experimental |
+| `tekton_pipelines_controller_client_latency_[bucket, sum, count]` | Histogram | | experimental |
 
 The Labels/Tag marked as "*" are optional. And there's a choice between Histogram and LastValue(Gauge) for pipelinerun and taskrun duration metrics.
diff --git a/pkg/taskrunmetrics/metrics.go b/pkg/taskrunmetrics/metrics.go
index d60b5e567f9..89b2e680b1a 100644
--- a/pkg/taskrunmetrics/metrics.go
+++ b/pkg/taskrunmetrics/metrics.go
@@ -272,11 +272,13 @@ func viewRegister(cfg *config.Metrics) error {
 		Description: runningTRsThrottledByQuota.Description(),
 		Measure:     runningTRsThrottledByQuota,
 		Aggregation: view.LastValue(),
+		TagKeys:     []tag.Key{namespaceTag},
 	}
 	runningTRsThrottledByNodeView = &view.View{
 		Description: runningTRsThrottledByNode.Description(),
 		Measure:     runningTRsThrottledByNode,
 		Aggregation: view.LastValue(),
+		TagKeys:     []tag.Key{namespaceTag},
 	}
 	podLatencyView = &view.View{
 		Description: podLatency.Description(),
@@ -428,21 +430,40 @@ func (r *Recorder) RunningTaskRuns(ctx context.Context, lister listers.TaskRunLi
 	}
 
 	var runningTrs int
-	var trsThrottledByQuota int
-	var trsThrottledByNode int
+	trsThrottledByQuota := map[string]int{}
+	trsThrottledByQuotaCount := 0
+	trsThrottledByNode := map[string]int{}
+	trsThrottledByNodeCount := 0
 	var trsWaitResolvingTaskRef int
 	for _, pr := range trs {
+		// initialize metrics with namespace tag to zero if unset; will then update as needed below
+		_, ok := trsThrottledByQuota[pr.Namespace]
+		if !ok {
+			trsThrottledByQuota[pr.Namespace] = 0
+		}
+		_, ok = trsThrottledByNode[pr.Namespace]
+		if !ok {
+			trsThrottledByNode[pr.Namespace] = 0
+		}
+
 		if pr.IsDone() {
 			continue
 		}
 		runningTrs++
+
 		succeedCondition := pr.Status.GetCondition(apis.ConditionSucceeded)
 		if succeedCondition != nil && succeedCondition.Status == corev1.ConditionUnknown {
 			switch succeedCondition.Reason {
 			case pod.ReasonExceededResourceQuota:
-				trsThrottledByQuota++
+				trsThrottledByQuotaCount++
+				cnt := trsThrottledByQuota[pr.Namespace]
+				cnt++
+				trsThrottledByQuota[pr.Namespace] = cnt
 			case pod.ReasonExceededNodeResources:
-				trsThrottledByNode++
+				trsThrottledByNodeCount++
+				cnt := trsThrottledByNode[pr.Namespace]
+				cnt++
+				trsThrottledByNode[pr.Namespace] = cnt
 			case v1.TaskRunReasonResolvingTaskRef:
 				trsWaitResolvingTaskRef++
 			}
@@ -455,12 +476,24 @@ func (r *Recorder) RunningTaskRuns(ctx context.Context, lister listers.TaskRunLi
 	}
 	metrics.Record(ctx, runningTRsCount.M(float64(runningTrs)))
 	metrics.Record(ctx, runningTRs.M(float64(runningTrs)))
-	metrics.Record(ctx, runningTRsThrottledByNodeCount.M(float64(trsThrottledByNode)))
-	metrics.Record(ctx, runningTRsThrottledByQuotaCount.M(float64(trsThrottledByQuota)))
 	metrics.Record(ctx, runningTRsWaitingOnTaskResolutionCount.M(float64(trsWaitResolvingTaskRef)))
-	metrics.Record(ctx, runningTRsThrottledByNode.M(float64(trsThrottledByNode)))
-	metrics.Record(ctx, runningTRsThrottledByQuota.M(float64(trsThrottledByQuota)))
+	metrics.Record(ctx, runningTRsThrottledByQuotaCount.M(float64(trsThrottledByQuotaCount)))
+	metrics.Record(ctx, runningTRsThrottledByNodeCount.M(float64(trsThrottledByNodeCount)))
+	for ns, cnt := range trsThrottledByQuota {
+		ctx, err = tag.New(ctx, []tag.Mutator{tag.Upsert(namespaceTag, ns)}...)
+		if err != nil {
+			return err
+		}
+		metrics.Record(ctx, runningTRsThrottledByQuota.M(float64(cnt)))
+	}
+	for ns, cnt := range trsThrottledByNode {
+		ctx, err = tag.New(ctx, []tag.Mutator{tag.Upsert(namespaceTag, ns)}...)
+		if err != nil {
+			return err
+		}
+		metrics.Record(ctx, runningTRsThrottledByNode.M(float64(cnt)))
+	}
 	return nil
 }
diff --git a/pkg/taskrunmetrics/metrics_test.go b/pkg/taskrunmetrics/metrics_test.go
index c7e581d8962..b74f4c7d4cc 100644
--- a/pkg/taskrunmetrics/metrics_test.go
+++ b/pkg/taskrunmetrics/metrics_test.go
@@ -537,7 +537,7 @@ func TestRecordRunningTaskRunsThrottledCounts(t *testing.T) {
 		informer := faketaskruninformer.Get(ctx)
 		for i := 0; i < multiplier; i++ {
 			tr := &v1.TaskRun{
-				ObjectMeta: metav1.ObjectMeta{Name: names.SimpleNameGenerator.RestrictLengthWithRandomSuffix("taskrun-")},
+				ObjectMeta: metav1.ObjectMeta{Name: names.SimpleNameGenerator.RestrictLengthWithRandomSuffix("taskrun-"), Namespace: "test"},
 				Status: v1.TaskRunStatus{
 					Status: duckv1.Status{
 						Conditions: duckv1.Conditions{{
@@ -563,7 +563,9 @@ func TestRecordRunningTaskRunsThrottledCounts(t *testing.T) {
 			t.Errorf("RunningTaskRuns: %v", err)
 		}
 		metricstest.CheckLastValueData(t, "running_taskruns_throttled_by_quota_count", map[string]string{}, tc.quotaCount)
+		metricstest.CheckLastValueData(t, "running_taskruns_throttled_by_quota", map[string]string{namespaceTag.Name(): "test"}, tc.quotaCount)
 		metricstest.CheckLastValueData(t, "running_taskruns_throttled_by_node_count", map[string]string{}, tc.nodeCount)
+		metricstest.CheckLastValueData(t, "running_taskruns_throttled_by_node", map[string]string{namespaceTag.Name(): "test"}, tc.nodeCount)
 		metricstest.CheckLastValueData(t, "running_taskruns_waiting_on_task_resolution_count", map[string]string{}, tc.waitCount)
 	}
 }
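
For reference, below is a minimal, standalone sketch (not part of the patch itself) of the per-namespace recording pattern the patch applies to the throttle gauges: build a map of namespace -> count, register a LastValue view whose TagKeys include the namespace key, and record each entry under a context carrying that tag so the exported gauge gains a namespace label. It talks to OpenCensus directly rather than going through knative.dev/pkg/metrics as the controller does, and the metric name, tag key, and namespace values ("team-a", "team-b") are illustrative assumptions, not values taken from the Tekton code.

```go
package main

import (
	"context"
	"log"

	"go.opencensus.io/stats"
	"go.opencensus.io/stats/view"
	"go.opencensus.io/tag"
)

var (
	// namespaceKey plays the role of namespaceTag in the patch; the name is illustrative.
	namespaceKey = tag.MustNewKey("namespace")

	// throttledByQuota stands in for runningTRsThrottledByQuota.
	throttledByQuota = stats.Float64(
		"running_taskruns_throttled_by_quota",
		"Number of running TaskRuns throttled by a ResourceQuota, per namespace",
		stats.UnitDimensionless)
)

func main() {
	// Registering the view with TagKeys is what turns the tag into an
	// exported metric label, mirroring the TagKeys addition in viewRegister.
	if err := view.Register(&view.View{
		Name:        throttledByQuota.Name(),
		Description: throttledByQuota.Description(),
		Measure:     throttledByQuota,
		Aggregation: view.LastValue(),
		TagKeys:     []tag.Key{namespaceKey},
	}); err != nil {
		log.Fatalf("failed to register view: %v", err)
	}

	// Hypothetical per-namespace totals, analogous to the trsThrottledByQuota
	// map built while walking the TaskRun lister.
	perNamespace := map[string]int{"team-a": 3, "team-b": 0}

	for ns, cnt := range perNamespace {
		// A fresh tagged context per iteration (Upsert) avoids the pitfall of
		// tag.Insert being a no-op once the key is already present in the map.
		nsCtx, err := tag.New(context.Background(), tag.Upsert(namespaceKey, ns))
		if err != nil {
			log.Fatalf("failed to tag context: %v", err)
		}
		// Each Record call lands in its own time series, labeled by namespace.
		stats.Record(nsCtx, throttledByQuota.M(float64(cnt)))
	}
}
```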