From f497653025613a0adac385c08efed3c3887452ef Mon Sep 17 00:00:00 2001 From: Sunil Arora Date: Thu, 6 Dec 2018 11:25:59 -0800 Subject: [PATCH] :sparkles: added reconciles_total metric --- pkg/internal/controller/controller.go | 5 +- pkg/internal/controller/controller_test.go | 129 ++++++++++++++++++++- pkg/internal/controller/metrics/metrics.go | 16 ++- 3 files changed, 143 insertions(+), 7 deletions(-) diff --git a/pkg/internal/controller/controller.go b/pkg/internal/controller/controller.go index 4d6f80f6c7..eb4fc01294 100644 --- a/pkg/internal/controller/controller.go +++ b/pkg/internal/controller/controller.go @@ -214,13 +214,15 @@ func (c *Controller) processNextWorkItem() bool { c.Queue.AddRateLimited(req) log.Error(err, "Reconciler error", "controller", c.Name, "request", req) ctrlmetrics.ReconcileErrors.WithLabelValues(c.Name).Inc() - + ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, "error").Inc() return false } else if result.RequeueAfter > 0 { c.Queue.AddAfter(req, result.RequeueAfter) + ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, "requeue_after").Inc() return true } else if result.Requeue { c.Queue.AddRateLimited(req) + ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, "requeue").Inc() return true } @@ -231,6 +233,7 @@ func (c *Controller) processNextWorkItem() bool { // TODO(directxman12): What does 1 mean? Do we want level constants? Do we want levels at all? log.V(1).Info("Successfully Reconciled", "controller", c.Name, "request", req) + ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, "success").Inc() // Return true, don't take a break return true } diff --git a/pkg/internal/controller/controller_test.go b/pkg/internal/controller/controller_test.go index 1269c07a29..7fee71bb8d 100644 --- a/pkg/internal/controller/controller_test.go +++ b/pkg/internal/controller/controller_test.go @@ -408,6 +408,129 @@ var _ = Describe("controller", func() { // TODO(community): write this test }) + Context("prometheus metric reconcile_total", func() { + var reconcileTotal dto.Metric + + BeforeEach(func() { + ctrlmetrics.ReconcileTotal.Reset() + }) + + It("should get updated on successful reconciliation", func(done Done) { + Expect(func() error { + ctrlmetrics.ReconcileTotal.WithLabelValues(ctrl.Name, "success").Write(&reconcileTotal) + if reconcileTotal.GetCounter().GetValue() != 0.0 { + return fmt.Errorf("metric reconcile total not reset") + } + return nil + }()).Should(Succeed()) + + go func() { + defer GinkgoRecover() + Expect(ctrl.Start(stop)).NotTo(HaveOccurred()) + }() + By("Invoking Reconciler which will succeed") + ctrl.Queue.Add(request) + + Expect(<-reconciled).To(Equal(request)) + Eventually(func() error { + ctrlmetrics.ReconcileTotal.WithLabelValues(ctrl.Name, "success").Write(&reconcileTotal) + if reconcileTotal.GetCounter().GetValue() != 1.0 { + return fmt.Errorf("metric reconcile total not updated") + } + return nil + }, 2.0).Should(Succeed()) + + close(done) + }, 2.0) + + It("should get updated on reconcile errors", func(done Done) { + Expect(func() error { + ctrlmetrics.ReconcileTotal.WithLabelValues(ctrl.Name, "error").Write(&reconcileTotal) + if reconcileTotal.GetCounter().GetValue() != 0.0 { + return fmt.Errorf("metric reconcile total not reset") + } + return nil + }()).Should(Succeed()) + + fakeReconcile.Err = fmt.Errorf("expected error: reconcile") + go func() { + defer GinkgoRecover() + Expect(ctrl.Start(stop)).NotTo(HaveOccurred()) + }() + By("Invoking Reconciler which will give an error") + ctrl.Queue.Add(request) + + Expect(<-reconciled).To(Equal(request)) + Eventually(func() error { + ctrlmetrics.ReconcileTotal.WithLabelValues(ctrl.Name, "error").Write(&reconcileTotal) + if reconcileTotal.GetCounter().GetValue() != 1.0 { + return fmt.Errorf("metric reconcile total not updated") + } + return nil + }, 2.0).Should(Succeed()) + + close(done) + }, 2.0) + + It("should get updated when reconcile returns with retry enabled", func(done Done) { + Expect(func() error { + ctrlmetrics.ReconcileTotal.WithLabelValues(ctrl.Name, "retry").Write(&reconcileTotal) + if reconcileTotal.GetCounter().GetValue() != 0.0 { + return fmt.Errorf("metric reconcile total not reset") + } + return nil + }()).Should(Succeed()) + + fakeReconcile.Result.Requeue = true + go func() { + defer GinkgoRecover() + Expect(ctrl.Start(stop)).NotTo(HaveOccurred()) + }() + By("Invoking Reconciler which will return result with Requeue enabled") + ctrl.Queue.Add(request) + + Expect(<-reconciled).To(Equal(request)) + Eventually(func() error { + ctrlmetrics.ReconcileTotal.WithLabelValues(ctrl.Name, "requeue").Write(&reconcileTotal) + if reconcileTotal.GetCounter().GetValue() != 1.0 { + return fmt.Errorf("metric reconcile total not updated") + } + return nil + }, 2.0).Should(Succeed()) + + close(done) + }, 2.0) + + It("should get updated when reconcile returns with retryAfter enabled", func(done Done) { + Expect(func() error { + ctrlmetrics.ReconcileTotal.WithLabelValues(ctrl.Name, "retry_after").Write(&reconcileTotal) + if reconcileTotal.GetCounter().GetValue() != 0.0 { + return fmt.Errorf("metric reconcile total not reset") + } + return nil + }()).Should(Succeed()) + + fakeReconcile.Result.RequeueAfter = 5 * time.Hour + go func() { + defer GinkgoRecover() + Expect(ctrl.Start(stop)).NotTo(HaveOccurred()) + }() + By("Invoking Reconciler which will return result with requeueAfter enabled") + ctrl.Queue.Add(request) + + Expect(<-reconciled).To(Equal(request)) + Eventually(func() error { + ctrlmetrics.ReconcileTotal.WithLabelValues(ctrl.Name, "requeue_after").Write(&reconcileTotal) + if reconcileTotal.GetCounter().GetValue() != 1.0 { + return fmt.Errorf("metric reconcile total not updated") + } + return nil + }, 2.0).Should(Succeed()) + + close(done) + }, 2.0) + }) + Context("should update prometheus metrics", func() { It("should requeue a Request if there is an error and continue processing items", func(done Done) { var queueLength, reconcileErrs dto.Metric @@ -415,7 +538,7 @@ var _ = Describe("controller", func() { Expect(func() error { ctrlmetrics.QueueLength.WithLabelValues(ctrl.Name).Write(&queueLength) if queueLength.GetGauge().GetValue() != 0.0 { - return fmt.Errorf("metrics not reset") + return fmt.Errorf("metric queue length not reset") } return nil }()).Should(Succeed()) @@ -424,7 +547,7 @@ var _ = Describe("controller", func() { Expect(func() error { ctrlmetrics.ReconcileErrors.WithLabelValues(ctrl.Name).Write(&reconcileErrs) if reconcileErrs.GetCounter().GetValue() != 0.0 { - return fmt.Errorf("metrics not reset") + return fmt.Errorf("metric reconcile errors not reset") } return nil }()).Should(Succeed()) @@ -444,7 +567,7 @@ var _ = Describe("controller", func() { Eventually(func() error { ctrlmetrics.QueueLength.WithLabelValues(ctrl.Name).Write(&queueLength) if queueLength.GetGauge().GetValue() != 1.0 { - return fmt.Errorf("metrics not updated") + return fmt.Errorf("metric queue length not updated") } return nil }, 2.0).Should(Succeed()) diff --git a/pkg/internal/controller/metrics/metrics.go b/pkg/internal/controller/metrics/metrics.go index 944ca8e035..ff1206aee6 100644 --- a/pkg/internal/controller/metrics/metrics.go +++ b/pkg/internal/controller/metrics/metrics.go @@ -29,24 +29,34 @@ var ( Help: "Length of reconcile queue per controller", }, []string{"controller"}) + // ReconcileTotal is a prometheus counter metrics which holds the total + // number of reconciliations per controller. It has two labels. controller label refers + // to the controller name and result label refers to the reconcile result i.e + // success, error, requeue, requeue_after + ReconcileTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "controller_runtime_reconcile_total", + Help: "Total number of reconciliations per controller", + }, []string{"controller", "result"}) + // ReconcileErrors is a prometheus counter metrics which holds the total // number of errors from the Reconciler ReconcileErrors = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "controller_runtime_reconcile_errors_total", - Help: "Total number of reconcile errors per controller", + Help: "Total number of reconciliation errors per controller", }, []string{"controller"}) // ReconcileTime is a prometheus metric which keeps track of the duration - // of reconciles + // of reconciliations ReconcileTime = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Name: "controller_runtime_reconcile_time_seconds", - Help: "Length of time per reconcile per controller", + Help: "Length of time per reconcilation per controller", }, []string{"controller"}) ) func init() { metrics.Registry.MustRegister( QueueLength, + ReconcileTotal, ReconcileErrors, ReconcileTime, // expose process metrics like CPU, Memory, file descriptor usage etc.