diff --git a/pkg/daemon/metrics.go b/pkg/daemon/metrics.go index 49b0ad079b..5923c257c3 100644 --- a/pkg/daemon/metrics.go +++ b/pkg/daemon/metrics.go @@ -39,7 +39,7 @@ var ( prometheus.GaugeOpts{ Name: "mcd_pivot_err", Help: "errors encountered during pivot", - }, []string{"pivot_target", "err"}) + }, []string{"node", "pivot_target", "err"}) // MCDState is state of mcd for indicated node (ex: degraded) MCDState = prometheus.NewGaugeVec( @@ -59,7 +59,7 @@ var ( MCDRebootErr = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "mcd_reboot_err", - }, []string{"message", "err"}) + }, []string{"node", "message", "err"}) // MCDUpdateState logs completed update or error MCDUpdateState = prometheus.NewGaugeVec( @@ -88,9 +88,9 @@ func registerMCDMetrics() error { } } - MCDPivotErr.WithLabelValues("", "").Set(0) + MCDPivotErr.WithLabelValues("", "", "").Set(0) KubeletHealthState.Set(0) - MCDRebootErr.WithLabelValues("", "").Set(0) + MCDRebootErr.WithLabelValues("", "", "").Set(0) MCDUpdateState.WithLabelValues("", "").Set(0) return nil diff --git a/pkg/daemon/update.go b/pkg/daemon/update.go index 0c032364b3..74af1ec108 100644 --- a/pkg/daemon/update.go +++ b/pkg/daemon/update.go @@ -1614,13 +1614,13 @@ func (dn *Daemon) reboot(rationale string) error { // either, we just have one for the MCD itself. if err := rebootCmd.Run(); err != nil { dn.logSystem("failed to run reboot: %v", err) - MCDRebootErr.WithLabelValues("failed to run reboot", err.Error()).SetToCurrentTime() + MCDRebootErr.WithLabelValues(dn.node.Name, "failed to run reboot", err.Error()).SetToCurrentTime() } // wait to be killed via SIGTERM from the kubelet shutting down time.Sleep(defaultRebootTimeout) // if everything went well, this should be unreachable. - MCDRebootErr.WithLabelValues("reboot failed", "this error should be unreachable, something is seriously wrong").SetToCurrentTime() + MCDRebootErr.WithLabelValues(dn.node.Name, "reboot failed", "this error should be unreachable, something is seriously wrong").SetToCurrentTime() return fmt.Errorf("reboot failed; this error should be unreachable, something is seriously wrong") }