From d6db0c9ca6f40fb5749263b8bdc680d0e315d873 Mon Sep 17 00:00:00 2001 From: Simon Pasquier Date: Mon, 5 Feb 2018 12:36:27 +0100 Subject: [PATCH 1/2] Add mesh metrics This change adds 2 new metrics for the mesh: * alertmanager_peer_connection, state of the connection between the Alertmanager instance and a peer. * alertmanager_peer_terminations_total, total number of terminated connections. It also moves the gathering of the alertmanager_peer_position metric outside of the meshWait() function so that the metric is computed accurately even when no alerting group fires. --- cmd/alertmanager/main.go | 111 ++++++++++++++++++++++++++++++++------- 1 file changed, 91 insertions(+), 20 deletions(-) diff --git a/cmd/alertmanager/main.go b/cmd/alertmanager/main.go index 0957c0494d..74a36e2a94 100644 --- a/cmd/alertmanager/main.go +++ b/cmd/alertmanager/main.go @@ -54,10 +54,6 @@ import ( ) var ( - peerPosition = prometheus.NewGauge(prometheus.GaugeOpts{ - Name: "alertmanager_peer_position", - Help: "Position the Alertmanager instance believes it's in. 
The position determines a peer's behavior in the cluster.", - }) configHash = prometheus.NewGauge(prometheus.GaugeOpts{ Name: "alertmanager_config_hash", Help: "Hash of the currently loaded alertmanager configuration.", @@ -75,7 +71,6 @@ var ( ) func init() { - prometheus.MustRegister(peerPosition) prometheus.MustRegister(configSuccess) prometheus.MustRegister(configSuccessTime) prometheus.MustRegister(configHash) @@ -154,6 +149,7 @@ func main() { level.Error(logger).Log("msg", "Unable to initialize gossip mesh", "err", err) os.Exit(1) } + prometheus.MustRegister(NewMeshCollector(mrouter)) } stopc := make(chan struct{}) @@ -382,31 +378,106 @@ func main() { level.Info(logger).Log("msg", "Received SIGTERM, exiting gracefully...") } +type meshCollector struct { + router *mesh.Router + connDesc *prometheus.Desc + posDesc *prometheus.Desc + termDesc *prometheus.Desc +} + +func NewMeshCollector(router *mesh.Router) *meshCollector { + return &meshCollector{ + router: router, + connDesc: prometheus.NewDesc( + "alertmanager_peer_connection", + "State of the connection between the Alertmanager instance and a peer.", + []string{"peer", "nick"}, + prometheus.Labels{}, + ), + posDesc: prometheus.NewDesc( + "alertmanager_peer_position", + "Position the Alertmanager instance believes it's in. 
The position determines a peer's behavior in the cluster.", + []string{}, + prometheus.Labels{}, + ), + termDesc: prometheus.NewDesc( + "alertmanager_peer_terminations_total", + "Total number of terminated connections between the AlertManager and its peers.", + []string{}, + prometheus.Labels{}, + ), + } +} + +// Describe implements the prometheus.Collector interface +func (c *meshCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- c.connDesc + ch <- c.posDesc + ch <- c.termDesc +} + +// Collect implements the prometheus.Collector interface +func (c *meshCollector) Collect(ch chan<- prometheus.Metric) { + status := mesh.NewStatus(c.router) + for _, peer := range status.Peers { + // collect only metrics for the local peer + if status.Name != peer.Name { + continue + } + for _, conn := range peer.Connections { + var v float64 + if conn.Established { + v = 1 + } + ch <- prometheus.MustNewConstMetric( + c.connDesc, + prometheus.GaugeValue, + v, + conn.Name, conn.NickName, + ) + } + } + ch <- prometheus.MustNewConstMetric( + c.posDesc, + prometheus.GaugeValue, + float64(meshPeerPosition(c.router)), + ) + ch <- prometheus.MustNewConstMetric( + c.termDesc, + prometheus.GaugeValue, + float64(status.TerminationCount), + ) +} + type peerDescSlice []mesh.PeerDescription func (s peerDescSlice) Len() int { return len(s) } func (s peerDescSlice) Less(i, j int) bool { return s[i].UID < s[j].UID } func (s peerDescSlice) Swap(i, j int) { s[i], s[j] = s[j], s[i] } +// meshPeerPosition returns the position of the local peer in the mesh. +func meshPeerPosition(r *mesh.Router) int { + var peers peerDescSlice + for _, desc := range r.Peers.Descriptions() { + peers = append(peers, desc) + } + sort.Sort(peers) + + k := 0 + for _, desc := range peers { + if desc.Self { + break + } + k++ + } + return k +} + // meshWait returns a function that inspects the current peer state and returns // a duration of one base timeout for each peer with a higher ID than ourselves. 
func meshWait(r *mesh.Router, timeout time.Duration) func() time.Duration { return func() time.Duration { - var peers peerDescSlice - for _, desc := range r.Peers.Descriptions() { - peers = append(peers, desc) - } - sort.Sort(peers) - - k := 0 - for _, desc := range peers { - if desc.Self { - break - } - k++ - } - peerPosition.Set(float64(k)) - return time.Duration(k) * timeout + return time.Duration(meshPeerPosition(r)) * timeout } } From 63f4ad8c020960f72bae382848a0664a49de46e3 Mon Sep 17 00:00:00 2001 From: Simon Pasquier Date: Tue, 6 Feb 2018 10:51:39 +0100 Subject: [PATCH 2/2] Remove 'nick' label from alertmanager_peer_connection metric --- cmd/alertmanager/main.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/alertmanager/main.go b/cmd/alertmanager/main.go index 74a36e2a94..7f6e932290 100644 --- a/cmd/alertmanager/main.go +++ b/cmd/alertmanager/main.go @@ -391,7 +391,7 @@ func NewMeshCollector(router *mesh.Router) *meshCollector { connDesc: prometheus.NewDesc( "alertmanager_peer_connection", "State of the connection between the Alertmanager instance and a peer.", - []string{"peer", "nick"}, + []string{"peer"}, prometheus.Labels{}, ), posDesc: prometheus.NewDesc( @@ -433,7 +433,7 @@ func (c *meshCollector) Collect(ch chan<- prometheus.Metric) { c.connDesc, prometheus.GaugeValue, v, - conn.Name, conn.NickName, + conn.Name, ) } }