diff --git a/cluster/cluster.go b/cluster/cluster.go
index a8b0cebe11..6a8c4e6200 100644
--- a/cluster/cluster.go
+++ b/cluster/cluster.go
@@ -5,6 +5,7 @@ import (
 	"io/ioutil"
 	"math/rand"
 	"net"
+	"sort"
 	"strconv"
 	"strings"
 	"sync"
@@ -191,6 +192,27 @@ func (p *Peer) Peers() []*memberlist.Node {
 	return p.mlist.Members()
 }
 
+// Position returns the zero-based index of this peer among all cluster
+// members sorted by name. If the local node is not found in the member list,
+// the number of members is returned.
+func (p *Peer) Position() int {
+	all := p.Peers()
+	sort.Slice(all, func(i, j int) bool {
+		return all[i].Name < all[j].Name
+	})
+
+	// Resolve the local node's name once instead of on every iteration.
+	selfName := p.Self().Name
+	k := 0
+	for _, n := range all {
+		if n.Name == selfName {
+			break
+		}
+		k++
+	}
+	return k
+}
+
 // State is a piece of state that can be serialized and merged with other
 // serialized state.
 type State interface {
@@ -255,8 +277,20 @@ func newDelegate(l log.Logger, reg prometheus.Registerer, p *Peer) *delegate {
 	}, func() float64 {
 		return float64(p.ClusterSize())
 	})
+	peerPosition := prometheus.NewGaugeFunc(prometheus.GaugeOpts{
+		Name: "alertmanager_peer_position",
+		Help: "Position the Alertmanager instance believes it's in. The position determines a peer's behavior in the cluster.",
+	}, func() float64 {
+		return float64(p.Position())
+	})
+	healthScore := prometheus.NewGaugeFunc(prometheus.GaugeOpts{
+		Name: "alertmanager_cluster_health_score",
+		Help: "Health score of the cluster. Lower values are better and zero means 'totally healthy'.",
+	}, func() float64 {
+		return float64(p.mlist.GetHealthScore())
+	})
 
-	reg.MustRegister(messagesReceived, messagesReceivedSize, gossipClusterMembers)
+	reg.MustRegister(messagesReceived, messagesReceivedSize, gossipClusterMembers, peerPosition, healthScore)
 
 	return &delegate{
 		logger: l,
diff --git a/cmd/alertmanager/main.go b/cmd/alertmanager/main.go
index 1cc6d63ccf..d69d85bd39 100644
--- a/cmd/alertmanager/main.go
+++ b/cmd/alertmanager/main.go
@@ -24,7 +24,6 @@ import (
 	"os/signal"
 	"path/filepath"
 	"runtime"
-	"sort"
 	"strings"
 	"sync"
 	"syscall"
@@ -53,10 +52,6 @@ import (
 )
 
 var (
-	peerPosition = prometheus.NewGauge(prometheus.GaugeOpts{
-		Name: "alertmanager_peer_position",
-		Help: "Position the Alertmanager instance believes it's in. The position determines a peer's behavior in the cluster.",
-	})
 	configHash = prometheus.NewGauge(prometheus.GaugeOpts{
 		Name: "alertmanager_config_hash",
 		Help: "Hash of the currently loaded alertmanager configuration.",
@@ -74,7 +69,6 @@ var (
 )
 
 func init() {
-	prometheus.MustRegister(peerPosition)
 	prometheus.MustRegister(configSuccess)
 	prometheus.MustRegister(configSuccessTime)
 	prometheus.MustRegister(configHash)
@@ -384,20 +378,7 @@ func main() {
 // a duration of one base timeout for each peer with a higher ID than ourselves.
 func clusterWait(p *cluster.Peer, timeout time.Duration) func() time.Duration {
 	return func() time.Duration {
-		all := p.Peers()
-		sort.Slice(all, func(i, j int) bool {
-			return all[i].Name < all[j].Name
-		})
-
-		k := 0
-		for _, n := range all {
-			if n.Name == p.Self().Name {
-				break
-			}
-			k++
-		}
-		peerPosition.Set(float64(k))
-		return time.Duration(k) * timeout
+		return time.Duration(p.Position()) * timeout
 	}
 }
 