Skip to content

Commit

Permalink
cluster: gather alertmanager_peer_position all the time (#1247)
Browse files Browse the repository at this point in the history
* cluster: gather alertmanager_peer_position all the time

This change moves the gathering of the alertmanager_peer_position metric
outside of the clusterWait() function so that the metric is computed
accurately even when no alerting group fires.

* cluster: add alertmanager_cluster_health_score metric

This metric is retrieved from the memberlist library.
  • Loading branch information
simonpasquier authored and stuartnelson3 committed Feb 27, 2018
1 parent c2dac90 commit 3df0939
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 21 deletions.
32 changes: 31 additions & 1 deletion cluster/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"io/ioutil"
"math/rand"
"net"
"sort"
"strconv"
"strings"
"sync"
Expand Down Expand Up @@ -191,6 +192,23 @@ func (p *Peer) Peers() []*memberlist.Node {
return p.mlist.Members()
}

// Position reports where this peer sits among the cluster members once they
// are ordered by ascending name. The first member has position 0; if no
// member carries this peer's name, the result equals the number of members.
func (p *Peer) Position() int {
	nodes := p.Peers()
	byName := func(a, b int) bool { return nodes[a].Name < nodes[b].Name }
	sort.Slice(nodes, byName)

	// The position is the index of the first entry matching our own name.
	for idx := range nodes {
		if nodes[idx].Name == p.Self().Name {
			return idx
		}
	}
	return len(nodes)
}

// State is a piece of state that can be serialized and merged with other
// serialized state.
type State interface {
Expand Down Expand Up @@ -255,8 +273,20 @@ func newDelegate(l log.Logger, reg prometheus.Registerer, p *Peer) *delegate {
}, func() float64 {
return float64(p.ClusterSize())
})
peerPosition := prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Name: "alertmanager_peer_position",
Help: "Position the Alertmanager instance believes it's in. The position determines a peer's behavior in the cluster.",
}, func() float64 {
return float64(p.Position())
})
healthScore := prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Name: "alertmanager_cluster_health_score",
Help: "Health score of the cluster. Lower values are better and zero means 'totally healthy'.",
}, func() float64 {
return float64(p.mlist.GetHealthScore())
})

reg.MustRegister(messagesReceived, messagesReceivedSize, gossipClusterMembers)
reg.MustRegister(messagesReceived, messagesReceivedSize, gossipClusterMembers, peerPosition, healthScore)

return &delegate{
logger: l,
Expand Down
21 changes: 1 addition & 20 deletions cmd/alertmanager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ import (
"os/signal"
"path/filepath"
"runtime"
"sort"
"strings"
"sync"
"syscall"
Expand Down Expand Up @@ -53,10 +52,6 @@ import (
)

var (
peerPosition = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "alertmanager_peer_position",
Help: "Position the Alertmanager instance believes it's in. The position determines a peer's behavior in the cluster.",
})
configHash = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "alertmanager_config_hash",
Help: "Hash of the currently loaded alertmanager configuration.",
Expand All @@ -74,7 +69,6 @@ var (
)

func init() {
prometheus.MustRegister(peerPosition)
prometheus.MustRegister(configSuccess)
prometheus.MustRegister(configSuccessTime)
prometheus.MustRegister(configHash)
Expand Down Expand Up @@ -384,20 +378,7 @@ func main() {
// a duration of one base timeout for each peer with a higher ID than ourselves.
func clusterWait(p *cluster.Peer, timeout time.Duration) func() time.Duration {
return func() time.Duration {
all := p.Peers()
sort.Slice(all, func(i, j int) bool {
return all[i].Name < all[j].Name
})

k := 0
for _, n := range all {
if n.Name == p.Self().Name {
break
}
k++
}
peerPosition.Set(float64(k))
return time.Duration(k) * timeout
return time.Duration(p.Position()) * timeout
}
}

Expand Down

0 comments on commit 3df0939

Please sign in to comment.