-
Notifications
You must be signed in to change notification settings - Fork 2.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add cluster peers DNS refresh job #1428
Changes from 1 commit
5985377
5b64904
bf248a5
9c69c38
1e4bcb6
e955b4c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -47,8 +47,13 @@ type Peer struct { | |
peers map[string]peer | ||
failedPeers []peer | ||
|
||
knownPeers []string | ||
advertiseAddr string | ||
|
||
failedReconnectionsCounter prometheus.Counter | ||
reconnectionsCounter prometheus.Counter | ||
failedRefreshCounter prometheus.Counter | ||
refreshCounter prometheus.Counter | ||
peerLeaveCounter prometheus.Counter | ||
peerUpdateCounter prometheus.Counter | ||
peerJoinCounter prometheus.Counter | ||
|
@@ -95,6 +100,7 @@ const ( | |
DefaultProbeInterval = 1 * time.Second | ||
DefaultReconnectInterval = 10 * time.Second | ||
DefaultReconnectTimeout = 6 * time.Hour | ||
DefaultRefreshInterval = 30 * time.Second | ||
maxGossipPacketSize = 1400 | ||
) | ||
|
||
|
@@ -112,6 +118,7 @@ func Join( | |
probeInterval time.Duration, | ||
reconnectInterval time.Duration, | ||
reconnectTimeout time.Duration, | ||
refreshInterval time.Duration, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As this is not configurable via a command line flag anymore, there is no reason for the parameter, right? |
||
) (*Peer, error) { | ||
bindHost, bindPortStr, err := net.SplitHostPort(bindAddr) | ||
if err != nil { | ||
|
@@ -164,11 +171,12 @@ func Join( | |
} | ||
|
||
p := &Peer{ | ||
states: map[string]State{}, | ||
stopc: make(chan struct{}), | ||
readyc: make(chan struct{}), | ||
logger: l, | ||
peers: map[string]peer{}, | ||
states: map[string]State{}, | ||
stopc: make(chan struct{}), | ||
readyc: make(chan struct{}), | ||
logger: l, | ||
peers: map[string]peer{}, | ||
knownPeers: knownPeers, | ||
} | ||
|
||
p.register(reg) | ||
|
@@ -221,6 +229,9 @@ func Join( | |
if reconnectTimeout != 0 { | ||
go p.handleReconnectTimeout(5*time.Minute, reconnectTimeout) | ||
} | ||
if refreshInterval != 0 { | ||
go p.handleRefresh(refreshInterval) | ||
} | ||
|
||
return p, nil | ||
} | ||
|
@@ -298,6 +309,15 @@ func (p *Peer) register(reg prometheus.Registerer) { | |
Help: "A counter of the number of cluster peer reconnections.", | ||
}) | ||
|
||
p.failedRefreshCounter = prometheus.NewCounter(prometheus.CounterOpts{ | ||
Name: "alertmanager_cluster_refresh_failed_total", | ||
Help: "A counter of the number of failed cluster peer refresh attempts.", | ||
}) | ||
p.refreshCounter = prometheus.NewCounter(prometheus.CounterOpts{ | ||
Name: "alertmanager_cluster_refresh_total", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Metric name and metric description seem diverged to me. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree, renamed to |
||
Help: "A counter of the number of cluster peer joined via refresh.", | ||
}) | ||
|
||
p.peerLeaveCounter = prometheus.NewCounter(prometheus.CounterOpts{ | ||
Name: "alertmanager_cluster_peers_left_total", | ||
Help: "A counter of the number of peers that have left.", | ||
|
@@ -312,7 +332,7 @@ func (p *Peer) register(reg prometheus.Registerer) { | |
}) | ||
|
||
reg.MustRegister(clusterFailedPeers, p.failedReconnectionsCounter, p.reconnectionsCounter, | ||
p.peerLeaveCounter, p.peerUpdateCounter, p.peerJoinCounter) | ||
p.peerLeaveCounter, p.peerUpdateCounter, p.peerJoinCounter, p.refreshCounter, p.failedRefreshCounter) | ||
} | ||
|
||
func (p *Peer) handleReconnectTimeout(d time.Duration, timeout time.Duration) { | ||
|
@@ -382,6 +402,50 @@ func (p *Peer) reconnect() { | |
} | ||
} | ||
|
||
func (p *Peer) handleRefresh(d time.Duration) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we're talking about proper DNS support for alertmanager, it would be better to respect the TTL of the record as advertised by the authority. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would you shorten the interval to 10s or 15s? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @grobie It looks like Go net package doesn't expose TTL values https://golang.org/pkg/net/#IPAddr. I personally prefer to use this, as it's a simpler solution, but I can prepare a change if you want. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This ends up relying on using the internal resolver, which should be properly caching/refreshing responses. I would say we can shorten the time. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 Shortened to 15s. |
||
tick := time.NewTicker(d) | ||
defer tick.Stop() | ||
|
||
for { | ||
select { | ||
case <-p.stopc: | ||
return | ||
case <-tick.C: | ||
p.refresh() | ||
} | ||
} | ||
} | ||
|
||
func (p *Peer) refresh() { | ||
logger := log.With(p.logger, "msg", "refresh") | ||
|
||
resolvedPeers, err := resolvePeers(context.Background(), p.knownPeers, p.advertiseAddr, net.Resolver{}, false) | ||
if err != nil { | ||
level.Debug(logger).Log("peers", p.knownPeers, "err", err) | ||
} | ||
|
||
members := p.mlist.Members() | ||
for _, peer := range resolvedPeers { | ||
var isPeerFound bool | ||
for _, member := range members { | ||
if member.Address() == peer { | ||
isPeerFound = true | ||
break | ||
} | ||
} | ||
|
||
if !isPeerFound { | ||
if _, err := p.mlist.Join([]string{peer}); err != nil { | ||
p.failedRefreshCounter.Inc() | ||
level.Debug(logger).Log("result", "failure", "addr", peer) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would argue that this could also be a Info or Warn. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 changed to be |
||
} else { | ||
p.refreshCounter.Inc() | ||
level.Debug(logger).Log("result", "success", "addr", peer) | ||
} | ||
} | ||
} | ||
} | ||
|
||
func (p *Peer) peerJoin(n *memberlist.Node) { | ||
p.peerLock.Lock() | ||
defer p.peerLock.Unlock() | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -162,6 +162,7 @@ func main() { | |
settleTimeout = kingpin.Flag("cluster.settle-timeout", "Maximum time to wait for cluster connections to settle before evaluating notifications.").Default(cluster.DefaultPushPullInterval.String()).Duration() | ||
reconnectInterval = kingpin.Flag("cluster.reconnect-interval", "Interval between attempting to reconnect to lost peers.").Default(cluster.DefaultReconnectInterval.String()).Duration() | ||
peerReconnectTimeout = kingpin.Flag("cluster.reconnect-timeout", "Length of time to attempt to reconnect to a lost peer.").Default(cluster.DefaultReconnectTimeout.String()).Duration() | ||
refreshInterval = kingpin.Flag("cluster.refresh-interval", "Interval between attempting to refresh cluster.peers DNS records.").Default(cluster.DefaultReconnectInterval.String()).Duration() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would a good default value be enough for now, or is a custom configuration necessary for most environments? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 I think 30s should be fine for most environments, typically alertmanger is quick to start, so IMO anything longer than that would slow down startup/gossip settling |
||
) | ||
|
||
kingpin.Version(version.Print("alertmanager")) | ||
|
@@ -196,6 +197,7 @@ func main() { | |
*probeInterval, | ||
*reconnectInterval, | ||
*peerReconnectTimeout, | ||
*refreshInterval, | ||
) | ||
if err != nil { | ||
level.Error(logger).Log("msg", "Unable to initialize gossip mesh", "err", err) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This seems to be quiet long to prevent a partition during deployment.