From fe9bff5431963247de27fbfd2b57f2483bc8c34b Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Mon, 26 Feb 2018 13:38:40 -0800 Subject: [PATCH] etcdserver: adjust tick advance on restart Signed-off-by: Gyuho Lee --- etcdserver/raft.go | 12 ----------- etcdserver/server.go | 48 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 12 deletions(-) diff --git a/etcdserver/raft.go b/etcdserver/raft.go index 9695da80ea5e..40afbb9ce2f6 100644 --- a/etcdserver/raft.go +++ b/etcdserver/raft.go @@ -370,16 +370,6 @@ func (r *raftNode) resumeSending() { p.Resume() } -// advanceTicksForElection advances ticks to the node for fast election. -// This reduces the time to wait for first leader election if bootstrapping the whole -// cluster, while leaving at least 1 heartbeat for possible existing leader -// to contact it. -func advanceTicksForElection(n raft.Node, electionTicks int) { - for i := 0; i < electionTicks-1; i++ { - n.Tick() - } -} - func startNode(cfg ServerConfig, cl *membership.RaftCluster, ids []types.ID) (id types.ID, n raft.Node, s *raft.MemoryStorage, w *wal.WAL) { var err error member := cl.MemberByName(cfg.Name) @@ -417,7 +407,6 @@ func startNode(cfg ServerConfig, cl *membership.RaftCluster, ids []types.ID) (id raftStatusMu.Lock() raftStatus = n.Status raftStatusMu.Unlock() - advanceTicksForElection(n, c.ElectionTick) return id, n, s, w } @@ -451,7 +440,6 @@ func restartNode(cfg ServerConfig, snapshot *raftpb.Snapshot) (types.ID, *member raftStatusMu.Lock() raftStatus = n.Status raftStatusMu.Unlock() - advanceTicksForElection(n, c.ElectionTick) return id, cl, n, s, w } diff --git a/etcdserver/server.go b/etcdserver/server.go index 79d17425871f..9192a73f3c19 100644 --- a/etcdserver/server.go +++ b/etcdserver/server.go @@ -300,6 +300,7 @@ func NewServer(cfg ServerConfig) (srv *EtcdServer, err error) { snapshot *raftpb.Snapshot ) + restart := false switch { case !haveWAL && !cfg.NewCluster: if err = cfg.VerifyJoinExisting(); err != nil { 
@@ -388,6 +389,7 @@ func NewServer(cfg ServerConfig) (srv *EtcdServer, err error) { cfg.Print() if !cfg.ForceNewCluster { id, cl, n, s, w = restartNode(cfg, snapshot) + restart = true } else { id, cl, n, s, w = restartAsStandaloneNode(cfg, snapshot) } @@ -518,9 +520,55 @@ func NewServer(cfg ServerConfig) (srv *EtcdServer, err error) { } srv.r.transport = tr + activePeers := 0 + for _, m := range cl.Members() { + if m.ID != id { + if tr.IsActive(m.ID) { + activePeers++ + } + } + } + + clusterN := len(cl.Members()) + plog.Infof("%s has %d active peers among %d found member(s)", srv.ID(), activePeers, clusterN) + + // only advance ticks if starting a fresh cluster + // and if single-node cluster, or peer connection + // has already been established (activePeers > 0) + if !restart && (clusterN == 1 || activePeers == 0) { + // save one tick in case leader node comes + // in with heartbeat before timeout + tick := cfg.ElectionTicks - 1 + plog.Infof("%s is advancing %d ticks for faster election (election tick %d)", srv.ID(), tick, cfg.ElectionTicks) + advanceTicksForElection(n, tick) + } else { + // on restart, there is likely an active peer already + // with an active leader; adjust ticks to advance + // in case leader heartbeats arrive and restarted + // node can revert back to follower + // otherwise, restarted follower can disrupt + // current cluster by starting an election + // with higher term + tick := cfg.ElectionTicks / 10 // default election tick is 10 (1s) + plog.Infof("%s is advancing %d ticks (election tick %d)", srv.ID(), tick, cfg.ElectionTicks) + advanceTicksForElection(n, tick) + } + return srv, nil } +// advanceTicksForElection advances ticks to the node +// for fast election. This reduces the time to wait +// for first leader election if bootstrapping the whole +// cluster (especially cross data-center deployments), +// while leaving a few heartbeats for possible existing +// leader to contact it. 
+func advanceTicksForElection(n raft.Node, ticks int) { + for i := 0; i < ticks; i++ { + n.Tick() + } +} + // Start performs any initialization of the Server necessary for it to // begin serving requests. It must be called before Do or Process. // Start must be non-blocking; any long-running server functionality