diff --git a/pkg/base/config.go b/pkg/base/config.go index 4797db8944a6..dba79a024e5b 100644 --- a/pkg/base/config.go +++ b/pkg/base/config.go @@ -27,7 +27,9 @@ import ( // Base config defaults. // -// When changing these, TestDefaultRaftConfig must also be updated via -rewrite. +// When changing these, TestDefaultRaftConfig must also be updated via -rewrite, +// and the result copied to the defaultRangeLeaseRaftElectionTimeoutMultiplier +// comment with any adjustments to the surrounding reasoning. const ( defaultInsecure = false defaultUser = username.RootUser @@ -54,7 +56,48 @@ const ( // defaultRangeLeaseRaftElectionTimeoutMultiplier specifies what multiple the // leader lease active duration should be of the raft election timeout. - defaultRangeLeaseRaftElectionTimeoutMultiplier = 3 + // + // Timers for Raft leadership election and lease expiration run in parallel. + // Although not required, we would like to elect a leader before the lease + // expires, such that we don't have to wait for a Raft election when we're + // ready to acquire the lease. + // + // The relevant operations and default time intervals are listed below. RTTs + // are assumed to range from 10ms to 400ms (maximum GCP inter-region latency). + // Heartbeat offsets refer to the duration from the last heartbeat to the node + // crash -- for example, with a heartbeat interval of 1s and a timeout of 3s, + // if the node crashes 1s after the previous heartbeat (just before it's about + // to heartbeat again), then the timeout will fire after 2s of unavailability + // rather than 3s, so the heartbeat offset is -1s. 
+ // + // Raft election: + // - Heartbeat offset (0-1 heartbeat interval) [-1.00s - 0.00s] + // - Election timeout (random 1x-2x timeout) [ 2.00s - 4.00s] + // - Election (3x RTT: prevote, vote, append) [ 0.03s - 1.20s] + // Total latency [ 1.03s - 5.20s] + // + // Lease acquisition: + // - Heartbeat offset (0-1 heartbeat interval) [-2.50s - 0.00s] + // - Lease expiration (constant) [ 5.00s - 5.00s] + // - Liveness epoch bump (2x RTT: CPut + append) [ 0.02s - 0.80s] + // - Lease acquisition (1x RTT: append) [ 0.01s - 0.40s] + // Total latency [ 2.53s - 6.20s] + // + // (generated by TestDefaultRaftConfig) + // + // From the above, we note that the worst-case Raft election latency + // (4.03s-5.20s) is always less than the corresponding lease expiration + + // epoch bump time (5.02s-5.80s) regardless of RTT, such that the upper bound + // on unavailability is always given by the lease expiration time + 3x RTT + // (5.03s to 6.20s). + // + // With negligible RTT, the average latency is 3.75s for lease acquisition + // (-2.5s / 2 + 5.0s) and 2.5s for Raft elections ((-1.0s + 2.0s + 4.0s) / 2). + // However, the worst-case Raft election latency (4.0s) being greater than the + // best-case lease acquisition latency (2.5s) for a given RTT will skew the + // average upwards, so we can approximate the typical unavailability to be + // roughly 4.0s (the exact calculation is left as an exercise for the reader). + defaultRangeLeaseRaftElectionTimeoutMultiplier = 2.5 // NB: this can't easily become a variable as the UI hard-codes it to 10s. // See https://github.com/cockroachdb/cockroach/issues/20310. @@ -151,10 +194,14 @@ var ( // RPCHeartbeatIntervalAndTimeout used by the RPC context. defaultRPCHeartbeatIntervalAndTimeout = NetworkTimeout - // defaultRaftElectionTimeoutTicks specifies the number of Raft Tick - // invocations that must pass between elections. + // defaultRaftElectionTimeoutTicks specifies the minimum number of Raft ticks + // before holding an election. 
It is set low by default for faster failover. + // 1 second is sufficient for a network roundtrip and retransmit even in + // multi-region clusters (see NetworkTimeout), so 2 seconds (10 ticks at the + // default 200ms tick interval) should be enough. Furthermore, the actual + // election timeout per replica is multiplied by a random factor of 1-2. defaultRaftElectionTimeoutTicks = envutil.EnvOrDefaultInt( - "COCKROACH_RAFT_ELECTION_TIMEOUT_TICKS", 15) + "COCKROACH_RAFT_ELECTION_TIMEOUT_TICKS", 10) // defaultRaftLogTruncationThreshold specifies the upper bound that a single // Range's Raft log can grow to before log truncations are triggered while at @@ -346,8 +393,9 @@ type RaftConfig struct { // RaftTickInterval is the resolution of the Raft timer. RaftTickInterval time.Duration - // RaftElectionTimeoutTicks is the number of raft ticks before the - // previous election expires. This value is inherited by individual stores + // RaftElectionTimeoutTicks is the minimum number of raft ticks before holding + // an election. Each replica randomizes its actual election timeout to between + // 1x and 2x this number of ticks. This value is inherited by individual stores // unless overridden. RaftElectionTimeoutTicks int @@ -469,15 +517,15 @@ func (cfg *RaftConfig) SetDefaults() { cfg.RaftMaxInflightMsgs = defaultRaftMaxInflightMsgs } if cfg.RaftDelaySplitToSuppressSnapshotTicks == 0 { - // The Raft Ticks interval defaults to 200ms, and an election is 15 + // The Raft Ticks interval defaults to 200ms, and an election is 10 // ticks. Add a generous amount of ticks to make sure even a backed up // Raft snapshot queue is going to make progress when a (not overly // concurrent) amount of splits happens. // The generous amount should result in a delay sufficient to // transmit at least one snapshot with the slow delay, which - // with default settings is max 64MB at 2MB/s, ie 32 seconds. + // with default settings is max 512MB at 32MB/s, ie 16 seconds. // - // The resulting delay configured here is about 50s. 
+ // The resulting delay configured here is 46s. cfg.RaftDelaySplitToSuppressSnapshotTicks = 3*cfg.RaftElectionTimeoutTicks + 200 } @@ -548,7 +596,7 @@ func (cfg RaftConfig) NodeLivenessDurations() (livenessActive, livenessRenewal t // propagate liveness. The replica which is the lease holder of the first range // gossips it. func (cfg RaftConfig) SentinelGossipTTL() time.Duration { - return cfg.RangeLeaseActiveDuration() / 2 + return cfg.RangeLeaseActiveDuration() } // DefaultRetryOptions should be used for retrying most diff --git a/pkg/base/testdata/raft_config b/pkg/base/testdata/raft_config index 08ac3af865fa..1d2ea4a1f5fb 100644 --- a/pkg/base/testdata/raft_config +++ b/pkg/base/testdata/raft_config @@ -2,9 +2,9 @@ echo ---- (base.RaftConfig) { RaftTickInterval: (time.Duration) 200ms, - RaftElectionTimeoutTicks: (int) 15, + RaftElectionTimeoutTicks: (int) 10, RaftHeartbeatIntervalTicks: (int) 5, - RangeLeaseRaftElectionTimeoutMultiplier: (float64) 3, + RangeLeaseRaftElectionTimeoutMultiplier: (float64) 2.5, RangeLeaseRenewalFraction: (float64) 0.5, RaftLogTruncationThreshold: (int64) 16777216, RaftProposalQuota: (int64) 8388608, @@ -12,11 +12,11 @@ echo RaftMaxSizePerMsg: (uint64) 32768, RaftMaxCommittedSizePerReady: (uint64) 67108864, RaftMaxInflightMsgs: (int) 128, - RaftDelaySplitToSuppressSnapshotTicks: (int) 245 + RaftDelaySplitToSuppressSnapshotTicks: (int) 230 } RaftHeartbeatInterval: 1s -RaftElectionTimeout: 3s -RangeLeaseDurations: active=9s renewal=4.5s -RangeLeaseAcquireTimeout: 6s -NodeLivenessDurations: active=9s renewal=4.5s -SentinelGossipTTL: 4.5s +RaftElectionTimeout: 2s +RangeLeaseDurations: active=5s renewal=2.5s +RangeLeaseAcquireTimeout: 4s +NodeLivenessDurations: active=5s renewal=2.5s +SentinelGossipTTL: 5s diff --git a/pkg/base/testdata/raft_config_recovery b/pkg/base/testdata/raft_config_recovery index 21486a4f5517..b907b72a0307 100644 --- a/pkg/base/testdata/raft_config_recovery +++ b/pkg/base/testdata/raft_config_recovery @@ -1,14 
+1,17 @@ +# Any changes in this result should be copied to the comment on +# defaultRangeLeaseRaftElectionTimeoutMultiplier, and the corresponding +# reasoning should be adjusted. echo ---- // Raft election: // - Heartbeat offset (0-1 heartbeat interval) [-1.00s - 0.00s] -// - Election timeout (random 1x-2x timeout) [ 3.00s - 6.00s] +// - Election timeout (random 1x-2x timeout) [ 2.00s - 4.00s] // - Election (3x RTT: prevote, vote, append) [ 0.03s - 1.20s] -// Total latency [ 2.03s - 7.20s] +// Total latency [ 1.03s - 5.20s] // // Lease acquisition: -// - Heartbeat offset (0-1 heartbeat interval) [-4.50s - 0.00s] -// - Lease expiration (constant) [ 9.00s - 9.00s] +// - Heartbeat offset (0-1 heartbeat interval) [-2.50s - 0.00s] +// - Lease expiration (constant) [ 5.00s - 5.00s] // - Liveness epoch bump (2x RTT: CPut + append) [ 0.02s - 0.80s] // - Lease acquisition (1x RTT: append) [ 0.01s - 0.40s] -// Total latency [ 4.53s -10.20s] +// Total latency [ 2.53s - 6.20s] diff --git a/pkg/kv/kvserver/store.go b/pkg/kv/kvserver/store.go index 1ca007a84542..015aa7d543a0 100644 --- a/pkg/kv/kvserver/store.go +++ b/pkg/kv/kvserver/store.go @@ -267,7 +267,7 @@ func testStoreConfig(clock *hlc.Clock, version roachpb.Version) StoreConfig { // Use shorter Raft tick settings in order to minimize start up and failover // time in tests. sc.RaftHeartbeatIntervalTicks = 1 - sc.RaftElectionTimeoutTicks = 3 + sc.RaftElectionTimeoutTicks = 2 sc.RaftTickInterval = 100 * time.Millisecond sc.SetDefaults() return sc