From 183e425ca9efaf5d3d90d209e36df09bf942e464 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Sun, 27 Nov 2022 17:56:29 +0000 Subject: [PATCH 1/4] base: add `RaftConfig.RangeLeaseAcquireTimeout()` Release note: None --- pkg/base/config.go | 11 +++++++++++ pkg/kv/kvserver/replica_range_lease.go | 6 +----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/pkg/base/config.go b/pkg/base/config.go index 462e6824ba26..8ab908ac6ead 100644 --- a/pkg/base/config.go +++ b/pkg/base/config.go @@ -520,6 +520,17 @@ func (cfg RaftConfig) RangeLeaseRenewalDuration() time.Duration { return rangeLeaseRenewal } +// RangeLeaseAcquireTimeout is the timeout for lease acquisition. +func (cfg RaftConfig) RangeLeaseAcquireTimeout() time.Duration { + // The Raft election timeout is randomized by a factor of 1-2x per replica + // (the first one will trigger the election), and reproposing the lease + // acquisition command can take up to 1 Raft election timeout. On average, we + // should be able to elect a leader and acquire a lease within 2 election + // timeouts, assuming negligible RTT; otherwise, lease acquisition will + // typically be retried, only adding a bit of tail latency. + return 2 * cfg.RaftElectionTimeout() +} + // NodeLivenessDurations computes durations for node liveness expiration and // renewal based on a default multiple of Raft election timeout. func (cfg RaftConfig) NodeLivenessDurations() (livenessActive, livenessRenewal time.Duration) { diff --git a/pkg/kv/kvserver/replica_range_lease.go b/pkg/kv/kvserver/replica_range_lease.go index 58529c99a061..7ba6e58e81f5 100644 --- a/pkg/kv/kvserver/replica_range_lease.go +++ b/pkg/kv/kvserver/replica_range_lease.go @@ -1201,13 +1201,9 @@ func (r *Replica) TestingAcquireLease(ctx context.Context) (kvserverpb.LeaseStat func (r *Replica) redirectOnOrAcquireLeaseForRequest( ctx context.Context, reqTS hlc.Timestamp, brSig signaller, ) (status kvserverpb.LeaseStatus, pErr *roachpb.Error) { - // We may need to hold a Raft election and repropose the lease acquisition - // command, which can take a couple of Raft election timeouts. - timeout := 2 * r.store.cfg.RaftElectionTimeout() - // Does not use RunWithTimeout(), because we do not want to mask the // NotLeaseHolderError on context cancellation. - ctx, cancel := context.WithTimeout(ctx, timeout) // nolint:context + ctx, cancel := context.WithTimeout(ctx, r.store.cfg.RangeLeaseAcquireTimeout()) // nolint:context defer cancel() // Try fast-path. 
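For reference, the arithmetic behind the new helper can be sketched standalone. The following is an illustration only (not code from the patch), assuming the defaults in effect at this point in the series: a 200ms Raft tick interval and 15 election-timeout ticks, which yield the 3s election timeout and 6s acquisition timeout recorded in the raft_config testdata added in the next commit.

// Illustrative sketch only: how RangeLeaseAcquireTimeout() follows from the
// Raft tick settings. Not part of the patch.
package main

import (
	"fmt"
	"time"
)

func main() {
	raftTickInterval := 200 * time.Millisecond // RaftTickInterval default
	raftElectionTimeoutTicks := 15             // RaftElectionTimeoutTicks default before PATCH 3/4

	// RaftElectionTimeout() = RaftTickInterval * RaftElectionTimeoutTicks.
	raftElectionTimeout := time.Duration(raftElectionTimeoutTicks) * raftTickInterval

	// RangeLeaseAcquireTimeout() = 2 * RaftElectionTimeout(): roughly one
	// election timeout to elect a Raft leader and one to repropose the lease
	// acquisition command.
	fmt.Println(raftElectionTimeout)     // 3s
	fmt.Println(2 * raftElectionTimeout) // 6s
}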
From dcdeb8b7b22fa7d9143ba19f6a2da551728f8ca8 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 16 Nov 2022 10:16:39 +0000 Subject: [PATCH 2/4] base: add data-driven test for default Raft config Release note: None --- go.mod | 2 +- pkg/base/BUILD.bazel | 4 + pkg/base/config.go | 2 + pkg/base/config_test.go | 124 +++++++++++++++++++++++++ pkg/base/testdata/raft_config | 22 +++++ pkg/base/testdata/raft_config_recovery | 14 +++ 6 files changed, 167 insertions(+), 1 deletion(-) create mode 100644 pkg/base/config_test.go create mode 100644 pkg/base/testdata/raft_config create mode 100644 pkg/base/testdata/raft_config_recovery diff --git a/go.mod b/go.mod index 7e9f296375a1..aad9a8d4360c 100644 --- a/go.mod +++ b/go.mod @@ -122,6 +122,7 @@ require ( github.com/containerd/containerd v1.5.4 github.com/coreos/go-oidc v2.2.1+incompatible github.com/dave/dst v0.24.0 + github.com/davecgh/go-spew v1.1.1 github.com/docker/distribution v2.7.1+incompatible github.com/docker/docker v20.10.17+incompatible github.com/docker/go-connections v0.4.0 @@ -251,7 +252,6 @@ require ( github.com/cespare/xxhash/v2 v2.1.2 // indirect github.com/charmbracelet/bubbletea v0.22.2-0.20221007125427-0e76ba142aa1 // indirect github.com/charmbracelet/lipgloss v0.6.0 // indirect - github.com/davecgh/go-spew v1.1.1 // indirect github.com/decred/dcrd/dcrec/secp256k1/v4 v4.1.0 // indirect github.com/dimchansky/utfbom v1.1.1 // indirect github.com/djherbis/atime v1.1.0 // indirect diff --git a/pkg/base/BUILD.bazel b/pkg/base/BUILD.bazel index f57a88f28068..66e619ca1730 100644 --- a/pkg/base/BUILD.bazel +++ b/pkg/base/BUILD.bazel @@ -52,21 +52,25 @@ go_test( srcs = [ "addr_validation_test.go", "cluster_id_test.go", + "config_test.go", "main_test.go", "node_id_test.go", "store_spec_test.go", ], args = ["-test.timeout=55s"], + data = glob(["testdata/**"]), deps = [ ":base", "//pkg/roachpb", "//pkg/security/securityassets", "//pkg/security/securitytest", "//pkg/testutils", + "//pkg/testutils/echotest", "//pkg/util/leaktest", "//pkg/util/log", "//pkg/util/uuid", "@com_github_cockroachdb_errors//:errors", + "@com_github_davecgh_go_spew//spew", "@com_github_stretchr_testify//require", ], ) diff --git a/pkg/base/config.go b/pkg/base/config.go index 8ab908ac6ead..4797db8944a6 100644 --- a/pkg/base/config.go +++ b/pkg/base/config.go @@ -26,6 +26,8 @@ import ( ) // Base config defaults. +// +// When changing these, TestDefaultRaftConfig must also be updated via -rewrite. const ( defaultInsecure = false defaultUser = username.RootUser diff --git a/pkg/base/config_test.go b/pkg/base/config_test.go new file mode 100644 index 000000000000..b61d77235069 --- /dev/null +++ b/pkg/base/config_test.go @@ -0,0 +1,124 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. 
+ +package base_test + +import ( + "fmt" + "testing" + "time" + + "github.com/cockroachdb/cockroach/pkg/base" + "github.com/cockroachdb/cockroach/pkg/testutils" + "github.com/cockroachdb/cockroach/pkg/testutils/echotest" + "github.com/cockroachdb/cockroach/pkg/util/leaktest" + "github.com/davecgh/go-spew/spew" +) + +func TestDefaultRaftConfig(t *testing.T) { + defer leaktest.AfterTest(t)() + + var cfg base.RaftConfig + cfg.SetDefaults() + + // Assert the config and various derived values. + leaseActive, leaseRenewal := cfg.RangeLeaseDurations() + nodeActive, nodeRenewal := cfg.NodeLivenessDurations() + raftElectionTimeout := cfg.RaftElectionTimeout() + raftHeartbeatInterval := cfg.RaftTickInterval * time.Duration(cfg.RaftHeartbeatIntervalTicks) + + { + var s string + s += spew.Sdump(cfg) + s += fmt.Sprintf("RaftHeartbeatInterval: %s\n", raftHeartbeatInterval) + s += fmt.Sprintf("RaftElectionTimeout: %s\n", raftElectionTimeout) + s += fmt.Sprintf("RangeLeaseDurations: active=%s renewal=%s\n", leaseActive, leaseRenewal) + s += fmt.Sprintf("RangeLeaseAcquireTimeout: %s\n", cfg.RangeLeaseAcquireTimeout()) + s += fmt.Sprintf("NodeLivenessDurations: active=%s renewal=%s\n", nodeActive, nodeRenewal) + s += fmt.Sprintf("SentinelGossipTTL: %s\n", cfg.SentinelGossipTTL()) + echotest.Require(t, s, testutils.TestDataPath(t, "raft_config")) + } + + // Generate and assert the derived recovery intervals. + const ( + minRTT = 10 * time.Millisecond + maxRTT = 400 * time.Millisecond // max GCP inter-region RTT is ~350ms + maxElectionMultiplier = 2 + ) + + type interval struct { + name string + min, max time.Duration + } + + formatIntervals := func(name string, intervals []interval) string { + // Format intervals and append min/max sum. + var minSum, maxSum time.Duration + var formatted []interval + for _, ival := range intervals { + ival.name = "- " + ival.name + formatted = append(formatted, ival) + minSum += ival.min + maxSum += ival.max + } + formatted = append(formatted, interval{name: "Total latency", min: minSum, max: maxSum}) + + s := "// " + name + ":\n" + for _, ival := range formatted { + s += fmt.Sprintf("// %-46s [%5.2fs -%5.2fs]\n", + ival.name, ival.min.Seconds(), ival.max.Seconds()) + } + return s + } + + var s string + s += formatIntervals("Raft election", []interval{ + { + "Heartbeat offset (0-1 heartbeat interval)", + -raftHeartbeatInterval, + 0, + }, + { + fmt.Sprintf("Election timeout (random 1x-%dx timeout)", maxElectionMultiplier), + raftElectionTimeout, + maxElectionMultiplier * raftElectionTimeout, + }, + { + "Election (3x RTT: prevote, vote, append)", + 3 * minRTT, + 3 * maxRTT, + }, + }) + s += "//\n" + s += formatIntervals("Lease acquisition", []interval{ + { + "Heartbeat offset (0-1 heartbeat interval)", + -leaseRenewal, + 0, + }, + { + "Lease expiration (constant)", + leaseActive, + leaseActive, + }, + { + "Liveness epoch bump (2x RTT: CPut + append)", + 2 * minRTT, + 2 * maxRTT, + }, + { + "Lease acquisition (1x RTT: append)", + minRTT, + maxRTT, + }, + }) + + echotest.Require(t, s, testutils.TestDataPath(t, "raft_config_recovery")) +} diff --git a/pkg/base/testdata/raft_config b/pkg/base/testdata/raft_config new file mode 100644 index 000000000000..08ac3af865fa --- /dev/null +++ b/pkg/base/testdata/raft_config @@ -0,0 +1,22 @@ +echo +---- +(base.RaftConfig) { + RaftTickInterval: (time.Duration) 200ms, + RaftElectionTimeoutTicks: (int) 15, + RaftHeartbeatIntervalTicks: (int) 5, + RangeLeaseRaftElectionTimeoutMultiplier: (float64) 3, + RangeLeaseRenewalFraction: (float64) 0.5, + 
RaftLogTruncationThreshold: (int64) 16777216,
+ RaftProposalQuota: (int64) 8388608,
+ RaftMaxUncommittedEntriesSize: (uint64) 16777216,
+ RaftMaxSizePerMsg: (uint64) 32768,
+ RaftMaxCommittedSizePerReady: (uint64) 67108864,
+ RaftMaxInflightMsgs: (int) 128,
+ RaftDelaySplitToSuppressSnapshotTicks: (int) 245
+}
+RaftHeartbeatInterval: 1s
+RaftElectionTimeout: 3s
+RangeLeaseDurations: active=9s renewal=4.5s
+RangeLeaseAcquireTimeout: 6s
+NodeLivenessDurations: active=9s renewal=4.5s
+SentinelGossipTTL: 4.5s
diff --git a/pkg/base/testdata/raft_config_recovery b/pkg/base/testdata/raft_config_recovery
new file mode 100644
index 000000000000..21486a4f5517
--- /dev/null
+++ b/pkg/base/testdata/raft_config_recovery
@@ -0,0 +1,14 @@
+echo
+----
+// Raft election:
+// - Heartbeat offset (0-1 heartbeat interval) [-1.00s - 0.00s]
+// - Election timeout (random 1x-2x timeout) [ 3.00s - 6.00s]
+// - Election (3x RTT: prevote, vote, append) [ 0.03s - 1.20s]
+// Total latency [ 2.03s - 7.20s]
+//
+// Lease acquisition:
+// - Heartbeat offset (0-1 heartbeat interval) [-4.50s - 0.00s]
+// - Lease expiration (constant) [ 9.00s - 9.00s]
+// - Liveness epoch bump (2x RTT: CPut + append) [ 0.02s - 0.80s]
+// - Lease acquisition (1x RTT: append) [ 0.01s - 0.40s]
+// Total latency [ 4.53s -10.20s]

From 09a64b5169988ff5f6fbbedfd5eae38ce092eb40 Mon Sep 17 00:00:00 2001
From: Erik Grinaker
Date: Mon, 14 Nov 2022 23:37:15 +0000
Subject: [PATCH 3/4] base: reduce lease interval and Raft election timeout

This patch reduces the lease interval from 9.0s to 5.0s and Raft election
timeout from 3.0s to 2.0s, to reduce unavailability following a leaseholder
crash. This implies that the node heartbeat interval is reduced from 4.5s to
2.5s, the lease acquisition timeout is reduced from 6.0s to 4.0s, and the
proposal timeout is reduced from 3.0s-6.0s to 2.0s-4.0s. The gossip sentinel
TTL is now tied to the lease expiration interval rather than the lease renewal
interval, and has thus changed from 4.5s to 5.0s.

There are three involved quantities:

* `defaultRaftHeartbeatIntervalTicks`: 5 (1.0s). Not changed, to avoid
  increasing Raft heartbeat costs.

* `defaultRaftElectionTimeoutTicks`: 10 (2.0s). Changed from 15 (3.0s), which
  still leaves ample headroom by the following reasoning:

  * The network round-trip time (RTT) is expected to be below 400ms
    (worst-case GCP inter-region latency is 350ms).

  * TCP prevents "dropped" heartbeats. Linux has an RTT-dependent
    retransmission timeout (RTO) which we can approximate as 1.5x RTT
    (smoothed RTT + 4x RTT variance), with a lower bound of 200ms. The
    worst-case RTO is thus 600ms, with another 0.5x RTT (200ms) for the
    retransmit to reach the peer, leaving 200ms to spare.

  * The actual election timeout per replica is multiplied by a random factor
    of 1-2, so it's likely to be significantly larger than 2.0s for a given
    follower (i.e. TCP connection).

* `defaultRangeLeaseRaftElectionTimeoutMultiplier`: 2.5 (5.0s). Changed from
  3.0 (9.0s), with the following reasoning:

  * Our target lease interval is 5.0s. Reducing the lease multiplier is better
    than reducing the election timeout further, to leave some headroom for
    Raft elections and related timeouts.

  * However, we generally want Raft elections to complete before the lease
    expires.
Since Raft elections have a random 1-2 multiplier, 2.5 ensures the nominal lease interval (5.0s) is always greater than the nominal Raft election timeout (4.0s worst-case), but the relative offset of the last Raft heartbeat (up to 1.0s) and lease extension (up to 2.5s) may cause lease acquisition to sometimes block on Raft elections, skewing the average unavailability upwards to roughly 4.0s. Roachtest benchmarks of pMax latency during leaseholder failures show a clear improvement: | Test | `master` | This PR | |---------------------------------|-----------|---------| | `failover/non-system/crash` | 14.5 s | 8.3 s | | `failover/non-system/blackhole` | 18.3 s | 14.5 s | Alternatively, keeping `defaultRangeLeaseRaftElectionTimeoutMultiplier` at 3 but reducing `defaultRaftElectionTimeoutTicks` to 8 would give a lease interval of 4.8s, with slightly lower average unavailability, but a higher probability of spurious Raft election timeouts, especially with higher RTTs. Release note (ops change): The Raft election timeout has been reduced from 3 seconds to 2 seconds, and the lease interval from 9 seconds to 5 seconds, with a corresponding reduction in the node heartbeat interval from 4.5 seconds to 2.5 seconds. This reduces the period of unavailability following leaseholder loss, but places tighter restrictions on network latencies (no more than 500ms RTT). These timeouts can be adjusted via the environment variable `COCKROACH_RAFT_ELECTION_TIMEOUT_TICKS`, which now defaults to 10 and will scale the timeouts proportionally. --- pkg/base/config.go | 70 ++++++++++++++++++++++---- pkg/base/testdata/raft_config | 16 +++--- pkg/base/testdata/raft_config_recovery | 13 +++-- pkg/kv/kvserver/store.go | 2 +- 4 files changed, 76 insertions(+), 25 deletions(-) diff --git a/pkg/base/config.go b/pkg/base/config.go index 4797db8944a6..dba79a024e5b 100644 --- a/pkg/base/config.go +++ b/pkg/base/config.go @@ -27,7 +27,9 @@ import ( // Base config defaults. // -// When changing these, TestDefaultRaftConfig must also be updated via -rewrite. +// When changing these, TestDefaultRaftConfig must also be updated via -rewrite, +// and the result copied to the defaultRangeLeaseRaftElectionTimeoutMultiplier +// comment with any adjustments to the surrounding reasoning. const ( defaultInsecure = false defaultUser = username.RootUser @@ -54,7 +56,48 @@ const ( // defaultRangeLeaseRaftElectionTimeoutMultiplier specifies what multiple the // leader lease active duration should be of the raft election timeout. - defaultRangeLeaseRaftElectionTimeoutMultiplier = 3 + // + // Timers for Raft leadership election and lease expiration run in parallel. + // Although not required, we would like to elect a leader before the lease + // expires, such that we don't have to wait for a Raft election when we're + // ready to acquire the lease. + // + // The relevant operations and default time intervals are listed below. RTTs + // are assumed to range from 10ms to 400ms (maximum GCP inter-region latency). + // Heartbeat offsets refer to the duration from the last heartbeat to the node + // crash -- for example, with a heartbeat interval of 1s and a timeout of 3s, + // if the node crashes 1s after the previous heartbeat (just before it's about + // to heartbeat again), then the timeout will fire after 2s of unavailability + // rather than 3s, so the heartbeat offset is -1s. 
+ // + // Raft election: + // - Heartbeat offset (0-1 heartbeat interval) [-1.00s - 0.00s] + // - Election timeout (random 1x-2x timeout) [ 2.00s - 4.00s] + // - Election (3x RTT: prevote, vote, append) [ 0.03s - 1.20s] + // Total latency [ 1.03s - 5.20s] + // + // Lease acquisition: + // - Heartbeat offset (0-1 heartbeat interval) [-2.50s - 0.00s] + // - Lease expiration (constant) [ 5.00s - 5.00s] + // - Liveness epoch bump (2x RTT: CPut + append) [ 0.02s - 0.80s] + // - Lease acquisition (1x RTT: append) [ 0.01s - 0.40s] + // Total latency [ 2.53s - 6.20s] + // + // (generated by TestDefaultRaftConfig) + // + // From the above, we note that the worst-case Raft election latency + // (4.03s-5.20s) is always less than the corresponding lease expiration + + // epoch bump time (5.02s-5.80s) regardless of RTT, such that the upper bound + // on unavailability is always given by the lease expiration time + 3x RTT + // (5.03s to 6.20s). + // + // With negligible RTT, the average latency is 3.75s for lease acquisition + // (-2.5s / 2 + 5.0s) and 2.5s for Raft elections ((-1.0s + 2.0s + 4.0s) / 2). + // However, the worst-case Raft election latency (4.0s) being greater than the + // best-case lease acquisition latency (2.5s) for a given RTT will skew the + // average upwards, so we can approximate the typical unavailability to be + // roughly 4.0s (the exact calculation is left as an exercise for the reader). + defaultRangeLeaseRaftElectionTimeoutMultiplier = 2.5 // NB: this can't easily become a variable as the UI hard-codes it to 10s. // See https://github.com/cockroachdb/cockroach/issues/20310. @@ -151,10 +194,14 @@ var ( // RPCHeartbeatIntervalAndTimeout used by the RPC context. defaultRPCHeartbeatIntervalAndTimeout = NetworkTimeout - // defaultRaftElectionTimeoutTicks specifies the number of Raft Tick - // invocations that must pass between elections. + // defaultRaftElectionTimeoutTicks specifies the minimum number of Raft ticks + // before holding an election. It is set low by default for faster failover. + // 1 second is sufficient for a network roundtrip and retransmit even in + // multi-region clusters (see NetworkTimeout), so 2 seconds should be enough. + // Furthermore, the actual election timeout per replica is multiplied by a + // random factor of 1-2. defaultRaftElectionTimeoutTicks = envutil.EnvOrDefaultInt( - "COCKROACH_RAFT_ELECTION_TIMEOUT_TICKS", 15) + "COCKROACH_RAFT_ELECTION_TIMEOUT_TICKS", 10) // defaultRaftLogTruncationThreshold specifies the upper bound that a single // Range's Raft log can grow to before log truncations are triggered while at @@ -346,8 +393,9 @@ type RaftConfig struct { // RaftTickInterval is the resolution of the Raft timer. RaftTickInterval time.Duration - // RaftElectionTimeoutTicks is the number of raft ticks before the - // previous election expires. This value is inherited by individual stores + // RaftElectionTimeoutTicks is the minimum number of raft ticks before holding + // an election. The actual election timeout is randomized by each replica to + // between 1-2 election timeouts. This value is inherited by individual stores // unless overridden. RaftElectionTimeoutTicks int @@ -469,15 +517,15 @@ func (cfg *RaftConfig) SetDefaults() { cfg.RaftMaxInflightMsgs = defaultRaftMaxInflightMsgs } if cfg.RaftDelaySplitToSuppressSnapshotTicks == 0 { - // The Raft Ticks interval defaults to 200ms, and an election is 15 + // The Raft Ticks interval defaults to 200ms, and an election is 10 // ticks. 
Add a generous amount of ticks to make sure even a backed up // Raft snapshot queue is going to make progress when a (not overly // concurrent) amount of splits happens. // The generous amount should result in a delay sufficient to // transmit at least one snapshot with the slow delay, which - // with default settings is max 64MB at 2MB/s, ie 32 seconds. + // with default settings is max 512MB at 32MB/s, ie 16 seconds. // - // The resulting delay configured here is about 50s. + // The resulting delay configured here is 46s. cfg.RaftDelaySplitToSuppressSnapshotTicks = 3*cfg.RaftElectionTimeoutTicks + 200 } @@ -548,7 +596,7 @@ func (cfg RaftConfig) NodeLivenessDurations() (livenessActive, livenessRenewal t // propagate liveness. The replica which is the lease holder of the first range // gossips it. func (cfg RaftConfig) SentinelGossipTTL() time.Duration { - return cfg.RangeLeaseActiveDuration() / 2 + return cfg.RangeLeaseActiveDuration() } // DefaultRetryOptions should be used for retrying most diff --git a/pkg/base/testdata/raft_config b/pkg/base/testdata/raft_config index 08ac3af865fa..1d2ea4a1f5fb 100644 --- a/pkg/base/testdata/raft_config +++ b/pkg/base/testdata/raft_config @@ -2,9 +2,9 @@ echo ---- (base.RaftConfig) { RaftTickInterval: (time.Duration) 200ms, - RaftElectionTimeoutTicks: (int) 15, + RaftElectionTimeoutTicks: (int) 10, RaftHeartbeatIntervalTicks: (int) 5, - RangeLeaseRaftElectionTimeoutMultiplier: (float64) 3, + RangeLeaseRaftElectionTimeoutMultiplier: (float64) 2.5, RangeLeaseRenewalFraction: (float64) 0.5, RaftLogTruncationThreshold: (int64) 16777216, RaftProposalQuota: (int64) 8388608, @@ -12,11 +12,11 @@ echo RaftMaxSizePerMsg: (uint64) 32768, RaftMaxCommittedSizePerReady: (uint64) 67108864, RaftMaxInflightMsgs: (int) 128, - RaftDelaySplitToSuppressSnapshotTicks: (int) 245 + RaftDelaySplitToSuppressSnapshotTicks: (int) 230 } RaftHeartbeatInterval: 1s -RaftElectionTimeout: 3s -RangeLeaseDurations: active=9s renewal=4.5s -RangeLeaseAcquireTimeout: 6s -NodeLivenessDurations: active=9s renewal=4.5s -SentinelGossipTTL: 4.5s +RaftElectionTimeout: 2s +RangeLeaseDurations: active=5s renewal=2.5s +RangeLeaseAcquireTimeout: 4s +NodeLivenessDurations: active=5s renewal=2.5s +SentinelGossipTTL: 5s diff --git a/pkg/base/testdata/raft_config_recovery b/pkg/base/testdata/raft_config_recovery index 21486a4f5517..b907b72a0307 100644 --- a/pkg/base/testdata/raft_config_recovery +++ b/pkg/base/testdata/raft_config_recovery @@ -1,14 +1,17 @@ +# Any changes in this result should be copied to the comment on +# defaultRangeLeaseRaftElectionTimeoutMultiplier, and the corresponding +# reasoning should be adjusted. 
echo
----
// Raft election:
// - Heartbeat offset (0-1 heartbeat interval) [-1.00s - 0.00s]
-// - Election timeout (random 1x-2x timeout) [ 3.00s - 6.00s]
+// - Election timeout (random 1x-2x timeout) [ 2.00s - 4.00s]
// - Election (3x RTT: prevote, vote, append) [ 0.03s - 1.20s]
-// Total latency [ 2.03s - 7.20s]
+// Total latency [ 1.03s - 5.20s]
//
// Lease acquisition:
-// - Heartbeat offset (0-1 heartbeat interval) [-4.50s - 0.00s]
-// - Lease expiration (constant) [ 9.00s - 9.00s]
+// - Heartbeat offset (0-1 heartbeat interval) [-2.50s - 0.00s]
+// - Lease expiration (constant) [ 5.00s - 5.00s]
// - Liveness epoch bump (2x RTT: CPut + append) [ 0.02s - 0.80s]
// - Lease acquisition (1x RTT: append) [ 0.01s - 0.40s]
-// Total latency [ 4.53s -10.20s]
+// Total latency [ 2.53s - 6.20s]
diff --git a/pkg/kv/kvserver/store.go b/pkg/kv/kvserver/store.go
index 1ca007a84542..015aa7d543a0 100644
--- a/pkg/kv/kvserver/store.go
+++ b/pkg/kv/kvserver/store.go
@@ -267,7 +267,7 @@ func testStoreConfig(clock *hlc.Clock, version roachpb.Version) StoreConfig {
// Use shorter Raft tick settings in order to minimize start up and failover
// time in tests.
sc.RaftHeartbeatIntervalTicks = 1
- sc.RaftElectionTimeoutTicks = 3
+ sc.RaftElectionTimeoutTicks = 2
sc.RaftTickInterval = 100 * time.Millisecond
sc.SetDefaults()
return sc

From 6afd5c9491eac304a62b94964394b6df6c42cb29 Mon Sep 17 00:00:00 2001
From: Erik Grinaker
Date: Mon, 5 Dec 2022 20:33:17 +0000
Subject: [PATCH 4/4] base: use separate lease duration for liveness range

This patch adds a separate lease duration for the liveness range. When the
liveness range leaseholder is lost, other nodes are unable to heartbeat and
extend their leases, which can lead to the loss of most leases in the system.
Shortening the liveness lease duration shortens the period of unavailability
for other leases.

Release note (ops change): The duration of the liveness range lease has been
reduced to 2.5s, half of the regular lease duration. When the liveness range
leaseholder is lost (e.g. due to an infrastructure outage), other nodes are
unable to heartbeat and thus extend their own leases, which can lead to the
loss of many of the leases throughout the cluster. Reducing the liveness lease
duration reduces the period of unavailability.
---
pkg/base/config.go | 42 ++++++++++++++++++++++++++
pkg/base/config_test.go | 3 ++
pkg/base/testdata/raft_config | 3 ++
pkg/kv/kvserver/replica_range_lease.go | 31 ++++++++++++++++---
4 files changed, 74 insertions(+), 5 deletions(-)

diff --git a/pkg/base/config.go b/pkg/base/config.go
index dba79a024e5b..99573dde02b3 100644
--- a/pkg/base/config.go
+++ b/pkg/base/config.go
@@ -114,6 +114,18 @@ const (
// eagerly renewed 8 seconds into each lease.
defaultRangeLeaseRenewalFraction = 0.5
+ // defaultLivenessRangeLeaseActiveMultiplier specifies the liveness range
+ // lease active duration as a multiple of the standard range lease active
+ // duration. When the liveness leaseholder is lost, other nodes will be
+ // unable to heartbeat and extend their own leases, which can cause a loss
+ // of all leases in the cluster. It is set to half of the regular lease
+ // active duration to counteract this.
+ defaultLivenessRangeLeaseActiveMultiplier = 0.5
+
+ // defaultLivenessRangeLeaseRenewalFraction is like
+ // defaultRangeLeaseRenewalFraction, but for the liveness range lease.
+ defaultLivenessRangeLeaseRenewalFraction = 0.6
+
// livenessRenewalFraction specifies what fraction the node liveness
// renewal duration should be of the node liveness duration. For example,
// with a value of 0.2 and a liveness duration of 10 seconds, each node's
@@ -412,6 +424,17 @@ type RaftConfig struct {
// and a value of -1 means never preemptively renew the lease. A value of 1
// means always renew.
RangeLeaseRenewalFraction float64
+ // LivenessRangeLeaseActiveMultiplier specifies what multiple the liveness
+ // range active duration should be of the regular lease active duration. When
+ // the liveness leaseholder is lost, other nodes are unable to heartbeat and
+ // extend their epoch-based leases, which can cascade into a loss of all
+ // leases in the cluster, so this multiplier should be set below 1.
+ LivenessRangeLeaseActiveMultiplier float64
+ // LivenessRangeLeaseRenewalFraction is like RangeLeaseRenewalFraction but for
+ // the liveness range lease. It is set to 0.6 such that, in the default
+ // configuration, a 2.5s liveness lease is renewed every 1.0s, giving it 1.0s
+ // to succeed before entering the 0.5s stasis period.
+ LivenessRangeLeaseRenewalFraction float64
// RaftLogTruncationThreshold controls how large a single Range's Raft log
// can grow. When a Range's Raft log grows above this size, the Range will
@@ -486,6 +509,12 @@ func (cfg *RaftConfig) SetDefaults() {
if cfg.RangeLeaseRenewalFraction == 0 {
cfg.RangeLeaseRenewalFraction = defaultRangeLeaseRenewalFraction
}
+ if cfg.LivenessRangeLeaseActiveMultiplier == 0 {
+ cfg.LivenessRangeLeaseActiveMultiplier = defaultLivenessRangeLeaseActiveMultiplier
+ }
+ if cfg.LivenessRangeLeaseRenewalFraction == 0 {
+ cfg.LivenessRangeLeaseRenewalFraction = defaultLivenessRangeLeaseRenewalFraction
+ }
// TODO(andrei): -1 is a special value for RangeLeaseRenewalFraction which
// really means "0" (never renew), except that the zero value means "use
// default". We can't turn the -1 into 0 here because, unfortunately,
@@ -581,6 +610,19 @@ func (cfg RaftConfig) RangeLeaseAcquireTimeout() time.Duration {
return 2 * cfg.RaftElectionTimeout()
}
+// LivenessRangeLeaseDurations computes durations for range lease expiration
+// and renewal for the liveness range lease.
+func (cfg RaftConfig) LivenessRangeLeaseDurations() (active, renewal time.Duration) {
+ active = time.Duration(cfg.LivenessRangeLeaseActiveMultiplier *
+ float64(cfg.RangeLeaseActiveDuration()))
+ renewalFraction := cfg.LivenessRangeLeaseRenewalFraction
+ if renewalFraction == -1 {
+ renewalFraction = 0
+ }
+ renewal = time.Duration(float64(active) * renewalFraction)
+ return
+}
+
// NodeLivenessDurations computes durations for node liveness expiration and
// renewal based on a default multiple of Raft election timeout.
func (cfg RaftConfig) NodeLivenessDurations() (livenessActive, livenessRenewal time.Duration) {
diff --git a/pkg/base/config_test.go b/pkg/base/config_test.go
index b61d77235069..a93bc5519e91 100644
--- a/pkg/base/config_test.go
+++ b/pkg/base/config_test.go
@@ -30,6 +30,7 @@ func TestDefaultRaftConfig(t *testing.T) {
// Assert the config and various derived values.
leaseActive, leaseRenewal := cfg.RangeLeaseDurations() + livenessActive, livenessRenewal := cfg.LivenessRangeLeaseDurations() nodeActive, nodeRenewal := cfg.NodeLivenessDurations() raftElectionTimeout := cfg.RaftElectionTimeout() raftHeartbeatInterval := cfg.RaftTickInterval * time.Duration(cfg.RaftHeartbeatIntervalTicks) @@ -41,6 +42,8 @@ func TestDefaultRaftConfig(t *testing.T) { s += fmt.Sprintf("RaftElectionTimeout: %s\n", raftElectionTimeout) s += fmt.Sprintf("RangeLeaseDurations: active=%s renewal=%s\n", leaseActive, leaseRenewal) s += fmt.Sprintf("RangeLeaseAcquireTimeout: %s\n", cfg.RangeLeaseAcquireTimeout()) + s += fmt.Sprintf("LivenessRangeLeaseDurations: active=%s renewal=%s\n", + livenessActive, livenessRenewal) s += fmt.Sprintf("NodeLivenessDurations: active=%s renewal=%s\n", nodeActive, nodeRenewal) s += fmt.Sprintf("SentinelGossipTTL: %s\n", cfg.SentinelGossipTTL()) echotest.Require(t, s, testutils.TestDataPath(t, "raft_config")) diff --git a/pkg/base/testdata/raft_config b/pkg/base/testdata/raft_config index 1d2ea4a1f5fb..afe0019c5078 100644 --- a/pkg/base/testdata/raft_config +++ b/pkg/base/testdata/raft_config @@ -6,6 +6,8 @@ echo RaftHeartbeatIntervalTicks: (int) 5, RangeLeaseRaftElectionTimeoutMultiplier: (float64) 2.5, RangeLeaseRenewalFraction: (float64) 0.5, + LivenessRangeLeaseActiveMultiplier: (float64) 0.5, + LivenessRangeLeaseRenewalFraction: (float64) 0.6, RaftLogTruncationThreshold: (int64) 16777216, RaftProposalQuota: (int64) 8388608, RaftMaxUncommittedEntriesSize: (uint64) 16777216, @@ -18,5 +20,6 @@ RaftHeartbeatInterval: 1s RaftElectionTimeout: 2s RangeLeaseDurations: active=5s renewal=2.5s RangeLeaseAcquireTimeout: 4s +LivenessRangeLeaseDurations: active=2.5s renewal=1.5s NodeLivenessDurations: active=5s renewal=2.5s SentinelGossipTTL: 5s diff --git a/pkg/kv/kvserver/replica_range_lease.go b/pkg/kv/kvserver/replica_range_lease.go index 7ba6e58e81f5..04d36de6e228 100644 --- a/pkg/kv/kvserver/replica_range_lease.go +++ b/pkg/kv/kvserver/replica_range_lease.go @@ -273,8 +273,14 @@ func (p *pendingLeaseRequest) InitOrJoinRequest( // based leases, it's possible for the new leaseholder that's delayed // in applying the lease transfer to maintain its lease (assuming the // node it's on is able to heartbeat its liveness record). + var active time.Duration + if p.repl.isLivenessRangeRLocked() { + active, _ = p.repl.store.cfg.LivenessRangeLeaseDurations() + } else { + active = p.repl.store.cfg.RangeLeaseActiveDuration() + } reqLease.Expiration = &hlc.Timestamp{} - *reqLease.Expiration = status.Now.ToTimestamp().Add(int64(p.repl.store.cfg.RangeLeaseActiveDuration()), 0) + *reqLease.Expiration = status.Now.ToTimestamp().Add(active.Nanoseconds(), 0) } else { // Get the liveness for the next lease holder and set the epoch in the lease request. l, ok := p.repl.store.cfg.NodeLiveness.GetLiveness(nextLeaseHolder.NodeID) @@ -803,6 +809,13 @@ func (r *Replica) requiresExpiringLeaseRLocked() bool { r.mu.state.Desc.StartKey.Less(roachpb.RKey(keys.NodeLivenessKeyMax)) } +// isLivenessRange returns true if this range contains the node liveness data. +func (r *Replica) isLivenessRangeRLocked() bool { + return r.store.cfg.NodeLiveness != nil && + r.mu.state.Desc.StartKey.Compare(roachpb.RKey(keys.NodeLivenessKeyMax)) < 0 && + r.mu.state.Desc.EndKey.Compare(roachpb.RKey(keys.NodeLivenessPrefix)) > 0 +} + // requestLeaseLocked executes a request to obtain or extend a lease // asynchronously and returns a channel on which the result will be posted. 
If // there's already a request in progress, we join in waiting for the results of @@ -1032,10 +1045,12 @@ func NewLeaseTransferRejectedBecauseTargetMayNeedSnapshotError( // lease's expiration (and stasis period). func (r *Replica) checkRequestTimeRLocked(now hlc.ClockTimestamp, reqTS hlc.Timestamp) error { var leaseRenewal time.Duration - if r.requiresExpiringLeaseRLocked() { - _, leaseRenewal = r.store.cfg.RangeLeaseDurations() - } else { + if !r.requiresExpiringLeaseRLocked() { _, leaseRenewal = r.store.cfg.NodeLivenessDurations() + } else if r.isLivenessRangeRLocked() { + _, leaseRenewal = r.store.cfg.LivenessRangeLeaseDurations() + } else { + _, leaseRenewal = r.store.cfg.RangeLeaseDurations() } leaseRenewalMinusStasis := leaseRenewal - r.store.Clock().MaxOffset() if leaseRenewalMinusStasis < 0 { @@ -1419,7 +1434,13 @@ func (r *Replica) shouldExtendLeaseRLocked(st kvserverpb.LeaseStatus) bool { if _, ok := r.mu.pendingLeaseRequest.RequestPending(); ok { return false } - renewal := st.Lease.Expiration.Add(-r.store.cfg.RangeLeaseRenewalDuration().Nanoseconds(), 0) + var renewalDuration time.Duration + if r.isLivenessRangeRLocked() { + _, renewalDuration = r.store.cfg.LivenessRangeLeaseDurations() + } else { + renewalDuration = r.store.cfg.RangeLeaseRenewalDuration() + } + renewal := st.Lease.Expiration.Add(-renewalDuration.Nanoseconds(), 0) return renewal.LessEq(st.Now.ToTimestamp()) }
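As a closing illustration, the liveness-range lease arithmetic introduced by this last patch can be sketched standalone. This is an illustration only (not code from the patch), using the defaults recorded in the raft_config testdata: a 5s regular lease, a 0.5 liveness multiplier, and a 0.6 renewal fraction.

// Illustrative sketch only: the liveness range lease durations that result
// from the new defaults. Not part of the patch.
package main

import (
	"fmt"
	"time"
)

func main() {
	rangeLeaseActive := 5 * time.Second // regular range lease active duration
	livenessMultiplier := 0.5           // LivenessRangeLeaseActiveMultiplier default
	renewalFraction := 0.6              // LivenessRangeLeaseRenewalFraction default

	// LivenessRangeLeaseDurations(): active = multiplier * regular active
	// duration, renewal = fraction * active duration.
	active := time.Duration(livenessMultiplier * float64(rangeLeaseActive))
	renewal := time.Duration(renewalFraction * float64(active))
	fmt.Println(active, renewal) // 2.5s 1.5s

	// Eager renewal starts once `renewal` time remains on the lease, i.e. 1s
	// into a 2.5s lease, leaving 1s to succeed before the 0.5s stasis period
	// (clock max offset) begins.
	fmt.Println(active - renewal) // 1s
}

This matches the LivenessRangeLeaseDurations line in the raft_config testdata (active=2.5s renewal=1.5s).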