diff --git a/go.mod b/go.mod
index 7e9f296375a1..aad9a8d4360c 100644
--- a/go.mod
+++ b/go.mod
@@ -122,6 +122,7 @@ require (
 	github.com/containerd/containerd v1.5.4
 	github.com/coreos/go-oidc v2.2.1+incompatible
 	github.com/dave/dst v0.24.0
+	github.com/davecgh/go-spew v1.1.1
 	github.com/docker/distribution v2.7.1+incompatible
 	github.com/docker/docker v20.10.17+incompatible
 	github.com/docker/go-connections v0.4.0
@@ -251,7 +252,6 @@ require (
 	github.com/cespare/xxhash/v2 v2.1.2 // indirect
 	github.com/charmbracelet/bubbletea v0.22.2-0.20221007125427-0e76ba142aa1 // indirect
 	github.com/charmbracelet/lipgloss v0.6.0 // indirect
-	github.com/davecgh/go-spew v1.1.1 // indirect
 	github.com/decred/dcrd/dcrec/secp256k1/v4 v4.1.0 // indirect
 	github.com/dimchansky/utfbom v1.1.1 // indirect
 	github.com/djherbis/atime v1.1.0 // indirect
diff --git a/pkg/base/BUILD.bazel b/pkg/base/BUILD.bazel
index f57a88f28068..66e619ca1730 100644
--- a/pkg/base/BUILD.bazel
+++ b/pkg/base/BUILD.bazel
@@ -52,21 +52,25 @@ go_test(
     srcs = [
         "addr_validation_test.go",
         "cluster_id_test.go",
+        "config_test.go",
         "main_test.go",
         "node_id_test.go",
         "store_spec_test.go",
     ],
     args = ["-test.timeout=55s"],
+    data = glob(["testdata/**"]),
     deps = [
         ":base",
         "//pkg/roachpb",
         "//pkg/security/securityassets",
         "//pkg/security/securitytest",
         "//pkg/testutils",
+        "//pkg/testutils/echotest",
         "//pkg/util/leaktest",
         "//pkg/util/log",
         "//pkg/util/uuid",
         "@com_github_cockroachdb_errors//:errors",
+        "@com_github_davecgh_go_spew//spew",
         "@com_github_stretchr_testify//require",
     ],
 )
diff --git a/pkg/base/config.go b/pkg/base/config.go
index 462e6824ba26..99573dde02b3 100644
--- a/pkg/base/config.go
+++ b/pkg/base/config.go
@@ -26,6 +26,10 @@ import (
 )

 // Base config defaults.
+//
+// When changing these, TestDefaultRaftConfig must also be updated via -rewrite,
+// and the result copied to the defaultRangeLeaseRaftElectionTimeoutMultiplier
+// comment with any adjustments to the surrounding reasoning.
 const (
 	defaultInsecure = false
 	defaultUser     = username.RootUser
@@ -52,7 +56,48 @@ const (

 	// defaultRangeLeaseRaftElectionTimeoutMultiplier specifies what multiple the
 	// leader lease active duration should be of the raft election timeout.
-	defaultRangeLeaseRaftElectionTimeoutMultiplier = 3
+	//
+	// Timers for Raft leadership election and lease expiration run in parallel.
+	// Although not required, we would like to elect a leader before the lease
+	// expires, such that we don't have to wait for a Raft election when we're
+	// ready to acquire the lease.
+	//
+	// The relevant operations and default time intervals are listed below. RTTs
+	// are assumed to range from 10ms to 400ms (maximum GCP inter-region latency).
+	// Heartbeat offsets refer to the duration from the last heartbeat to the node
+	// crash -- for example, with a heartbeat interval of 1s and a timeout of 3s,
+	// if the node crashes 1s after the previous heartbeat (just before it's about
+	// to heartbeat again), then the timeout will fire after 2s of unavailability
+	// rather than 3s, so the heartbeat offset is -1s.
+	//
+	// Raft election:
+	// - Heartbeat offset (0-1 heartbeat interval)    [-1.00s - 0.00s]
+	// - Election timeout (random 1x-2x timeout)      [ 2.00s - 4.00s]
+	// - Election (3x RTT: prevote, vote, append)     [ 0.03s - 1.20s]
+	// Total latency                                  [ 1.03s - 5.20s]
+	//
+	// Lease acquisition:
+	// - Heartbeat offset (0-1 heartbeat interval)    [-2.50s - 0.00s]
+	// - Lease expiration (constant)                  [ 5.00s - 5.00s]
+	// - Liveness epoch bump (2x RTT: CPut + append)  [ 0.02s - 0.80s]
+	// - Lease acquisition (1x RTT: append)           [ 0.01s - 0.40s]
+	// Total latency                                  [ 2.53s - 6.20s]
+	//
+	// (generated by TestDefaultRaftConfig)
+	//
+	// From the above, we note that the worst-case Raft election latency
+	// (4.03s-5.20s) is always less than the corresponding lease expiration +
+	// epoch bump time (5.02s-5.80s) regardless of RTT, such that the upper bound
+	// on unavailability is always given by the lease expiration time + 3x RTT
+	// (5.03s to 6.20s).
+	//
+	// With negligible RTT, the average latency is 3.75s for lease acquisition
+	// (-2.5s / 2 + 5.0s) and 2.5s for Raft elections ((-1.0s + 2.0s + 4.0s) / 2).
+	// However, the worst-case Raft election latency (4.0s) being greater than the
+	// best-case lease acquisition latency (2.5s) for a given RTT will skew the
+	// average upwards, so we can approximate the typical unavailability to be
+	// roughly 4.0s (the exact calculation is left as an exercise for the reader).
+	defaultRangeLeaseRaftElectionTimeoutMultiplier = 2.5

 	// NB: this can't easily become a variable as the UI hard-codes it to 10s.
 	// See https://github.com/cockroachdb/cockroach/issues/20310.
@@ -69,6 +114,18 @@ const (
 	// eagerly renewed 8 seconds into each lease.
 	defaultRangeLeaseRenewalFraction = 0.5

+	// defaultLivenessRangeLeaseActiveMultiplier specifies the liveness range
+	// lease active duration as a multiple of the standard range lease active
+	// duration. When the liveness leaseholder is lost, other nodes will be
+	// unable to heartbeat and extend their own leases, which can cause a loss
+	// of all leases in the cluster. It is set to half of the regular lease
+	// active duration to counteract this.
+	defaultLivenessRangeLeaseActiveMultiplier = 0.5
+
+	// defaultLivenessRangeLeaseRenewalFraction is like
+	// defaultRangeLeaseRenewalFraction, but for the liveness range lease.
+	defaultLivenessRangeLeaseRenewalFraction = 0.6
+
 	// livenessRenewalFraction specifies what fraction the node liveness
 	// renewal duration should be of the node liveness duration. For example,
 	// with a value of 0.2 and a liveness duration of 10 seconds, each node's
@@ -149,10 +206,14 @@ var (
 	// RPCHeartbeatIntervalAndTimeout used by the RPC context.
 	defaultRPCHeartbeatIntervalAndTimeout = NetworkTimeout

-	// defaultRaftElectionTimeoutTicks specifies the number of Raft Tick
-	// invocations that must pass between elections.
+	// defaultRaftElectionTimeoutTicks specifies the minimum number of Raft ticks
+	// before holding an election. It is set low by default for faster failover.
+	// 1 second is sufficient for a network roundtrip and retransmit even in
+	// multi-region clusters (see NetworkTimeout), so 2 seconds should be enough.
+	// Furthermore, the actual election timeout per replica is multiplied by a
+	// random factor of 1-2.
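+	// With the default 200ms tick interval, 10 ticks yield a 2s base election
+	// timeout, and the 1x-2x randomization puts the worst case at 4s.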
 	defaultRaftElectionTimeoutTicks = envutil.EnvOrDefaultInt(
-		"COCKROACH_RAFT_ELECTION_TIMEOUT_TICKS", 15)
+		"COCKROACH_RAFT_ELECTION_TIMEOUT_TICKS", 10)

 	// defaultRaftLogTruncationThreshold specifies the upper bound that a single
 	// Range's Raft log can grow to before log truncations are triggered while at
@@ -344,8 +405,9 @@ type RaftConfig struct {
 	// RaftTickInterval is the resolution of the Raft timer.
 	RaftTickInterval time.Duration

-	// RaftElectionTimeoutTicks is the number of raft ticks before the
-	// previous election expires. This value is inherited by individual stores
+	// RaftElectionTimeoutTicks is the minimum number of raft ticks before holding
+	// an election. The actual election timeout is randomized by each replica to
+	// between 1-2 election timeouts. This value is inherited by individual stores
 	// unless overridden.
 	RaftElectionTimeoutTicks int

@@ -362,6 +424,17 @@ type RaftConfig struct {
 	// and a value of -1 means never preemptively renew the lease. A value of 1
 	// means always renew.
 	RangeLeaseRenewalFraction float64
+	// LivenessRangeLeaseActiveMultiplier specifies what multiple the liveness
+	// range lease active duration should be of the regular lease active
+	// duration. When the liveness leaseholder is lost, other nodes are unable
+	// to heartbeat and extend their epoch-based leases, which can cascade into
+	// a loss of all leases in the cluster, so this multiplier should be less
+	// than 1 to let the liveness lease recover first.
+	LivenessRangeLeaseActiveMultiplier float64
+	// LivenessRangeLeaseRenewalFraction is like RangeLeaseRenewalFraction but for
+	// the liveness range lease. It is set to 0.6 such that, in the default
+	// configuration, a 2.5s liveness lease is renewed every 1.0s, giving it 1.0s
+	// to succeed before entering the 0.5s stasis period.
+	LivenessRangeLeaseRenewalFraction float64

 	// RaftLogTruncationThreshold controls how large a single Range's Raft log
 	// can grow. When a Range's Raft log grows above this size, the Range will
@@ -436,6 +509,12 @@ func (cfg *RaftConfig) SetDefaults() {
 	if cfg.RangeLeaseRenewalFraction == 0 {
 		cfg.RangeLeaseRenewalFraction = defaultRangeLeaseRenewalFraction
 	}
+	if cfg.LivenessRangeLeaseActiveMultiplier == 0 {
+		cfg.LivenessRangeLeaseActiveMultiplier = defaultLivenessRangeLeaseActiveMultiplier
+	}
+	if cfg.LivenessRangeLeaseRenewalFraction == 0 {
+		cfg.LivenessRangeLeaseRenewalFraction = defaultLivenessRangeLeaseRenewalFraction
+	}
 	// TODO(andrei): -1 is a special value for RangeLeaseRenewalFraction which
 	// really means "0" (never renew), except that the zero value means "use
 	// default". We can't turn the -1 into 0 here because, unfortunately,
@@ -467,15 +546,15 @@ func (cfg *RaftConfig) SetDefaults() {
 		cfg.RaftMaxInflightMsgs = defaultRaftMaxInflightMsgs
 	}
 	if cfg.RaftDelaySplitToSuppressSnapshotTicks == 0 {
-		// The Raft Ticks interval defaults to 200ms, and an election is 15
+		// The Raft Ticks interval defaults to 200ms, and an election is 10
 		// ticks. Add a generous amount of ticks to make sure even a backed up
 		// Raft snapshot queue is going to make progress when a (not overly
 		// concurrent) amount of splits happens.
 		// The generous amount should result in a delay sufficient to
 		// transmit at least one snapshot with the slow delay, which
-		// with default settings is max 64MB at 2MB/s, ie 32 seconds.
+		// with default settings is max 512MB at 32MB/s, ie 16 seconds.
 		//
-		// The resulting delay configured here is about 50s.
+		// The resulting delay configured here is 46s.
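+		// That is, (3*10 + 200) = 230 ticks at 200ms per tick = 46s.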
 		cfg.RaftDelaySplitToSuppressSnapshotTicks = 3*cfg.RaftElectionTimeoutTicks + 200
 	}

@@ -520,6 +599,30 @@ func (cfg RaftConfig) RangeLeaseRenewalDuration() time.Duration {
 	return rangeLeaseRenewal
 }

+// RangeLeaseAcquireTimeout is the timeout for lease acquisition.
+func (cfg RaftConfig) RangeLeaseAcquireTimeout() time.Duration {
+	// The Raft election timeout is randomized by a factor of 1-2x per replica
+	// (the first one will trigger the election), and reproposing the lease
+	// acquisition command can take up to 1 Raft election timeout. On average, we
+	// should be able to elect a leader and acquire a lease within 2 election
+	// timeouts, assuming negligible RTT; otherwise, lease acquisition will
+	// typically be retried, only adding a bit of tail latency.
+	return 2 * cfg.RaftElectionTimeout()
+}
+
+// LivenessRangeLeaseDurations computes durations for lease expiration and
+// renewal for the liveness range lease.
+func (cfg RaftConfig) LivenessRangeLeaseDurations() (active, renewal time.Duration) {
+	active = time.Duration(cfg.LivenessRangeLeaseActiveMultiplier *
+		float64(cfg.RangeLeaseActiveDuration()))
+	renewalFraction := cfg.LivenessRangeLeaseRenewalFraction
+	if renewalFraction == -1 {
+		renewalFraction = 0
+	}
+	renewal = time.Duration(float64(active) * renewalFraction)
+	return
+}
+
 // NodeLivenessDurations computes durations for node liveness expiration and
 // renewal based on a default multiple of Raft election timeout.
 func (cfg RaftConfig) NodeLivenessDurations() (livenessActive, livenessRenewal time.Duration) {
@@ -535,7 +638,7 @@ func (cfg RaftConfig) NodeLivenessDurations() (livenessActive, livenessRenewal t
 // propagate liveness. The replica which is the lease holder of the first range
 // gossips it.
 func (cfg RaftConfig) SentinelGossipTTL() time.Duration {
-	return cfg.RangeLeaseActiveDuration() / 2
+	return cfg.RangeLeaseActiveDuration()
 }

 // DefaultRetryOptions should be used for retrying most
diff --git a/pkg/base/config_test.go b/pkg/base/config_test.go
new file mode 100644
index 000000000000..a93bc5519e91
--- /dev/null
+++ b/pkg/base/config_test.go
@@ -0,0 +1,127 @@
+// Copyright 2022 The Cockroach Authors.
+//
+// Use of this software is governed by the Business Source License
+// included in the file licenses/BSL.txt.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0, included in the file
+// licenses/APL.txt.
+
+package base_test
+
+import (
+	"fmt"
+	"testing"
+	"time"
+
+	"github.com/cockroachdb/cockroach/pkg/base"
+	"github.com/cockroachdb/cockroach/pkg/testutils"
+	"github.com/cockroachdb/cockroach/pkg/testutils/echotest"
+	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
+	"github.com/davecgh/go-spew/spew"
+)
+
+func TestDefaultRaftConfig(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+
+	var cfg base.RaftConfig
+	cfg.SetDefaults()
+
+	// Assert the config and various derived values.
+	leaseActive, leaseRenewal := cfg.RangeLeaseDurations()
+	livenessActive, livenessRenewal := cfg.LivenessRangeLeaseDurations()
+	nodeActive, nodeRenewal := cfg.NodeLivenessDurations()
+	raftElectionTimeout := cfg.RaftElectionTimeout()
+	raftHeartbeatInterval := cfg.RaftTickInterval * time.Duration(cfg.RaftHeartbeatIntervalTicks)
+
+	{
+		var s string
+		s += spew.Sdump(cfg)
+		s += fmt.Sprintf("RaftHeartbeatInterval: %s\n", raftHeartbeatInterval)
+		s += fmt.Sprintf("RaftElectionTimeout: %s\n", raftElectionTimeout)
+		s += fmt.Sprintf("RangeLeaseDurations: active=%s renewal=%s\n", leaseActive, leaseRenewal)
+		s += fmt.Sprintf("RangeLeaseAcquireTimeout: %s\n", cfg.RangeLeaseAcquireTimeout())
+		s += fmt.Sprintf("LivenessRangeLeaseDurations: active=%s renewal=%s\n",
+			livenessActive, livenessRenewal)
+		s += fmt.Sprintf("NodeLivenessDurations: active=%s renewal=%s\n", nodeActive, nodeRenewal)
+		s += fmt.Sprintf("SentinelGossipTTL: %s\n", cfg.SentinelGossipTTL())
+		echotest.Require(t, s, testutils.TestDataPath(t, "raft_config"))
+	}
+
+	// Generate and assert the derived recovery intervals.
+	const (
+		minRTT                = 10 * time.Millisecond
+		maxRTT                = 400 * time.Millisecond // max GCP inter-region RTT is ~350ms
+		maxElectionMultiplier = 2
+	)
+
+	type interval struct {
+		name     string
+		min, max time.Duration
+	}
+
+	formatIntervals := func(name string, intervals []interval) string {
+		// Format intervals and append min/max sum.
+		var minSum, maxSum time.Duration
+		var formatted []interval
+		for _, ival := range intervals {
+			ival.name = "- " + ival.name
+			formatted = append(formatted, ival)
+			minSum += ival.min
+			maxSum += ival.max
+		}
+		formatted = append(formatted, interval{name: "Total latency", min: minSum, max: maxSum})
+
+		s := "// " + name + ":\n"
+		for _, ival := range formatted {
+			s += fmt.Sprintf("// %-46s [%5.2fs -%5.2fs]\n",
+				ival.name, ival.min.Seconds(), ival.max.Seconds())
+		}
+		return s
+	}
+
+	var s string
+	s += formatIntervals("Raft election", []interval{
+		{
+			"Heartbeat offset (0-1 heartbeat interval)",
+			-raftHeartbeatInterval,
+			0,
+		},
+		{
+			fmt.Sprintf("Election timeout (random 1x-%dx timeout)", maxElectionMultiplier),
+			raftElectionTimeout,
+			maxElectionMultiplier * raftElectionTimeout,
+		},
+		{
+			"Election (3x RTT: prevote, vote, append)",
+			3 * minRTT,
+			3 * maxRTT,
+		},
+	})
+	s += "//\n"
+	s += formatIntervals("Lease acquisition", []interval{
+		{
+			"Heartbeat offset (0-1 heartbeat interval)",
+			-leaseRenewal,
+			0,
+		},
+		{
+			"Lease expiration (constant)",
+			leaseActive,
+			leaseActive,
+		},
+		{
+			"Liveness epoch bump (2x RTT: CPut + append)",
+			2 * minRTT,
+			2 * maxRTT,
+		},
+		{
+			"Lease acquisition (1x RTT: append)",
+			minRTT,
+			maxRTT,
+		},
+	})
+
+	echotest.Require(t, s, testutils.TestDataPath(t, "raft_config_recovery"))
+}
diff --git a/pkg/base/testdata/raft_config b/pkg/base/testdata/raft_config
new file mode 100644
index 000000000000..afe0019c5078
--- /dev/null
+++ b/pkg/base/testdata/raft_config
@@ -0,0 +1,25 @@
+echo
+----
+(base.RaftConfig) {
+ RaftTickInterval: (time.Duration) 200ms,
+ RaftElectionTimeoutTicks: (int) 10,
+ RaftHeartbeatIntervalTicks: (int) 5,
+ RangeLeaseRaftElectionTimeoutMultiplier: (float64) 2.5,
+ RangeLeaseRenewalFraction: (float64) 0.5,
+ LivenessRangeLeaseActiveMultiplier: (float64) 0.5,
+ LivenessRangeLeaseRenewalFraction: (float64) 0.6,
+ RaftLogTruncationThreshold: (int64) 16777216,
+ RaftProposalQuota: (int64) 8388608,
+ RaftMaxUncommittedEntriesSize: (uint64) 16777216,
+ RaftMaxSizePerMsg: (uint64) 32768,
+ RaftMaxCommittedSizePerReady: (uint64) 67108864,
+ RaftMaxInflightMsgs: (int) 128,
+ RaftDelaySplitToSuppressSnapshotTicks: (int) 230
+}
+RaftHeartbeatInterval: 1s
+RaftElectionTimeout: 2s
+RangeLeaseDurations: active=5s renewal=2.5s
+RangeLeaseAcquireTimeout: 4s
+LivenessRangeLeaseDurations: active=2.5s renewal=1.5s
+NodeLivenessDurations: active=5s renewal=2.5s
+SentinelGossipTTL: 5s
diff --git a/pkg/base/testdata/raft_config_recovery b/pkg/base/testdata/raft_config_recovery
new file mode 100644
index 000000000000..b907b72a0307
--- /dev/null
+++ b/pkg/base/testdata/raft_config_recovery
@@ -0,0 +1,17 @@
+# Any changes in this result should be copied to the comment on
+# defaultRangeLeaseRaftElectionTimeoutMultiplier, and the corresponding
+# reasoning should be adjusted.
+echo
+----
+// Raft election:
+// - Heartbeat offset (0-1 heartbeat interval)    [-1.00s - 0.00s]
+// - Election timeout (random 1x-2x timeout)      [ 2.00s - 4.00s]
+// - Election (3x RTT: prevote, vote, append)     [ 0.03s - 1.20s]
+// Total latency                                  [ 1.03s - 5.20s]
+//
+// Lease acquisition:
+// - Heartbeat offset (0-1 heartbeat interval)    [-2.50s - 0.00s]
+// - Lease expiration (constant)                  [ 5.00s - 5.00s]
+// - Liveness epoch bump (2x RTT: CPut + append)  [ 0.02s - 0.80s]
+// - Lease acquisition (1x RTT: append)           [ 0.01s - 0.40s]
+// Total latency                                  [ 2.53s - 6.20s]
diff --git a/pkg/kv/kvserver/replica_range_lease.go b/pkg/kv/kvserver/replica_range_lease.go
index 58529c99a061..04d36de6e228 100644
--- a/pkg/kv/kvserver/replica_range_lease.go
+++ b/pkg/kv/kvserver/replica_range_lease.go
@@ -273,8 +273,14 @@ func (p *pendingLeaseRequest) InitOrJoinRequest(
 		// based leases, it's possible for the new leaseholder that's delayed
 		// in applying the lease transfer to maintain its lease (assuming the
 		// node it's on is able to heartbeat its liveness record).
+		var active time.Duration
+		if p.repl.isLivenessRangeRLocked() {
+			active, _ = p.repl.store.cfg.LivenessRangeLeaseDurations()
+		} else {
+			active = p.repl.store.cfg.RangeLeaseActiveDuration()
+		}
 		reqLease.Expiration = &hlc.Timestamp{}
-		*reqLease.Expiration = status.Now.ToTimestamp().Add(int64(p.repl.store.cfg.RangeLeaseActiveDuration()), 0)
+		*reqLease.Expiration = status.Now.ToTimestamp().Add(active.Nanoseconds(), 0)
 	} else {
 		// Get the liveness for the next lease holder and set the epoch in the lease request.
 		l, ok := p.repl.store.cfg.NodeLiveness.GetLiveness(nextLeaseHolder.NodeID)
@@ -803,6 +809,13 @@ func (r *Replica) requiresExpiringLeaseRLocked() bool {
 		r.mu.state.Desc.StartKey.Less(roachpb.RKey(keys.NodeLivenessKeyMax))
 }

+// isLivenessRangeRLocked returns true if this range contains the node liveness data.
+func (r *Replica) isLivenessRangeRLocked() bool {
+	return r.store.cfg.NodeLiveness != nil &&
+		r.mu.state.Desc.StartKey.Compare(roachpb.RKey(keys.NodeLivenessKeyMax)) < 0 &&
+		r.mu.state.Desc.EndKey.Compare(roachpb.RKey(keys.NodeLivenessPrefix)) > 0
+}
+
 // requestLeaseLocked executes a request to obtain or extend a lease
 // asynchronously and returns a channel on which the result will be posted. If
 // there's already a request in progress, we join in waiting for the results of
@@ -1032,10 +1045,12 @@ func NewLeaseTransferRejectedBecauseTargetMayNeedSnapshotError(
 // lease's expiration (and stasis period).
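+// For example, with the default 2.5s liveness range lease renewed 1.5s before
+// expiration and a 0.5s maximum clock offset, requests to the liveness range
+// may lead the local clock by at most 1s.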
 func (r *Replica) checkRequestTimeRLocked(now hlc.ClockTimestamp, reqTS hlc.Timestamp) error {
 	var leaseRenewal time.Duration
-	if r.requiresExpiringLeaseRLocked() {
-		_, leaseRenewal = r.store.cfg.RangeLeaseDurations()
-	} else {
+	if !r.requiresExpiringLeaseRLocked() {
 		_, leaseRenewal = r.store.cfg.NodeLivenessDurations()
+	} else if r.isLivenessRangeRLocked() {
+		_, leaseRenewal = r.store.cfg.LivenessRangeLeaseDurations()
+	} else {
+		_, leaseRenewal = r.store.cfg.RangeLeaseDurations()
 	}
 	leaseRenewalMinusStasis := leaseRenewal - r.store.Clock().MaxOffset()
 	if leaseRenewalMinusStasis < 0 {
@@ -1201,13 +1216,9 @@ func (r *Replica) TestingAcquireLease(ctx context.Context) (kvserverpb.LeaseStat
 func (r *Replica) redirectOnOrAcquireLeaseForRequest(
 	ctx context.Context, reqTS hlc.Timestamp, brSig signaller,
 ) (status kvserverpb.LeaseStatus, pErr *roachpb.Error) {
-	// We may need to hold a Raft election and repropose the lease acquisition
-	// command, which can take a couple of Raft election timeouts.
-	timeout := 2 * r.store.cfg.RaftElectionTimeout()
-
 	// Does not use RunWithTimeout(), because we do not want to mask the
 	// NotLeaseHolderError on context cancellation.
-	ctx, cancel := context.WithTimeout(ctx, timeout) // nolint:context
+	ctx, cancel := context.WithTimeout(ctx, r.store.cfg.RangeLeaseAcquireTimeout()) // nolint:context
 	defer cancel()

 	// Try fast-path.
@@ -1423,7 +1434,13 @@ func (r *Replica) shouldExtendLeaseRLocked(st kvserverpb.LeaseStatus) bool {
 	if _, ok := r.mu.pendingLeaseRequest.RequestPending(); ok {
 		return false
 	}
-	renewal := st.Lease.Expiration.Add(-r.store.cfg.RangeLeaseRenewalDuration().Nanoseconds(), 0)
+	var renewalDuration time.Duration
+	if r.isLivenessRangeRLocked() {
+		_, renewalDuration = r.store.cfg.LivenessRangeLeaseDurations()
+	} else {
+		renewalDuration = r.store.cfg.RangeLeaseRenewalDuration()
+	}
+	renewal := st.Lease.Expiration.Add(-renewalDuration.Nanoseconds(), 0)
 	return renewal.LessEq(st.Now.ToTimestamp())
 }

diff --git a/pkg/kv/kvserver/store.go b/pkg/kv/kvserver/store.go
index 1ca007a84542..015aa7d543a0 100644
--- a/pkg/kv/kvserver/store.go
+++ b/pkg/kv/kvserver/store.go
@@ -267,7 +267,7 @@ func testStoreConfig(clock *hlc.Clock, version roachpb.Version) StoreConfig {
 	// Use shorter Raft tick settings in order to minimize start up and failover
 	// time in tests.
 	sc.RaftHeartbeatIntervalTicks = 1
-	sc.RaftElectionTimeoutTicks = 3
+	sc.RaftElectionTimeoutTicks = 2
 	sc.RaftTickInterval = 100 * time.Millisecond
 	sc.SetDefaults()
 	return sc
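For reference, a minimal standalone sketch (not part of the patch; the variable names are mine, the constants are copied from the diff) of how the new defaults combine into the durations asserted in pkg/base/testdata/raft_config:

package main

import (
	"fmt"
	"time"
)

func main() {
	// Defaults taken from the diff above.
	tickInterval := 200 * time.Millisecond // RaftTickInterval
	electionTicks := 10                    // defaultRaftElectionTimeoutTicks
	leaseMultiplier := 2.5                 // defaultRangeLeaseRaftElectionTimeoutMultiplier
	leaseRenewalFraction := 0.5            // defaultRangeLeaseRenewalFraction
	livenessMultiplier := 0.5              // defaultLivenessRangeLeaseActiveMultiplier
	livenessRenewalFraction := 0.6         // defaultLivenessRangeLeaseRenewalFraction

	electionTimeout := time.Duration(electionTicks) * tickInterval                      // 2s
	leaseActive := time.Duration(leaseMultiplier * float64(electionTimeout))            // 5s
	leaseRenewal := time.Duration(leaseRenewalFraction * float64(leaseActive))          // 2.5s
	livenessActive := time.Duration(livenessMultiplier * float64(leaseActive))          // 2.5s
	livenessRenewal := time.Duration(livenessRenewalFraction * float64(livenessActive)) // 1.5s
	acquireTimeout := 2 * electionTimeout                                               // 4s, RangeLeaseAcquireTimeout

	fmt.Println(electionTimeout, leaseActive, leaseRenewal)      // 2s 5s 2.5s
	fmt.Println(livenessActive, livenessRenewal, acquireTimeout) // 2.5s 1.5s 4s
}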