Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

base: use separate lease duration for liveness range #93073

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ require (
github.com/containerd/containerd v1.5.4
github.com/coreos/go-oidc v2.2.1+incompatible
github.com/dave/dst v0.24.0
github.com/davecgh/go-spew v1.1.1
github.com/docker/distribution v2.7.1+incompatible
github.com/docker/docker v20.10.17+incompatible
github.com/docker/go-connections v0.4.0
Expand Down Expand Up @@ -251,7 +252,6 @@ require (
github.com/cespare/xxhash/v2 v2.1.2 // indirect
github.com/charmbracelet/bubbletea v0.22.2-0.20221007125427-0e76ba142aa1 // indirect
github.com/charmbracelet/lipgloss v0.6.0 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/decred/dcrd/dcrec/secp256k1/v4 v4.1.0 // indirect
github.com/dimchansky/utfbom v1.1.1 // indirect
github.com/djherbis/atime v1.1.0 // indirect
Expand Down
4 changes: 4 additions & 0 deletions pkg/base/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -52,21 +52,25 @@ go_test(
srcs = [
"addr_validation_test.go",
"cluster_id_test.go",
"config_test.go",
"main_test.go",
"node_id_test.go",
"store_spec_test.go",
],
args = ["-test.timeout=55s"],
data = glob(["testdata/**"]),
deps = [
":base",
"//pkg/roachpb",
"//pkg/security/securityassets",
"//pkg/security/securitytest",
"//pkg/testutils",
"//pkg/testutils/echotest",
"//pkg/util/leaktest",
"//pkg/util/log",
"//pkg/util/uuid",
"@com_github_cockroachdb_errors//:errors",
"@com_github_davecgh_go_spew//spew",
"@com_github_stretchr_testify//require",
],
)
Expand Down
123 changes: 113 additions & 10 deletions pkg/base/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ import (
)

// Base config defaults.
//
// When changing these, TestDefaultRaftConfig must also be updated via -rewrite,
// and the result copied to the defaultRangeLeaseRaftElectionTimeoutMultiplier
// comment with any adjustments to the surrounding reasoning.
const (
defaultInsecure = false
defaultUser = username.RootUser
Expand All @@ -52,7 +56,48 @@ const (

// defaultRangeLeaseRaftElectionTimeoutMultiplier specifies what multiple the
// leader lease active duration should be of the raft election timeout.
defaultRangeLeaseRaftElectionTimeoutMultiplier = 3
//
// Timers for Raft leadership election and lease expiration run in parallel.
// Although not required, we would like to elect a leader before the lease
// expires, such that we don't have to wait for a Raft election when we're
// ready to acquire the lease.
//
// The relevant operations and default time intervals are listed below. RTTs
// are assumed to range from 10ms to 400ms (maximum GCP inter-region latency).
// Heartbeat offsets refer to the duration from the last heartbeat to the node
// crash -- for example, with a heartbeat interval of 1s and a timeout of 3s,
// if the node crashes 1s after the previous heartbeat (just before it's about
// to heartbeat again), then the timeout will fire after 2s of unavailability
// rather than 3s, so the heartbeat offset is -1s.
//
// Raft election:
// - Heartbeat offset (0-1 heartbeat interval) [-1.00s - 0.00s]
// - Election timeout (random 1x-2x timeout) [ 2.00s - 4.00s]
// - Election (3x RTT: prevote, vote, append) [ 0.03s - 1.20s]
// Total latency [ 1.03s - 5.20s]
//
// Lease acquisition:
// - Heartbeat offset (0-1 heartbeat interval) [-2.50s - 0.00s]
// - Lease expiration (constant) [ 5.00s - 5.00s]
// - Liveness epoch bump (2x RTT: CPut + append) [ 0.02s - 0.80s]
// - Lease acquisition (1x RTT: append) [ 0.01s - 0.40s]
// Total latency [ 2.53s - 6.20s]
//
// (generated by TestDefaultRaftConfig)
//
// From the above, we note that the worst-case Raft election latency
// (4.03s-5.20s) is always less than the corresponding lease expiration +
// epoch bump time (5.02s-5.80s) regardless of RTT, such that the upper bound
// on unavailability is always given by the lease expiration time + 3x RTT
// (5.03s to 6.20s).
//
// With negligible RTT, the average latency is 3.75s for lease acquisition
// (-2.5s / 2 + 5.0s) and 2.5s for Raft elections ((-1.0s + 2.0s + 4.0s) / 2).
// However, the worst-case Raft election latency (4.0s) being greater than the
// best-case lease acquisition latency (2.5s) for a given RTT will skew the
// average upwards, so we can approximate the typical unavailability to be
// roughly 4.0s (the exact calculation is left as an exercise for the reader).
defaultRangeLeaseRaftElectionTimeoutMultiplier = 2.5

// NB: this can't easily become a variable as the UI hard-codes it to 10s.
// See https://github.com/cockroachdb/cockroach/issues/20310.
Expand All @@ -69,6 +114,18 @@ const (
// eagerly renewed 8 seconds into each lease.
defaultRangeLeaseRenewalFraction = 0.5

// defaultLivenessRangeMultiplier specifies the liveness range lease active
// duration as a multiple of the standard range lease active duration. When
// the liveness leaseholder is lost, other nodes will be unable to heartbeat
// and extend their own leases, which can cause a loss of all leases in the
// cluster. It is set to half of the regular lease active duration to
// counteract this.
defaultLivenessRangeLeaseActiveMultiplier = 0.5

// defaultLivenessRangeLeaseRenewalFraction is like
// defaultRangeLeaseRenewalFraction, but for the liveness range lease.
defaultLivenessRangeLeaseRenewalFraction = 0.6

// livenessRenewalFraction specifies what fraction the node liveness
// renewal duration should be of the node liveness duration. For example,
// with a value of 0.2 and a liveness duration of 10 seconds, each node's
Expand Down Expand Up @@ -149,10 +206,14 @@ var (
// RPCHeartbeatIntervalAndTimeout used by the RPC context.
defaultRPCHeartbeatIntervalAndTimeout = NetworkTimeout

// defaultRaftElectionTimeoutTicks specifies the number of Raft Tick
// invocations that must pass between elections.
// defaultRaftElectionTimeoutTicks specifies the minimum number of Raft ticks
// before holding an election. It is set low by default for faster failover.
// 1 second is sufficient for a network roundtrip and retransmit even in
// multi-region clusters (see NetworkTimeout), so 2 seconds should be enough.
// Furthermore, the actual election timeout per replica is multiplied by a
// random factor of 1-2.
defaultRaftElectionTimeoutTicks = envutil.EnvOrDefaultInt(
"COCKROACH_RAFT_ELECTION_TIMEOUT_TICKS", 15)
"COCKROACH_RAFT_ELECTION_TIMEOUT_TICKS", 10)

// defaultRaftLogTruncationThreshold specifies the upper bound that a single
// Range's Raft log can grow to before log truncations are triggered while at
Expand Down Expand Up @@ -344,8 +405,9 @@ type RaftConfig struct {
// RaftTickInterval is the resolution of the Raft timer.
RaftTickInterval time.Duration

// RaftElectionTimeoutTicks is the number of raft ticks before the
// previous election expires. This value is inherited by individual stores
// RaftElectionTimeoutTicks is the minimum number of raft ticks before holding
// an election. The actual election timeout is randomized by each replica to
// between 1-2 election timeouts. This value is inherited by individual stores
// unless overridden.
RaftElectionTimeoutTicks int

Expand All @@ -362,6 +424,17 @@ type RaftConfig struct {
// and a value of -1 means never preemptively renew the lease. A value of 1
// means always renew.
RangeLeaseRenewalFraction float64
// LivenessRangeLeaseActiveMultiplier specifies what multiple the liveness
// range active duration should be of the regular lease active duration. When
// the liveness leaseholder is lost, other nodes are unable to heartbeat and
// extend their epoch-based leases, which can cascade into a loss of all
// leases in the cluster, so this should be set lower than that.
LivenessRangeLeaseActiveMultiplier float64
// LivenessRangeLeaseRenewalFraction is like RangeLeaseRenewalFraction but for
// the liveness range lease. It is set to 0.6 such that, in the default
// configuration, a 2.5s liveness lease is renewed every 1.0s, giving it 1.0s
// to succeed before entering the 0.5s stasis period.
LivenessRangeLeaseRenewalFraction float64

// RaftLogTruncationThreshold controls how large a single Range's Raft log
// can grow. When a Range's Raft log grows above this size, the Range will
Expand Down Expand Up @@ -436,6 +509,12 @@ func (cfg *RaftConfig) SetDefaults() {
if cfg.RangeLeaseRenewalFraction == 0 {
cfg.RangeLeaseRenewalFraction = defaultRangeLeaseRenewalFraction
}
if cfg.LivenessRangeLeaseActiveMultiplier == 0 {
cfg.LivenessRangeLeaseActiveMultiplier = defaultLivenessRangeLeaseActiveMultiplier
}
if cfg.LivenessRangeLeaseRenewalFraction == 0 {
cfg.LivenessRangeLeaseRenewalFraction = defaultLivenessRangeLeaseRenewalFraction
}
// TODO(andrei): -1 is a special value for RangeLeaseRenewalFraction which
// really means "0" (never renew), except that the zero value means "use
// default". We can't turn the -1 into 0 here because, unfortunately,
Expand Down Expand Up @@ -467,15 +546,15 @@ func (cfg *RaftConfig) SetDefaults() {
cfg.RaftMaxInflightMsgs = defaultRaftMaxInflightMsgs
}
if cfg.RaftDelaySplitToSuppressSnapshotTicks == 0 {
// The Raft Ticks interval defaults to 200ms, and an election is 15
// The Raft Ticks interval defaults to 200ms, and an election is 10
// ticks. Add a generous amount of ticks to make sure even a backed up
// Raft snapshot queue is going to make progress when a (not overly
// concurrent) amount of splits happens.
// The generous amount should result in a delay sufficient to
// transmit at least one snapshot with the slow delay, which
// with default settings is max 64MB at 2MB/s, ie 32 seconds.
// with default settings is max 512MB at 32MB/s, ie 16 seconds.
//
// The resulting delay configured here is about 50s.
// The resulting delay configured here is 46s.
cfg.RaftDelaySplitToSuppressSnapshotTicks = 3*cfg.RaftElectionTimeoutTicks + 200
}

Expand Down Expand Up @@ -520,6 +599,30 @@ func (cfg RaftConfig) RangeLeaseRenewalDuration() time.Duration {
return rangeLeaseRenewal
}

// RangeLeaseAcquireTimeout is the timeout for lease acquisition.
func (cfg RaftConfig) RangeLeaseAcquireTimeout() time.Duration {
// The Raft election timeout is randomized by a factor of 1-2x per replica
// (the first one will trigger the election), and reproposing the lease
// acquisition command can take up to 1 Raft election timeout. On average, we
// should be able to elect a leader and acquire a lease within 2 election
// timeouts, assuming negligible RTT; otherwise, lease acquisition will
// typically be retried, only adding a bit of tail latency.
return 2 * cfg.RaftElectionTimeout()
}

// LivenessRangeLeaseDurations computes durations for range lease expiration and
// renewal based for the liveness range lease.
func (cfg RaftConfig) LivenessRangeLeaseDurations() (active, renewal time.Duration) {
active = time.Duration(cfg.LivenessRangeLeaseActiveMultiplier *
float64(cfg.RangeLeaseActiveDuration()))
renewalFraction := cfg.LivenessRangeLeaseRenewalFraction
if renewalFraction == -1 {
renewalFraction = 0
}
renewal = time.Duration(float64(active) * renewalFraction)
return
}

// NodeLivenessDurations computes durations for node liveness expiration and
// renewal based on a default multiple of Raft election timeout.
func (cfg RaftConfig) NodeLivenessDurations() (livenessActive, livenessRenewal time.Duration) {
Expand All @@ -535,7 +638,7 @@ func (cfg RaftConfig) NodeLivenessDurations() (livenessActive, livenessRenewal t
// propagate liveness. The replica which is the lease holder of the first range
// gossips it.
func (cfg RaftConfig) SentinelGossipTTL() time.Duration {
return cfg.RangeLeaseActiveDuration() / 2
return cfg.RangeLeaseActiveDuration()
}

// DefaultRetryOptions should be used for retrying most
Expand Down
127 changes: 127 additions & 0 deletions pkg/base/config_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
// Copyright 2022 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package base_test

import (
"fmt"
"testing"
"time"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/testutils/echotest"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/davecgh/go-spew/spew"
)

func TestDefaultRaftConfig(t *testing.T) {
defer leaktest.AfterTest(t)()

var cfg base.RaftConfig
cfg.SetDefaults()

// Assert the config and various derived values.
leaseActive, leaseRenewal := cfg.RangeLeaseDurations()
livenessActive, livenessRenewal := cfg.LivenessRangeLeaseDurations()
nodeActive, nodeRenewal := cfg.NodeLivenessDurations()
raftElectionTimeout := cfg.RaftElectionTimeout()
raftHeartbeatInterval := cfg.RaftTickInterval * time.Duration(cfg.RaftHeartbeatIntervalTicks)

{
var s string
s += spew.Sdump(cfg)
s += fmt.Sprintf("RaftHeartbeatInterval: %s\n", raftHeartbeatInterval)
s += fmt.Sprintf("RaftElectionTimeout: %s\n", raftElectionTimeout)
s += fmt.Sprintf("RangeLeaseDurations: active=%s renewal=%s\n", leaseActive, leaseRenewal)
s += fmt.Sprintf("RangeLeaseAcquireTimeout: %s\n", cfg.RangeLeaseAcquireTimeout())
s += fmt.Sprintf("LivenessRangeLeaseDurations: active=%s renewal=%s\n",
livenessActive, livenessRenewal)
s += fmt.Sprintf("NodeLivenessDurations: active=%s renewal=%s\n", nodeActive, nodeRenewal)
s += fmt.Sprintf("SentinelGossipTTL: %s\n", cfg.SentinelGossipTTL())
echotest.Require(t, s, testutils.TestDataPath(t, "raft_config"))
}

// Generate and assert the derived recovery intervals.
const (
minRTT = 10 * time.Millisecond
maxRTT = 400 * time.Millisecond // max GCP inter-region RTT is ~350ms
maxElectionMultiplier = 2
)

type interval struct {
name string
min, max time.Duration
}

formatIntervals := func(name string, intervals []interval) string {
// Format intervals and append min/max sum.
var minSum, maxSum time.Duration
var formatted []interval
for _, ival := range intervals {
ival.name = "- " + ival.name
formatted = append(formatted, ival)
minSum += ival.min
maxSum += ival.max
}
formatted = append(formatted, interval{name: "Total latency", min: minSum, max: maxSum})

s := "// " + name + ":\n"
for _, ival := range formatted {
s += fmt.Sprintf("// %-46s [%5.2fs -%5.2fs]\n",
ival.name, ival.min.Seconds(), ival.max.Seconds())
}
return s
}

var s string
s += formatIntervals("Raft election", []interval{
{
"Heartbeat offset (0-1 heartbeat interval)",
-raftHeartbeatInterval,
0,
},
{
fmt.Sprintf("Election timeout (random 1x-%dx timeout)", maxElectionMultiplier),
raftElectionTimeout,
maxElectionMultiplier * raftElectionTimeout,
},
{
"Election (3x RTT: prevote, vote, append)",
3 * minRTT,
3 * maxRTT,
},
})
s += "//\n"
s += formatIntervals("Lease acquisition", []interval{
{
"Heartbeat offset (0-1 heartbeat interval)",
-leaseRenewal,
0,
},
{
"Lease expiration (constant)",
leaseActive,
leaseActive,
},
{
"Liveness epoch bump (2x RTT: CPut + append)",
2 * minRTT,
2 * maxRTT,
},
{
"Lease acquisition (1x RTT: append)",
minRTT,
maxRTT,
},
})

echotest.Require(t, s, testutils.TestDataPath(t, "raft_config_recovery"))
}
25 changes: 25 additions & 0 deletions pkg/base/testdata/raft_config
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
echo
----
(base.RaftConfig) {
RaftTickInterval: (time.Duration) 200ms,
RaftElectionTimeoutTicks: (int) 10,
RaftHeartbeatIntervalTicks: (int) 5,
RangeLeaseRaftElectionTimeoutMultiplier: (float64) 2.5,
RangeLeaseRenewalFraction: (float64) 0.5,
LivenessRangeLeaseActiveMultiplier: (float64) 0.5,
LivenessRangeLeaseRenewalFraction: (float64) 0.6,
RaftLogTruncationThreshold: (int64) 16777216,
RaftProposalQuota: (int64) 8388608,
RaftMaxUncommittedEntriesSize: (uint64) 16777216,
RaftMaxSizePerMsg: (uint64) 32768,
RaftMaxCommittedSizePerReady: (uint64) 67108864,
RaftMaxInflightMsgs: (int) 128,
RaftDelaySplitToSuppressSnapshotTicks: (int) 230
}
RaftHeartbeatInterval: 1s
RaftElectionTimeout: 2s
RangeLeaseDurations: active=5s renewal=2.5s
RangeLeaseAcquireTimeout: 4s
LivenessRangeLeaseDurations: active=2.5s renewal=1.5s
NodeLivenessDurations: active=5s renewal=2.5s
SentinelGossipTTL: 5s
Loading