hashicorp · ShimmerGlass · Nov 21, 2018 · Nov 22, 2018
diff --git a/agent/agent.go b/agent/agent.go
@@ -1113,6 +1113,8 @@ func (a *Agent) consulConfig() (*consul.Config, error) {
 		return nil, fmt.Errorf("Failed to configure keyring: %v", err)
 	}
 
+	base.WatchSoftLimit = a.config.WatchSoftLimit
+
 	return base, nil
 }
 

diff --git a/agent/config/builder.go b/agent/config/builder.go
@@ -843,6 +843,7 @@ func (b *Builder) Build() (rt RuntimeConfig, err error) {
 		VerifyOutgoing:                          b.boolVal(c.VerifyOutgoing),
 		VerifyServerHostname:                    b.boolVal(c.VerifyServerHostname),
 		Watches:                                 c.Watches,
+		WatchSoftLimit:                          b.intValWithDefault(c.Performance.WatchSoftLimit, consul.DefaultSoftWatchLimit),
 	}
 
 	if rt.BootstrapExpect == 1 {
@@ -1308,6 +1309,13 @@ func (b *Builder) intVal(v *int) int {
 	return *v
 }
 
+func (b *Builder) intValWithDefault(v *int, defaultVal int) int {
+	if v != nil {
+		return *v // an explicitly supplied value always wins, even zero or negative
+	}
+	return defaultVal // pointer unset: fall back to the caller's default
+}
+
 func (b *Builder) portVal(name string, v *int) int {
 	if v == nil || *v <= 0 {
 		return -1

diff --git a/agent/config/config.go b/agent/config/config.go
@@ -566,6 +566,7 @@ type Performance struct {
 	LeaveDrainTime *string `json:"leave_drain_time,omitempty" hcl:"leave_drain_time" mapstructure:"leave_drain_time"`
 	RaftMultiplier *int    `json:"raft_multiplier,omitempty" hcl:"raft_multiplier" mapstructure:"raft_multiplier"` // todo(fs): validate as uint
 	RPCHoldTimeout *string `json:"rpc_hold_timeout" hcl:"rpc_hold_timeout" mapstructure:"rpc_hold_timeout"`
+	WatchSoftLimit *int    `json:"watch_soft_limit,omitempty" hcl:"watch_soft_limit" mapstructure:"watch_soft_limit"`
 }
 
 type Telemetry struct {

diff --git a/agent/config/runtime.go b/agent/config/runtime.go
@@ -1413,6 +1413,13 @@ type RuntimeConfig struct {
 	// ]
 	//
 	Watches []map[string]interface{}
+
+	// WatchSoftLimit is used as a soft limit to cap how many watches we allow
+	// for a given blocking query. If this is exceeded, then we will use a
+	// higher-level watch that's less fine-grained.
+	//
+	// hcl: performance { watch_soft_limit = int }
+	WatchSoftLimit int
 }
 
 // IncomingHTTPSConfig returns the TLS configuration for HTTPS

diff --git a/agent/config/runtime_test.go b/agent/config/runtime_test.go
@@ -18,6 +18,7 @@ import (
 	"testing"
 	"time"
 
+	"github.com/hashicorp/consul/agent/consul"
 	"github.com/hashicorp/consul/agent/structs"
 	"github.com/hashicorp/consul/lib"
 	"github.com/hashicorp/consul/testutil"
@@ -3020,7 +3021,8 @@ func TestFullConfig(t *testing.T) {
 			"performance": {
 				"leave_drain_time": "8265s",
 				"raft_multiplier": 5,
-				"rpc_hold_timeout": "15707s"
+				"rpc_hold_timeout": "15707s",
+				"watch_soft_limit": ` + fmt.Sprint(consul.DefaultSoftWatchLimit) + `
 			},
 			"pid_file": "43xN80Km",
 			"ports": {
@@ -3569,6 +3571,7 @@ func TestFullConfig(t *testing.T) {
 				leave_drain_time = "8265s"
 				raft_multiplier = 5
 				rpc_hold_timeout = "15707s"
+				watch_soft_limit = ` + fmt.Sprint(consul.DefaultSoftWatchLimit) + `
 			}
 			pid_file = "43xN80Km"
 			ports {
@@ -4538,6 +4541,7 @@ func TestFullConfig(t *testing.T) {
 				"args":       []interface{}{"dltjDJ2a", "flEa7C2d"},
 			},
 		},
+		WatchSoftLimit: consul.DefaultSoftWatchLimit,
 	}
 
 	warns := []string{
@@ -5122,7 +5126,8 @@ func TestSanitize(t *testing.T) {
 		"VerifyServerHostname": false,
 		"Version": "",
 		"VersionPrerelease": "",
-		"Watches": []
+		"Watches": [],
+		"WatchSoftLimit": 0
 	}`
 	b, err := json.MarshalIndent(rt.Sanitized(), "", "    ")
 	if err != nil {

diff --git a/agent/connect/ca/provider_consul_test.go b/agent/connect/ca/provider_consul_test.go
@@ -46,7 +46,7 @@ func (c *consulCAMockDelegate) ApplyCARequest(req *structs.CARequest) error {
 }
 
 func newMockDelegate(t *testing.T, conf *structs.CAConfiguration) *consulCAMockDelegate {
-	s, err := state.NewStateStore(nil)
+	s, err := state.NewStateStore(nil, 1024, nil)
 	if err != nil {
 		t.Fatalf("err: %s", err)
 	}

diff --git a/agent/consul/config.go b/agent/consul/config.go
@@ -33,6 +33,11 @@ const (
 	// MaxRaftMultiplier is a fairly arbitrary upper bound that limits the
 	// amount of performance detuning that's possible.
 	MaxRaftMultiplier uint = 10
+
+	// DefaultSoftWatchLimit is used as a soft limit to cap how many watches we allow
+	// for a given blocking query. If this is exceeded, then we will use a
+	// higher-level watch that's less fine-grained.
+	DefaultSoftWatchLimit = 2048
 )
 
 var (
@@ -380,6 +385,11 @@ type Config struct {
 
 	// ConnectReplicationToken is used to control Intention replication.
 	ConnectReplicationToken string
+
+	// WatchSoftLimit is used as a soft limit to cap how many watches we allow
+	// for a given blocking query. If this is exceeded, then we will use a
+	// higher-level watch that's less fine-grained.
+	WatchSoftLimit int
 }
 
 // CheckProtocolVersion validates the protocol version.

diff --git a/agent/consul/fsm/commands_oss_test.go b/agent/consul/fsm/commands_oss_test.go
@@ -41,7 +41,7 @@ func generateRandomCoordinate() *coordinate.Coordinate {
 
 func TestFSM_RegisterNode(t *testing.T) {
 	t.Parallel()
-	fsm, err := New(nil, os.Stderr)
+	fsm, err := New(nil, testWatchLimit, os.Stderr)
 	if err != nil {
 		t.Fatalf("err: %v", err)
 	}
@@ -85,7 +85,7 @@ func TestFSM_RegisterNode(t *testing.T) {
 
 func TestFSM_RegisterNode_Service(t *testing.T) {
 	t.Parallel()
-	fsm, err := New(nil, os.Stderr)
+	fsm, err := New(nil, testWatchLimit, os.Stderr)
 	if err != nil {
 		t.Fatalf("err: %v", err)
 	}
@@ -148,7 +148,7 @@ func TestFSM_RegisterNode_Service(t *testing.T) {
 
 func TestFSM_DeregisterService(t *testing.T) {
 	t.Parallel()
-	fsm, err := New(nil, os.Stderr)
+	fsm, err := New(nil, testWatchLimit, os.Stderr)
 	if err != nil {
 		t.Fatalf("err: %v", err)
 	}
@@ -210,7 +210,7 @@ func TestFSM_DeregisterService(t *testing.T) {
 
 func TestFSM_DeregisterCheck(t *testing.T) {
 	t.Parallel()
-	fsm, err := New(nil, os.Stderr)
+	fsm, err := New(nil, testWatchLimit, os.Stderr)
 	if err != nil {
 		t.Fatalf("err: %v", err)
 	}
@@ -272,7 +272,7 @@ func TestFSM_DeregisterCheck(t *testing.T) {
 
 func TestFSM_DeregisterNode(t *testing.T) {
 	t.Parallel()
-	fsm, err := New(nil, os.Stderr)
+	fsm, err := New(nil, testWatchLimit, os.Stderr)
 	if err != nil {
 		t.Fatalf("err: %v", err)
 	}
@@ -349,7 +349,7 @@ func TestFSM_DeregisterNode(t *testing.T) {
 
 func TestFSM_KVSDelete(t *testing.T) {
 	t.Parallel()
-	fsm, err := New(nil, os.Stderr)
+	fsm, err := New(nil, testWatchLimit, os.Stderr)
 	if err != nil {
 		t.Fatalf("err: %v", err)
 	}
@@ -395,7 +395,7 @@ func TestFSM_KVSDelete(t *testing.T) {
 
 func TestFSM_KVSDeleteTree(t *testing.T) {
 	t.Parallel()
-	fsm, err := New(nil, os.Stderr)
+	fsm, err := New(nil, testWatchLimit, os.Stderr)
 	if err != nil {
 		t.Fatalf("err: %v", err)
 	}
@@ -442,7 +442,7 @@ func TestFSM_KVSDeleteTree(t *testing.T) {
 
 func TestFSM_KVSDeleteCheckAndSet(t *testing.T) {
 	t.Parallel()
-	fsm, err := New(nil, os.Stderr)
+	fsm, err := New(nil, testWatchLimit, os.Stderr)
 	if err != nil {
 		t.Fatalf("err: %v", err)
 	}
@@ -498,7 +498,7 @@ func TestFSM_KVSDeleteCheckAndSet(t *testing.T) {
 
 func TestFSM_KVSCheckAndSet(t *testing.T) {
 	t.Parallel()
-	fsm, err := New(nil, os.Stderr)
+	fsm, err := New(nil, testWatchLimit, os.Stderr)
 	if err != nil {
 		t.Fatalf("err: %v", err)
 	}
@@ -555,7 +555,7 @@ func TestFSM_KVSCheckAndSet(t *testing.T) {
 
 func TestFSM_KVSLock(t *testing.T) {
 	t.Parallel()
-	fsm, err := New(nil, os.Stderr)
+	fsm, err := New(nil, testWatchLimit, os.Stderr)
 	if err != nil {
 		t.Fatalf("err: %v", err)
 	}
@@ -600,7 +600,7 @@ func TestFSM_KVSLock(t *testing.T) {
 
 func TestFSM_KVSUnlock(t *testing.T) {
 	t.Parallel()
-	fsm, err := New(nil, os.Stderr)
+	fsm, err := New(nil, testWatchLimit, os.Stderr)
 	if err != nil {
 		t.Fatalf("err: %v", err)
 	}
@@ -663,7 +663,7 @@ func TestFSM_KVSUnlock(t *testing.T) {
 
 func TestFSM_CoordinateUpdate(t *testing.T) {
 	t.Parallel()
-	fsm, err := New(nil, os.Stderr)
+	fsm, err := New(nil, testWatchLimit, os.Stderr)
 	if err != nil {
 		t.Fatalf("err: %v", err)
 	}
@@ -704,7 +704,7 @@ func TestFSM_CoordinateUpdate(t *testing.T) {
 
 func TestFSM_SessionCreate_Destroy(t *testing.T) {
 	t.Parallel()
-	fsm, err := New(nil, os.Stderr)
+	fsm, err := New(nil, testWatchLimit, os.Stderr)
 	if err != nil {
 		t.Fatalf("err: %v", err)
 	}
@@ -784,7 +784,7 @@ func TestFSM_SessionCreate_Destroy(t *testing.T) {
 
 func TestFSM_ACL_CRUD(t *testing.T) {
 	t.Parallel()
-	fsm, err := New(nil, os.Stderr)
+	fsm, err := New(nil, testWatchLimit, os.Stderr)
 	if err != nil {
 		t.Fatalf("err: %v", err)
 	}
@@ -902,7 +902,7 @@ func TestFSM_ACL_CRUD(t *testing.T) {
 
 func TestFSM_PreparedQuery_CRUD(t *testing.T) {
 	t.Parallel()
-	fsm, err := New(nil, os.Stderr)
+	fsm, err := New(nil, testWatchLimit, os.Stderr)
 	if err != nil {
 		t.Fatalf("err: %v", err)
 	}
@@ -1000,7 +1000,7 @@ func TestFSM_PreparedQuery_CRUD(t *testing.T) {
 
 func TestFSM_TombstoneReap(t *testing.T) {
 	t.Parallel()
-	fsm, err := New(nil, os.Stderr)
+	fsm, err := New(nil, testWatchLimit, os.Stderr)
 	if err != nil {
 		t.Fatalf("err: %v", err)
 	}
@@ -1048,7 +1048,7 @@ func TestFSM_TombstoneReap(t *testing.T) {
 
 func TestFSM_Txn(t *testing.T) {
 	t.Parallel()
-	fsm, err := New(nil, os.Stderr)
+	fsm, err := New(nil, testWatchLimit, os.Stderr)
 	if err != nil {
 		t.Fatalf("err: %v", err)
 	}
@@ -1090,7 +1090,7 @@ func TestFSM_Txn(t *testing.T) {
 
 func TestFSM_Autopilot(t *testing.T) {
 	t.Parallel()
-	fsm, err := New(nil, os.Stderr)
+	fsm, err := New(nil, testWatchLimit, os.Stderr)
 	if err != nil {
 		t.Fatalf("err: %v", err)
 	}
@@ -1154,7 +1154,7 @@ func TestFSM_Intention_CRUD(t *testing.T) {
 	t.Parallel()
 
 	assert := assert.New(t)
-	fsm, err := New(nil, os.Stderr)
+	fsm, err := New(nil, testWatchLimit, os.Stderr)
 	assert.Nil(err)
 
 	// Create a new intention.
@@ -1223,7 +1223,7 @@ func TestFSM_CAConfig(t *testing.T) {
 	t.Parallel()
 
 	assert := assert.New(t)
-	fsm, err := New(nil, os.Stderr)
+	fsm, err := New(nil, testWatchLimit, os.Stderr)
 	assert.Nil(err)
 
 	// Set the autopilot config using a request.
@@ -1290,7 +1290,7 @@ func TestFSM_CARoots(t *testing.T) {
 	t.Parallel()
 
 	assert := assert.New(t)
-	fsm, err := New(nil, os.Stderr)
+	fsm, err := New(nil, testWatchLimit, os.Stderr)
 	assert.Nil(err)
 
 	// Roots
@@ -1322,7 +1322,7 @@ func TestFSM_CABuiltinProvider(t *testing.T) {
 	t.Parallel()
 
 	assert := assert.New(t)
-	fsm, err := New(nil, os.Stderr)
+	fsm, err := New(nil, testWatchLimit, os.Stderr)
 	assert.Nil(err)
 
 	// Provider state.

diff --git a/agent/consul/fsm/fsm.go b/agent/consul/fsm/fsm.go
@@ -13,6 +13,10 @@ import (
 	"github.com/hashicorp/raft"
 )
 
+const (
+	testWatchLimit = 1024 // NOTE(review): every use visible in this change is in a _test.go file; consider declaring it in a test file instead
+)
+
 // msgpackHandle is a shared handle for encoding/decoding msgpack payloads
 var msgpackHandle = &codec.MsgpackHandle{
 	RawToString: true,
@@ -59,22 +63,26 @@ type FSM struct {
 	stateLock sync.RWMutex
 	state     *state.Store
 
-	gc *state.TombstoneGC
+	gc         *state.TombstoneGC
+	watchLimit int
 }
 
 // New is used to construct a new FSM with a blank state.
-func New(gc *state.TombstoneGC, logOutput io.Writer) (*FSM, error) {
-	stateNew, err := state.NewStateStore(gc)
+func New(gc *state.TombstoneGC, watchLimit int, logOutput io.Writer) (*FSM, error) {
+	logger := log.New(logOutput, "", log.LstdFlags)
+
+	stateNew, err := state.NewStateStore(gc, watchLimit, logger)
 	if err != nil {
 		return nil, err
 	}
 
 	fsm := &FSM{
-		logOutput: logOutput,
-		logger:    log.New(logOutput, "", log.LstdFlags),
-		apply:     make(map[structs.MessageType]command),
-		state:     stateNew,
-		gc:        gc,
+		logOutput:  logOutput,
+		logger:     logger,
+		apply:      make(map[structs.MessageType]command),
+		state:      stateNew,
+		gc:         gc,
+		watchLimit: watchLimit,
 	}
 
 	// Build out the apply dispatch table based on the registered commands.
@@ -136,7 +144,7 @@ func (c *FSM) Restore(old io.ReadCloser) error {
 	defer old.Close()
 
 	// Create a new state store.
-	stateNew, err := state.NewStateStore(c.gc)
+	stateNew, err := state.NewStateStore(c.gc, c.watchLimit, c.logger)
 	if err != nil {
 		return err
 	}

diff --git a/agent/consul/fsm/fsm_test.go b/agent/consul/fsm/fsm_test.go
@@ -39,7 +39,7 @@ func makeLog(buf []byte) *raft.Log {
 
 func TestFSM_IgnoreUnknown(t *testing.T) {
 	t.Parallel()
-	fsm, err := New(nil, os.Stderr)
+	fsm, err := New(nil, testWatchLimit, os.Stderr)
 	assert.Nil(t, err)
 
 	// Create a new reap request
-Original file line number
+Diff line change
@@ Expand Up / @@ -1113,6 +1113,8 @@ func (a *Agent) consulConfig() (*consul.Config, error) { @@
     		return nil, fmt.Errorf("Failed to configure keyring: %v", err)
     	}
+    	base.WatchSoftLimit = a.config.WatchSoftLimit
     	return base, nil
     }
@@ Expand Down @@