[Heartbeat] Fix scheduler job type limit algorithm (#27559) (#27573)
Previously, heartbeat would break when running with mode: all, since that creates multiple terminal tasks for a single job. Each of these would attempt to release the job-type limit semaphore, when only the last one should.

This also refactors the recursive job-running code into an OO-style type (schedJob) to make it more readable.
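
In effect, the semaphore release now keys off a live-task count instead of firing once per terminal task. Below is a minimal single-threaded sketch of that accounting, using hypothetical names (taskFunc, runTask, active) rather than the scheduler's actual types, and running continuations inline where the real code spawns goroutines guarded by a WaitGroup and an atomic counter:

package main

import (
	"context"
	"fmt"

	"golang.org/x/sync/semaphore"
)

// taskFunc mirrors the shape of the scheduler's TaskFunc: a task may
// return continuation tasks to run next.
type taskFunc func() []taskFunc

func main() {
	sem := semaphore.NewWeighted(1)          // per-job-type limit, e.g. browser.limit: 1
	_ = sem.Acquire(context.Background(), 1) // one slot acquired per job

	active := 1 // live-task count, starting at 1 for the entry point
	var runTask func(t taskFunc)
	runTask = func(t taskFunc) {
		conts := t()
		active += len(conts)
		// The old condition, len(conts) == 0, fired once per terminal task:
		// three times for the job below, over-releasing the semaphore.
		// The new condition releases only when this task is the last one live.
		if active == 1 {
			sem.Release(1)
			fmt.Println("job slot released exactly once")
		}
		active-- // this task is finished
		for _, c := range conts {
			runTask(c)
		}
	}

	terminal := taskFunc(func() []taskFunc { return nil })
	// A mode: all style job whose entry point forks into three terminal tasks.
	runTask(func() []taskFunc { return []taskFunc{terminal, terminal, terminal} })
}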

(cherry picked from commit d561a55)

Co-authored-by: Andrew Cholakian <andrew@andrewvc.com>
mergify[bot] and andrewvc authored Aug 24, 2021
1 parent 48fae81 commit 64b4eed
Showing 7 changed files with 282 additions and 135 deletions.
11 changes: 11 additions & 0 deletions heartbeat/_meta/config/beat.reference.yml.tmpl
@@ -295,3 +295,14 @@ heartbeat.scheduler:

# Set the scheduler's time zone
#location: ''

heartbeat.jobs:
# Limit the number of concurrent monitors executed by heartbeat. This differs from
# heartbeat.scheduler.limit in that it maps to individual monitors rather than the
# subtasks of monitors. For non-browser monitors a subtask usually corresponds to a
# single file descriptor.
# This feature is most useful for the browser type.
#browser.limit: 1
#http.limit: 10
#tcp.limit: 10
#icmp.limit: 10
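
For example, to serialize browser monitors (the type this limit mainly targets) while leaving the other types unconstrained, it is enough to uncomment the one key. A hypothetical enabled configuration, not a recommended default:

heartbeat.jobs:
  browser.limit: 1
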
10 changes: 10 additions & 0 deletions heartbeat/heartbeat.reference.yml
@@ -296,6 +296,16 @@ heartbeat.scheduler:
# Set the scheduler's time zone
#location: ''

heartbeat.jobs:
# Limit the number of concurrent monitors executed by heartbeat. This differs from
# heartbeat.scheduler.limit in that it maps to individual monitors rather than the
# subtasks of monitors. For non-browser monitors a subtask usually corresponds to a
# single file descriptor.
# This feature is most useful for the browser type.
#browser.limit: 1
#http.limit: 10
#tcp.limit: 10
#icmp.limit: 10
# ================================== General ===================================

# The name of the shipper that publishes the network data. It can be used to group
124 changes: 124 additions & 0 deletions heartbeat/scheduler/schedjob.go
@@ -0,0 +1,124 @@
// Licensed to Elasticsearch B.V. under one or more contributor
// license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright
// ownership. Elasticsearch B.V. licenses this file to you under
// the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

package scheduler

import (
"context"
"sync"
"time"

"golang.org/x/sync/semaphore"

"github.com/elastic/beats/v7/libbeat/common/atomic"
)

type schedJob struct {
id string
ctx context.Context
scheduler *Scheduler
wg *sync.WaitGroup
entrypoint TaskFunc
jobLimitSem *semaphore.Weighted
activeTasks atomic.Int
}

// newSchedJob returns a new schedJob wrapping the given task, wired to the
// scheduler's limit semaphore for the given jobType, if one is configured.
func newSchedJob(ctx context.Context, s *Scheduler, id string, jobType string, task TaskFunc) *schedJob {
return &schedJob{
id: id,
ctx: ctx,
scheduler: s,
jobLimitSem: s.jobLimitSem[jobType],
entrypoint: task,
activeTasks: atomic.MakeInt(0),
wg: &sync.WaitGroup{},
}
}

// run executes the job's entry point task, blocking until all subtasks are completed.
// Subtasks are run in separate goroutines.
// It returns the time execution began on the first task.
func (sj *schedJob) run() (startedAt time.Time) {
sj.wg.Add(1)
sj.activeTasks.Inc()
if sj.jobLimitSem != nil {
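// If the context has been cancelled, Acquire returns without acquiring;
// runTask below then exits early via ctx.Done() and never releases this
// semaphore, so the accounting stays balanced.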
sj.jobLimitSem.Acquire(sj.ctx, 1)
}

startedAt = sj.runTask(sj.entrypoint)

sj.wg.Wait()
return startedAt
}

// runTask runs an individual task and its continuations until none are left, with as much parallelism as possible.
// Since task funcs can emit continuations recursively we need a function that can
// execute itself recursively.
// The wait group passed into this function expects to already have its count incremented by one.
func (sj *schedJob) runTask(task TaskFunc) time.Time {
defer sj.wg.Done()
defer sj.activeTasks.Dec()

// The accounting for waiting/active tasks is done using atomics.
// Absolute accuracy is not critical here, so the gap between modifying waitingTasks and activeTasks is acceptable.
sj.scheduler.stats.waitingTasks.Inc()

// Acquire an execution slot in keeping with heartbeat.scheduler.limit.
// This should block until resources are available.
// In the case where the semaphore has free resources immediately
// it will not block and will not check the cancelled status of the
// context, which is OK, because we check it later anyway.
limitErr := sj.scheduler.limitSem.Acquire(sj.ctx, 1)
sj.scheduler.stats.waitingTasks.Dec()
if limitErr == nil {
defer sj.scheduler.limitSem.Release(1)
}

// Record the time this task started now that we have a resource to execute with
startedAt := time.Now()

// Check if the scheduler has been shut down. If so, exit early
select {
case <-sj.ctx.Done():
return startedAt
default:
sj.scheduler.stats.activeTasks.Inc()

continuations := task(sj.ctx)
sj.scheduler.stats.activeTasks.Dec()

sj.wg.Add(len(continuations))
sj.activeTasks.Add(len(continuations))
for _, cont := range continuations {
// Run continuations in parallel, note that these each will acquire their own slots
// We can discard the started at times for continuations as those are
// irrelevant
go sj.runTask(cont)
}
// There is always at least 1 task (the current one), if that's all, then we know
// there are no other jobs active or pending, and we can release the jobLimitSem
if sj.jobLimitSem != nil && sj.activeTasks.Load() == 1 {
sj.jobLimitSem.Release(1)
}
}

return startedAt
}
122 changes: 122 additions & 0 deletions heartbeat/scheduler/schedjob_test.go
@@ -0,0 +1,122 @@
// Licensed to Elasticsearch B.V. under one or more contributor
// license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright
// ownership. Elasticsearch B.V. licenses this file to you under
// the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

package scheduler

import (
"context"
"sync"
"testing"
"time"

"github.com/stretchr/testify/require"

"github.com/elastic/beats/v7/heartbeat/config"
batomic "github.com/elastic/beats/v7/libbeat/common/atomic"
"github.com/elastic/beats/v7/libbeat/monitoring"
)

func TestSchedJobRun(t *testing.T) {
cancelledCtx, cancel := context.WithCancel(context.Background())
cancel()

testCases := []struct {
name string
jobCtx context.Context
overLimit bool
shouldRunTask bool
}{
{
"context not cancelled",
context.Background(),
false,
true,
},
{
"context cancelled",
cancelledCtx,
false,
false,
},
{
"context cancelled over limit",
cancelledCtx,
true,
false,
},
}

for _, testCase := range testCases {
t.Run(testCase.name, func(t *testing.T) {
limit := int64(100)
s := NewWithLocation(limit, monitoring.NewRegistry(), tarawaTime(), nil)

if testCase.overLimit {
s.limitSem.Acquire(context.Background(), limit)
}

wg := &sync.WaitGroup{}
wg.Add(1)
executed := batomic.MakeBool(false)

tf := func(ctx context.Context) []TaskFunc {
executed.Store(true)
return nil
}

beforeStart := time.Now()
sj := newSchedJob(testCase.jobCtx, s, "myid", "atype", tf)
startedAt := sj.run()

// This will panic if the code in schedjob.go does not check the error value
// of s.limitSem.Acquire and releases an unacquired resource. In that case
// this Release frees one more resource than allowed, causing the panic.
if testCase.overLimit {
s.limitSem.Release(limit)
}

require.Equal(t, testCase.shouldRunTask, executed.Load())
require.True(t, startedAt.Equal(beforeStart) || startedAt.After(beforeStart))
})
}
}

// TestRecursiveForkingJob tests that a schedJob that splits into multiple parallel pieces executes without error
func TestRecursiveForkingJob(t *testing.T) {
s := NewWithLocation(1000, monitoring.NewRegistry(), tarawaTime(), map[string]config.JobLimit{
"atype": {Limit: 1},
})
ran := batomic.NewInt(0)

var terminalTf TaskFunc = func(ctx context.Context) []TaskFunc {
ran.Inc()
return nil
}
var forkingTf TaskFunc = func(ctx context.Context) []TaskFunc {
ran.Inc()
return []TaskFunc{
terminalTf, terminalTf, terminalTf,
}
}

sj := newSchedJob(context.Background(), s, "myid", "atype", forkingTf)

sj.run()
require.Equal(t, 4, ran.Load())
}
69 changes: 2 additions & 67 deletions heartbeat/scheduler/scheduler.go
@@ -22,7 +22,6 @@ import (
"errors"
"fmt"
"math"
"sync"
"time"

"golang.org/x/sync/semaphore"
@@ -207,7 +206,8 @@ func (s *Scheduler) Add(sched Schedule, id string, entrypoint TaskFunc, jobType
default:
}
s.stats.activeJobs.Inc()
lastRanAt = s.runRecursiveJob(jobCtx, entrypoint, jobType)
debugf("Job '%s' started", id)
lastRanAt := newSchedJob(jobCtx, s, id, jobType, entrypoint).run()
s.stats.activeJobs.Dec()
s.runOnce(sched.Next(lastRanAt), taskFn)
debugf("Job '%v' returned at %v", id, time.Now())
@@ -241,68 +241,3 @@ func (s *Scheduler) runOnce(runAt time.Time, taskFn timerqueue.TimerTaskFn) {
asyncTask := func(now time.Time) { go taskFn(now) }
s.timerQueue.Push(runAt, asyncTask)
}

// runRecursiveJob runs the entry point for a job, blocking until all subtasks are completed.
// Subtasks are run in separate goroutines.
// returns the time execution began on its first task
func (s *Scheduler) runRecursiveJob(jobCtx context.Context, task TaskFunc, jobType string) (startedAt time.Time) {
wg := &sync.WaitGroup{}
jobSem := s.jobLimitSem[jobType]
if jobSem != nil {
jobSem.Acquire(jobCtx, 1)
}
wg.Add(1)
startedAt = s.runRecursiveTask(jobCtx, task, wg, jobSem)
wg.Wait()
return startedAt
}

// runRecursiveTask runs an individual task and its continuations until none are left with as much parallelism as possible.
// Since task funcs can emit continuations recursively we need a function to execute
// recursively.
// The wait group passed into this function expects to already have its count incremented by one.
func (s *Scheduler) runRecursiveTask(jobCtx context.Context, task TaskFunc, wg *sync.WaitGroup, jobSem *semaphore.Weighted) (startedAt time.Time) {
defer wg.Done()

// The accounting for waiting/active tasks is done using atomics.
// Absolute accuracy is not critical here so the gap between modifying waitingTasks and activeJobs is acceptable.
s.stats.waitingTasks.Inc()

// Acquire an execution slot in keeping with heartbeat.scheduler.limit
// this should block until resources are available.
// In the case where the semaphore has free resources immediately
// it will not block and will not check the cancelled status of the
// context, which is OK, because we check it later anyway.
limitErr := s.limitSem.Acquire(jobCtx, 1)
s.stats.waitingTasks.Dec()
if limitErr == nil {
defer s.limitSem.Release(1)
}

// Record the time this task started now that we have a resource to execute with
startedAt = time.Now()

// Check if the scheduler has been shut down. If so, exit early
select {
case <-jobCtx.Done():
return startedAt
default:
s.stats.activeTasks.Inc()

continuations := task(jobCtx)
s.stats.activeTasks.Dec()

wg.Add(len(continuations))
for _, cont := range continuations {
// Run continuations in parallel, note that these each will acquire their own slots
// We can discard the started at times for continuations as those are
// irrelevant
go s.runRecursiveTask(jobCtx, cont, wg, jobSem)
}
if jobSem != nil && len(continuations) == 0 {
jobSem.Release(1)
}
}

return startedAt
}