From 842a79123338c75c65efa316b0cdcacfa24b05ec Mon Sep 17 00:00:00 2001 From: Ridwan Sharif Date: Thu, 19 Oct 2023 22:05:33 +0000 Subject: [PATCH] scrape: make the initial scrape offset configurable This change changes the `NoJitter` boolean configuration to a configurable `InitialScrapeOffset` option. This will let serverless setups configure more freely how they want to scrape workloads. This was needed because in some deployments in serverless environments the scraper (colocated with the workload) might not want to wait an entire scrape interval (since the target could be short-lived) but also might not want to scrape immediately when the target is not ready. Making this configurable lets the scrape controller (OpenTelemetry Contrib's prometheusreceiver) choose what is appropriate for it depending on its environment. Signed-off-by: Ridwan Sharif --- scrape/manager.go | 16 +++++-- scrape/manager_test.go | 102 +++++++++++++++++++++++++---------------- scrape/scrape.go | 4 +- 3 files changed, 75 insertions(+), 47 deletions(-) diff --git a/scrape/manager.go b/scrape/manager.go index ad4a47305a..c2b434308e 100644 --- a/scrape/manager.go +++ b/scrape/manager.go @@ -141,12 +141,18 @@ type Options struct { // Optional HTTP client options to use when scraping. HTTPClientOptions []config_util.HTTPClientOption - // IgnoreJitter causes all targets managed by this manager to be scraped - // as soon as they are discovered. By default, all targets have offset, - // so we spread the scraping load evenly within Prometheus server. + // InitialScrapeOffset controls how long after startup we should scrape all + // targets. By default, all targets have an offset so we spread the + // scraping load evenly within the Prometheus server. Configuring this will + // make it so all targets have the same configured offset, which may be + // undesirable as load is no longer evenly spread. 
This is useful however + // in serverless deployments where we're sensitive to the initial offsets + // and would like them to be small and configurable. + // // NOTE(bwplotka): This option is experimental and not used by Prometheus. - // It was created for serverless flavors of OpenTelemetry contrib's prometheusreceiver. - IgnoreJitter bool + // It was created for serverless flavors of OpenTelemetry contrib's + // prometheusreceiver. + InitialScrapeOffset *time.Duration } // Manager maintains a set of scrape pools and manages start/stop cycles diff --git a/scrape/manager_test.go b/scrape/manager_test.go index c4ce8e8d29..e49d5b4833 100644 --- a/scrape/manager_test.go +++ b/scrape/manager_test.go @@ -15,7 +15,6 @@ package scrape import ( "context" - "errors" "net/http" "net/http/httptest" "net/url" @@ -34,7 +33,6 @@ import ( "github.com/prometheus/prometheus/discovery/targetgroup" "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/model/relabel" - "github.com/prometheus/prometheus/util/runutil" ) func TestPopulateLabels(t *testing.T) { @@ -711,47 +709,88 @@ } func TestManagerStopAfterScrapeAttempt(t *testing.T) { + noOffset := 0 * time.Nanosecond + largeOffset := 99 * time.Hour + oneSecondOffset := 1 * time.Second + tenSecondOffset := 10 * time.Second for _, tcase := range []struct { - name string - noJitter bool - stop func(m *Manager) + name string + // initialScrapeOffset defines how long to wait before scraping all targets. + initialScrapeOffset *time.Duration + // stopDelay defines how long the scrape loop should run before the stopFunc is run. + stopDelay time.Duration + // stopFunc controls how the manager should be stopped. 
+ stopFunc func(m *Manager) expectedSamples int }{ { - name: "no scrape stop, no jitter", - noJitter: true, - stop: func(m *Manager) { m.Stop() }, - expectedSamples: 1, + name: "no scrape on stop, no jitter", + initialScrapeOffset: &noOffset, + stopDelay: 5 * time.Second, + stopFunc: func(m *Manager) { m.Stop() }, + expectedSamples: 1, }, { name: "no scrape on stop, with jitter", - stop: func(m *Manager) { m.Stop() }, + stopDelay: 5 * time.Second, + stopFunc: func(m *Manager) { m.Stop() }, expectedSamples: 0, }, { - name: "scrape on stop, no jitter", - noJitter: true, - stop: func(m *Manager) { m.StopAfterScrapeAttempt(time.Now()) }, - expectedSamples: 2, + name: "scrape on stop, no jitter", + initialScrapeOffset: &noOffset, + stopDelay: 5 * time.Second, + stopFunc: func(m *Manager) { m.StopAfterScrapeAttempt(time.Now()) }, + expectedSamples: 2, }, { - name: "scrape on stop, but initial sample is fresh enough, no jitter", - noJitter: true, - stop: func(m *Manager) { m.StopAfterScrapeAttempt(time.Now().Add(-1 * time.Hour)) }, - expectedSamples: 1, + name: "scrape on stop, but initial sample is fresh enough, no jitter", + initialScrapeOffset: &noOffset, + stopDelay: 5 * time.Second, + stopFunc: func(m *Manager) { m.StopAfterScrapeAttempt(time.Now().Add(-1 * time.Hour)) }, + expectedSamples: 1, }, { name: "scrape on stop, with jitter", - stop: func(m *Manager) { m.StopAfterScrapeAttempt(time.Now()) }, + stopDelay: 5 * time.Second, + stopFunc: func(m *Manager) { m.StopAfterScrapeAttempt(time.Now()) }, expectedSamples: 1, }, + { + name: "scrape on stop, with large offset", + initialScrapeOffset: &largeOffset, + stopDelay: 5 * time.Second, + stopFunc: func(m *Manager) { m.StopAfterScrapeAttempt(time.Now()) }, + expectedSamples: 1, + }, + { + name: "scrape on stop after 5s, with offset of 1s", + initialScrapeOffset: &oneSecondOffset, + stopDelay: 5 * time.Second, + stopFunc: func(m *Manager) { m.StopAfterScrapeAttempt(time.Now()) }, + expectedSamples: 2, + }, + { + name: 
"scrape on stop after 5s, with offset of 10s", + initialScrapeOffset: &tenSecondOffset, + stopDelay: 5 * time.Second, + stopFunc: func(m *Manager) { m.StopAfterScrapeAttempt(time.Now()) }, + expectedSamples: 1, + }, + { + name: "no scrape on stop, with offset of 10s", + initialScrapeOffset: &tenSecondOffset, + stopDelay: 5 * time.Second, + stopFunc: func(m *Manager) { m.Stop() }, + expectedSamples: 0, + }, } { t.Run(tcase.name, func(t *testing.T) { app := &collectResultAppender{} // Setup scrape manager. scrapeManager := NewManager(&Options{ - IgnoreJitter: tcase.noJitter, + InitialScrapeOffset: tcase.initialScrapeOffset, // Extremely high value to turn it off. We don't want to wait minimum 5s, so // we reload manually. @@ -795,27 +834,10 @@ func TestManagerStopAfterScrapeAttempt(t *testing.T) { }) scrapeManager.reload() - // At this point the first sample is scheduled to be scraped after the initial - // jitter in the background scrape loop go-routine - // - // With jitter the first sample will appear after long time, - // given the extremely long scrape interval configured. We stop right - // away and expect only the last sample due to stop. - // - // With no jitter setting, we expect the first to be added straight away--wait - // for it, before stopping. - if tcase.noJitter { - ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - defer cancel() - require.NoError(t, runutil.Retry(100*time.Millisecond, ctx.Done(), func() error { - if countFloatSamples(app, "expected_metric") < 1 { - return errors.New("expected more then one expected_metric sample") - } - return nil - }), "after 5 seconds") - } + // Wait for the defined stop delay, before stopping. 
+ time.Sleep(tcase.stopDelay) - tcase.stop(scrapeManager) + tcase.stopFunc(scrapeManager) require.Equal(t, tcase.expectedSamples, countFloatSamples(app, "expected_metric")) }) diff --git a/scrape/scrape.go b/scrape/scrape.go index ff2120e1eb..e97bec4851 100644 --- a/scrape/scrape.go +++ b/scrape/scrape.go @@ -1239,8 +1239,8 @@ func (sl *scrapeLoop) run(errc chan<- error) { defer close(sl.stopAfterScrapeAttemptCh) jitterDelayTime := sl.scraper.offset(sl.interval, sl.offsetSeed) - if sl.opts.IgnoreJitter { - jitterDelayTime = 0 * time.Second + if sl.opts.InitialScrapeOffset != nil { + jitterDelayTime = *sl.opts.InitialScrapeOffset } select {