diff --git a/Makefile b/Makefile
index 2fcb02b06d..572f996bd0 100644
--- a/Makefile
+++ b/Makefile
@@ -131,7 +131,7 @@ endif
 GCM_SECRET?=''
 .PHONY: test-export-gcm
 test-export-gcm: ## Run export unit tests that will use GCM if GCM_SECRET is present.
-## TODO(bwplotka): Move to cloud build.
+## TODO(b/306337101): Move to cloud build.
 ifneq ($(GCM_SECRET), '')
 	TEST_TAG=false go test -v ./pkg/export/gcm
 else
diff --git a/pkg/export/gcm/export_gcm_test.go b/pkg/export/gcm/export_gcm_test.go
index f86cd5c5ec..49a1cc25a6 100644
--- a/pkg/export/gcm/export_gcm_test.go
+++ b/pkg/export/gcm/export_gcm_test.go
@@ -34,6 +34,7 @@ func TestExport_CounterReset(t *testing.T) {
 		scrape(interval).
 			Expect(200, c, prom)
 		// Nothing is expected for GMP due to cannibalization.
+		// See https://cloud.google.com/stackdriver/docs/managed-prometheus/troubleshooting#counter-sums
 		// TODO(bwplotka): Fix with b/259261536.
 
 		c.Add(10)
@@ -41,50 +42,50 @@ func TestExport_CounterReset(t *testing.T) {
 			Expect(10, c, export).
 			Expect(210, c, prom)
 
-		c.Add(50)
+		c.Add(40)
 		scrape(interval).
-			Expect(60, c, export).
-			Expect(260, c, prom)
+			Expect(50, c, export).
+			Expect(250, c, prom)
 
-		// Reset to 0, then add something that still should indicate "decreased counter"
-		// in absolute values, but our current cannibalization logic is wrong here.
-		// It's perhaps unusual that reset happens without scrape target restart
-		// so, it was easy to forget about this case.
+		// Reset to 0 (simulating instrumentation resetting the metric or the target restarting).
 		counter.Reset()
 		c = counter.WithLabelValues("bar")
-		c.Add(250)
 		scrape(interval).
-			// TODO(bwplotka): Very odd, export logic is fine, added b/305901765
-			Expect(310, c, export).
-			Expect(250, c, prom)
+			// NOTE(bwplotka): This and the following discrepancies are expected due to
+			// the GCM PromQL layer using MQL with delta alignment. What we get as a raw
+			// counter is already reset-normalized (b/305901765), plus cannibalization.
+			Expect(50, c, export).
+			Expect(0, c, prom)
 
-		c.Add(50)
+		c.Add(150)
 		scrape(interval).
-			Expect(360, c, export).
-			Expect(300, c, prom)
+			Expect(200, c, export).
+			Expect(150, c, prom)
 
-		// Reset to 0 again, our export does not detect it again.
+		// Reset to 0, followed by an addition.
 		counter.Reset()
 		c = counter.WithLabelValues("bar")
-		c.Add(100)
+		c.Add(20)
 		scrape(interval).
-			// TODO(bwplotka): Very odd, export logic is fine, added b/305901765
-			Expect(460, c, export).
-			Expect(100, c, prom)
+			Expect(220, c, export).
+			Expect(20, c, prom)
 
 		c.Add(50)
 		scrape(interval).
-			Expect(510, c, export).
-			Expect(150, c, prom)
+			Expect(270, c, export).
+			Expect(70, c, prom)
+
+		c.Add(10)
+		scrape(interval).
+			Expect(280, c, export).
+			Expect(80, c, prom)
 
 		// Tricky reset case, unnoticeable reset for Prometheus without created timestamp as well.
 		counter.Reset()
 		c = counter.WithLabelValues("bar")
 		c.Add(600)
 		scrape(interval).
-			// TODO(bwplotka): Even more odd, I would expect wrong 1110 value, not 960,
-			// part of b/305901765
-			Expect(960, c, export). // Also something is off, why 960, not 1110?
+			Expect(800, c, export).
 			Expect(600, c, prom)
 	})
diff --git a/pkg/export/gcm/promtest/prometheus.go b/pkg/export/gcm/promtest/prometheus.go
index 972cd97442..236785ea18 100644
--- a/pkg/export/gcm/promtest/prometheus.go
+++ b/pkg/export/gcm/promtest/prometheus.go
@@ -36,7 +36,7 @@ func Prometheus(image string) Backend {
 func (p *promBackend) Ref() string { return "prometheus" }
 
 // recordGatherer is a prometheus.Gatherer capable to "play" the recorded metric state
-// with fixed timestamp to the backfill data into Prometheus compatible system.
+// with fixed timestamps to backfill data into a Prometheus-compatible system.
 type recordGatherer struct {
 	i              int
 	plannedScrapes [][]*dto.MetricFamily
@@ -96,10 +96,6 @@ func (p *promBackend) start(t testing.TB, env e2e.Environment) (v1.API, map[stri
 }
 
 func newPrometheus(env e2e.Environment, name string, image string, scrapeTargetAddress string, flagOverride map[string]string) *e2emon.Prometheus {
-
-	if image == "" {
-		image = "quay.io/prometheus/prometheus:v2.47.2"
-	}
 	ports := map[string]int{"http": 9090}
 
 	f := env.Runnable(name).WithPorts(ports).Future()
diff --git a/pkg/export/gcm/promtest/promtest.go b/pkg/export/gcm/promtest/promtest.go
index 932975b54b..36cfa7f8c7 100644
--- a/pkg/export/gcm/promtest/promtest.go
+++ b/pkg/export/gcm/promtest/promtest.go
@@ -79,10 +79,10 @@ func NewIngestionTest(t testing.TB, backends []Backend) *ingestionTest {
 	it := &ingestionTest{
 		t: t,
 
-		// Use ULID as a unique label per test run.
+		// NOTE(bwplotka): Cardinality is an obvious concern, but even for 100 test runs a day with
 		// 100 cases, 10 samples each, that's "only" 10k series / 0.1 million samples a day.
-		testID: ulid.MustNew(ulid.Now(), rand.New(rand.NewSource(time.Now().UnixNano()))).String(),
+		testID: fmt.Sprintf("%v: %v", t.Name(), ulid.MustNew(ulid.Now(), rand.New(rand.NewSource(time.Now().UnixNano()))).String()),
 
 		// 1h in the past as Monarch allows 24h, but Prometheus allows 2h (plus some buffer).
 		// It should give buffer for a fair amount of samples from -1h to now.
@@ -174,7 +174,7 @@ func (ir *ingestionTestExpRecorder) Expect(val float64, metric prometheus.Metric
 	metric.Write(&m)
 	if m.GetSummary() != nil || m.GetHistogram() != nil {
 		// TODO(bwplotka): Implement an alternative.
-		ir.it.t.Fatal("It's not practical to use equalsGCMPromQuery against histograms and summaries.")
+		ir.it.t.Fatal("It's not practical to use Expect against histograms and summaries.")
 	}
 
 	modelMetric := toModelMetric(metric)
@@ -232,7 +232,7 @@ func (it *ingestionTest) FatalOnUnexpectedPromQLResults(b Backend, metric promet
 
 	if m.GetSummary() != nil || m.GetHistogram() != nil {
 		// TODO(bwplotka): Implement alternative.
-		it.t.Fatal("It's not practical to use equalsGCMPromQuery against histograms and summaries.")
+		it.t.Fatal("It's not practical to use FatalOnUnexpectedPromQLResults against histograms and summaries.")
 	}
 
 	bMeta, ok := it.backends[b.Ref()]
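
Reviewer note on the new export-side Expect values: they follow from reset normalization of the raw scraped counter, with the first scraped sample consumed only as a baseline (the "cannibalization" mentioned in the test comments). The sketch below is not code from this change; it is a minimal, hypothetical Go illustration (the cumulativeWithResets helper does not exist in the repo) that reproduces the expected export values 10, 50, 50, 200, 220, 270, 280, 800 from the raw values the Prometheus backend sees in TestExport_CounterReset.

package main

import "fmt"

// cumulativeWithResets models reset normalization of a scraped counter:
// the first sample only sets the baseline (mirroring the "cannibalized"
// first point), later samples contribute their delta, and a decrease is
// treated as a counter reset, so the whole new value counts as an increase.
// It also shows the limitation exercised by the last test case: a reset
// that jumps from 80 straight to 600 looks like a normal increase and
// cannot be detected without a created timestamp.
func cumulativeWithResets(samples []float64) float64 {
	if len(samples) == 0 {
		return 0
	}
	var cumulative float64
	prev := samples[0] // Baseline only; never exported.
	for _, v := range samples[1:] {
		if v < prev {
			cumulative += v // Reset detected: count the full new value.
		} else {
			cumulative += v - prev
		}
		prev = v
	}
	return cumulative
}

func main() {
	// Raw values the Prometheus backend sees at each scrape in the test.
	raw := []float64{200, 210, 250, 0, 150, 20, 70, 80, 600}
	for i := 2; i <= len(raw); i++ {
		fmt.Println(cumulativeWithResets(raw[:i])) // 10, 50, 50, 200, 220, 270, 280, 800
	}
}

Note the undetectable-reset case at the end: the raw series drops from 80 to 0 and then reaches 600 before the next scrape, so the delta 600-80=520 is added and the export lands at 800 rather than the "true" 280+600.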