diff --git a/lib/service/service.go b/lib/service/service.go index 4d70c019c7718..ec91870cd18e4 100644 --- a/lib/service/service.go +++ b/lib/service/service.go @@ -3424,30 +3424,7 @@ func (l promHTTPLogAdapter) Println(v ...interface{}) { // prometheus consumption func (process *TeleportProcess) initMetricsService() error { mux := http.NewServeMux() - - // We gather metrics both from the in-process registry (preferred metrics registration method) - // and the global registry (used by some Teleport services and many dependencies). - gatherers := prometheus.Gatherers{ - process.metricsRegistry, - prometheus.DefaultGatherer, - } - - metricsHandler := promhttp.InstrumentMetricHandler( - process.metricsRegistry, promhttp.HandlerFor(gatherers, promhttp.HandlerOpts{ - // Errors can happen if metrics are registered with identical names in both the local and the global registry. - // In this case, we log the error but continue collecting metrics. The first collected metric will win - // (the one from the local metrics registry takes precedence). - // As we move more things to the local registry, especially in other tools like tbot, we will have less - // conflicts in tests. - ErrorHandling: promhttp.ContinueOnError, - ErrorLog: promHTTPLogAdapter{ - ctx: process.ExitContext(), - Logger: process.logger.With(teleport.ComponentKey, teleport.ComponentMetrics), - }, - }), - ) - - mux.Handle("/metrics", metricsHandler) + mux.Handle("/metrics", process.newMetricsHandler()) logger := process.logger.With(teleport.ComponentKey, teleport.Component(teleport.ComponentMetrics, process.id)) @@ -3532,6 +3509,33 @@ func (process *TeleportProcess) initMetricsService() error { return nil } +// newMetricsHandler creates a new metrics handler serving metrics both from the global prometheus registry and the +// in-process one. +func (process *TeleportProcess) newMetricsHandler() http.Handler { + // We gather metrics both from the in-process registry (preferred metrics registration method) + // and the global registry (used by some Teleport services and many dependencies). + gatherers := prometheus.Gatherers{ + process.metricsRegistry, + prometheus.DefaultGatherer, + } + + metricsHandler := promhttp.InstrumentMetricHandler( + process.metricsRegistry, promhttp.HandlerFor(gatherers, promhttp.HandlerOpts{ + // Errors can happen if metrics are registered with identical names in both the local and the global registry. + // In this case, we log the error but continue collecting metrics. The first collected metric will win + // (the one from the local metrics registry takes precedence). + // As we move more things to the local registry, especially in other tools like tbot, we will have less + // conflicts in tests. + ErrorHandling: promhttp.ContinueOnError, + ErrorLog: promHTTPLogAdapter{ + ctx: process.ExitContext(), + Logger: process.logger.With(teleport.ComponentKey, teleport.ComponentMetrics), + }, + }), + ) + return metricsHandler +} + // initDiagnosticService starts diagnostic service currently serving healthz // and prometheus endpoints func (process *TeleportProcess) initDiagnosticService() error { @@ -3541,7 +3545,7 @@ func (process *TeleportProcess) initDiagnosticService() error { // metrics will otherwise be served by the metrics service if it's enabled // in the config. if !process.Config.Metrics.Enabled { - mux.Handle("/metrics", promhttp.Handler()) + mux.Handle("/metrics", process.newMetricsHandler()) } if process.Config.Debug { diff --git a/lib/service/service_test.go b/lib/service/service_test.go index 4c08a87689145..00d8890c0d0dd 100644 --- a/lib/service/service_test.go +++ b/lib/service/service_test.go @@ -1953,6 +1953,10 @@ func TestAgentRolloutController(t *testing.T) { }, 5*time.Second, 10*time.Millisecond) } +// TestMetricsService tests that the optional metrics service exposes +// metrics from both the in-process and global metrics registry. When the +// service is disabled, metrics are served by the diagnostics service +// (tested in TestMetricsInDiagnosticsService). func TestMetricsService(t *testing.T) { t.Parallel() // Test setup: create a listener for the metrics server, get its file descriptor. @@ -2031,6 +2035,70 @@ func TestMetricsService(t *testing.T) { require.Contains(t, string(body), "global_metric_"+nonce) } +// TestMetricsInDiagnosticsService tests that the diagnostics service exposes +// metrics from both the in-process and global metrics registry when the metrics +// service is disabled. +func TestMetricsInDiagnosticsService(t *testing.T) { + t.Parallel() + // Test setup: create a new teleport process + dataDir := makeTempDir(t) + cfg := servicecfg.MakeDefaultConfig() + cfg.DataDir = dataDir + cfg.SetAuthServerAddress(utils.NetAddr{AddrNetwork: "tcp", Addr: "127.0.0.1:0"}) + cfg.Auth.Enabled = true + cfg.Proxy.Enabled = false + cfg.SSH.Enabled = false + cfg.DebugService.Enabled = false + cfg.Auth.StorageConfig.Params["path"] = dataDir + cfg.Auth.ListenAddr = utils.NetAddr{AddrNetwork: "tcp", Addr: "127.0.0.1:0"} + cfg.DiagnosticAddr = utils.NetAddr{AddrNetwork: "tcp", Addr: "127.0.0.1:0"} + + // Test setup: Create and start the Teleport service. + process, err := NewTeleport(cfg) + require.NoError(t, err) + require.NoError(t, process.Start()) + t.Cleanup(func() { + assert.NoError(t, process.Close()) + assert.NoError(t, process.Wait()) + }) + + // Test setup: create our test metrics. + nonce := strings.ReplaceAll(uuid.NewString(), "-", "") + localMetric := prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: "test", + Name: "local_metric_" + nonce, + }) + globalMetric := prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: "test", + Name: "global_metric_" + nonce, + }) + require.NoError(t, process.metricsRegistry.Register(localMetric)) + require.NoError(t, prometheus.Register(globalMetric)) + + ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second) + t.Cleanup(cancel) + _, err = process.WaitForEvent(ctx, TeleportReadyEvent) + require.NoError(t, err) + + // Test execution: query the metrics endpoint and check the tests metrics are here. + diagAddr, err := process.DiagnosticAddr() + require.NoError(t, err) + metricsURL, err := url.Parse("http://" + diagAddr.String()) + require.NoError(t, err) + metricsURL.Path = "/metrics" + resp, err := http.Get(metricsURL.String()) + require.NoError(t, err) + require.Equal(t, http.StatusOK, resp.StatusCode) + + body, err := io.ReadAll(resp.Body) + require.NoError(t, err) + require.NoError(t, resp.Body.Close()) + + // Test validation: check that the metrics server served both the local and global registry. + require.Contains(t, string(body), "local_metric_"+nonce) + require.Contains(t, string(body), "global_metric_"+nonce) +} + // makeTempDir makes a temp dir with a shorter name than t.TempDir() in order to // avoid https://github.com/golang/go/issues/62614. func makeTempDir(t *testing.T) string {