Skip to content

Commit

Permalink
Serve metrics from the local registry in the diagnostic service (#51031)
Browse files Browse the repository at this point in the history
* Use local metrics registry in the diagnostic service

* Test metrics are served by the diag service
  • Loading branch information
hugoShaka authored Jan 15, 2025
1 parent a1467b1 commit 9286ccd
Show file tree
Hide file tree
Showing 2 changed files with 97 additions and 25 deletions.
54 changes: 29 additions & 25 deletions lib/service/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -3424,30 +3424,7 @@ func (l promHTTPLogAdapter) Println(v ...interface{}) {
// prometheus consumption
func (process *TeleportProcess) initMetricsService() error {
mux := http.NewServeMux()

// We gather metrics both from the in-process registry (preferred metrics registration method)
// and the global registry (used by some Teleport services and many dependencies).
gatherers := prometheus.Gatherers{
process.metricsRegistry,
prometheus.DefaultGatherer,
}

metricsHandler := promhttp.InstrumentMetricHandler(
process.metricsRegistry, promhttp.HandlerFor(gatherers, promhttp.HandlerOpts{
// Errors can happen if metrics are registered with identical names in both the local and the global registry.
// In this case, we log the error but continue collecting metrics. The first collected metric will win
// (the one from the local metrics registry takes precedence).
// As we move more things to the local registry, especially in other tools like tbot, we will have less
// conflicts in tests.
ErrorHandling: promhttp.ContinueOnError,
ErrorLog: promHTTPLogAdapter{
ctx: process.ExitContext(),
Logger: process.logger.With(teleport.ComponentKey, teleport.ComponentMetrics),
},
}),
)

mux.Handle("/metrics", metricsHandler)
mux.Handle("/metrics", process.newMetricsHandler())

logger := process.logger.With(teleport.ComponentKey, teleport.Component(teleport.ComponentMetrics, process.id))

Expand Down Expand Up @@ -3532,6 +3509,33 @@ func (process *TeleportProcess) initMetricsService() error {
return nil
}

// newMetricsHandler creates a new metrics handler serving metrics both from the global prometheus registry and the
// in-process one.
func (process *TeleportProcess) newMetricsHandler() http.Handler {
// We gather metrics both from the in-process registry (preferred metrics registration method)
// and the global registry (used by some Teleport services and many dependencies).
gatherers := prometheus.Gatherers{
process.metricsRegistry,
prometheus.DefaultGatherer,
}

metricsHandler := promhttp.InstrumentMetricHandler(
process.metricsRegistry, promhttp.HandlerFor(gatherers, promhttp.HandlerOpts{
// Errors can happen if metrics are registered with identical names in both the local and the global registry.
// In this case, we log the error but continue collecting metrics. The first collected metric will win
// (the one from the local metrics registry takes precedence).
// As we move more things to the local registry, especially in other tools like tbot, we will have less
// conflicts in tests.
ErrorHandling: promhttp.ContinueOnError,
ErrorLog: promHTTPLogAdapter{
ctx: process.ExitContext(),
Logger: process.logger.With(teleport.ComponentKey, teleport.ComponentMetrics),
},
}),
)
return metricsHandler
}

// initDiagnosticService starts diagnostic service currently serving healthz
// and prometheus endpoints
func (process *TeleportProcess) initDiagnosticService() error {
Expand All @@ -3541,7 +3545,7 @@ func (process *TeleportProcess) initDiagnosticService() error {
// metrics will otherwise be served by the metrics service if it's enabled
// in the config.
if !process.Config.Metrics.Enabled {
mux.Handle("/metrics", promhttp.Handler())
mux.Handle("/metrics", process.newMetricsHandler())
}

if process.Config.Debug {
Expand Down
68 changes: 68 additions & 0 deletions lib/service/service_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1953,6 +1953,10 @@ func TestAgentRolloutController(t *testing.T) {
}, 5*time.Second, 10*time.Millisecond)
}

// TestMetricsService tests that the optional metrics service exposes
// metrics from both the in-process and global metrics registry. When the
// service is disabled, metrics are served by the diagnostics service
// (tested in TestMetricsInDiagnosticsService).
func TestMetricsService(t *testing.T) {
t.Parallel()
// Test setup: create a listener for the metrics server, get its file descriptor.
Expand Down Expand Up @@ -2031,6 +2035,70 @@ func TestMetricsService(t *testing.T) {
require.Contains(t, string(body), "global_metric_"+nonce)
}

// TestMetricsInDiagnosticsService tests that the diagnostics service exposes
// metrics from both the in-process and global metrics registry when the metrics
// service is disabled.
func TestMetricsInDiagnosticsService(t *testing.T) {
t.Parallel()
// Test setup: create a new teleport process
dataDir := makeTempDir(t)
cfg := servicecfg.MakeDefaultConfig()
cfg.DataDir = dataDir
cfg.SetAuthServerAddress(utils.NetAddr{AddrNetwork: "tcp", Addr: "127.0.0.1:0"})
cfg.Auth.Enabled = true
cfg.Proxy.Enabled = false
cfg.SSH.Enabled = false
cfg.DebugService.Enabled = false
cfg.Auth.StorageConfig.Params["path"] = dataDir
cfg.Auth.ListenAddr = utils.NetAddr{AddrNetwork: "tcp", Addr: "127.0.0.1:0"}
cfg.DiagnosticAddr = utils.NetAddr{AddrNetwork: "tcp", Addr: "127.0.0.1:0"}

// Test setup: Create and start the Teleport service.
process, err := NewTeleport(cfg)
require.NoError(t, err)
require.NoError(t, process.Start())
t.Cleanup(func() {
assert.NoError(t, process.Close())
assert.NoError(t, process.Wait())
})

// Test setup: create our test metrics.
nonce := strings.ReplaceAll(uuid.NewString(), "-", "")
localMetric := prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: "test",
Name: "local_metric_" + nonce,
})
globalMetric := prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: "test",
Name: "global_metric_" + nonce,
})
require.NoError(t, process.metricsRegistry.Register(localMetric))
require.NoError(t, prometheus.Register(globalMetric))

ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second)
t.Cleanup(cancel)
_, err = process.WaitForEvent(ctx, TeleportReadyEvent)
require.NoError(t, err)

// Test execution: query the metrics endpoint and check the tests metrics are here.
diagAddr, err := process.DiagnosticAddr()
require.NoError(t, err)
metricsURL, err := url.Parse("http://" + diagAddr.String())
require.NoError(t, err)
metricsURL.Path = "/metrics"
resp, err := http.Get(metricsURL.String())
require.NoError(t, err)
require.Equal(t, http.StatusOK, resp.StatusCode)

body, err := io.ReadAll(resp.Body)
require.NoError(t, err)
require.NoError(t, resp.Body.Close())

// Test validation: check that the metrics server served both the local and global registry.
require.Contains(t, string(body), "local_metric_"+nonce)
require.Contains(t, string(body), "global_metric_"+nonce)
}

// makeTempDir makes a temp dir with a shorter name than t.TempDir() in order to
// avoid https://github.com/golang/go/issues/62614.
func makeTempDir(t *testing.T) string {
Expand Down

0 comments on commit 9286ccd

Please sign in to comment.