diff --git a/client/client.go b/client/client.go
index 6b66bd5a31b..643cfec3cbc 100644
--- a/client/client.go
+++ b/client/client.go
@@ -2638,7 +2638,7 @@ func (c *Client) emitClientMetrics() {
 	// Emit allocation metrics
 	blocked, migrating, pending, running, terminal := 0, 0, 0, 0, 0
 	for _, ar := range c.getAllocRunners() {
-		switch ar.Alloc().ClientStatus {
+		switch ar.AllocState().ClientStatus {
 		case structs.AllocClientStatusPending:
 			switch {
 			case ar.IsWaiting():
diff --git a/command/agent/metrics_endpoint_test.go b/command/agent/metrics_endpoint_test.go
index 3d8d3a1db1f..eca67faa267 100644
--- a/command/agent/metrics_endpoint_test.go
+++ b/command/agent/metrics_endpoint_test.go
@@ -1,11 +1,16 @@
 package agent
 
 import (
 	"net/http"
 	"net/http/httptest"
+	"strings"
 	"testing"
+	"time"
 
 	metrics "github.com/armon/go-metrics"
+	"github.com/hashicorp/nomad/nomad/mock"
+	"github.com/hashicorp/nomad/nomad/structs"
 	"github.com/hashicorp/nomad/testutil"
 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
 )
@@ -56,3 +61,69 @@ func TestHTTP_Metrics(t *testing.T) {
 		})
 	})
 }
+
+// When emitting metrics, the client should use the local copy of the allocs with
+// updated task states (not the copy submitted by the server).
+func TestHTTP_FreshClientAllocMetrics(t *testing.T) {
+	t.Parallel()
+	require := require.New(t)
+
+	numTasks := 10
+
+	httpTest(t, func(c *Config) {
+		c.Telemetry.PublishAllocationMetrics = true
+		c.Telemetry.PublishNodeMetrics = true
+		c.Telemetry.BackwardsCompatibleMetrics = false
+		c.Telemetry.DisableTaggedMetrics = false
+	}, func(s *TestAgent) {
+		// Create the job and wait for it to run to completion.
+		job := mock.BatchJob()
+		job.TaskGroups[0].Count = numTasks
+		testutil.RegisterJob(t, s.RPC, job)
+		testutil.WaitForResult(func() (bool, error) {
+			args := &structs.JobSpecificRequest{}
+			args.JobID = job.ID
+			args.QueryOptions.Region = "global"
+			var resp structs.SingleJobResponse
+			err := s.RPC("Job.GetJob", args, &resp)
+			require.NoError(err)
+			return resp.Job.Status != "dead", nil
+		}, func(err error) {
+			require.Fail("timed-out waiting for job to complete")
+		})
+
+		// Wait for the metrics to converge: every alloc has finished, so we
+		// expect zero pending, zero running, and numTasks terminal allocs.
+		var pending, running, terminal float32 = -1.0, -1.0, -1.0
+		testutil.WaitForResultRetries(100, func() (bool, error) {
+			time.Sleep(100 * time.Millisecond)
+			req, err := http.NewRequest("GET", "/v1/metrics", nil)
+			require.NoError(err)
+			respW := httptest.NewRecorder()
+
+			obj, err := s.Server.MetricsRequest(respW, req)
+			require.NoError(err)
+
+			summary := obj.(metrics.MetricsSummary)
+			for _, g := range summary.Gauges {
+				if strings.Contains(g.Name, "allocations") {
+					t.Logf("%v: %v", g.Name, g.Value)
+				}
+				if strings.HasSuffix(g.Name, "client.allocations.pending") {
+					pending = g.Value
+				}
+				if strings.HasSuffix(g.Name, "client.allocations.running") {
+					running = g.Value
+				}
+				if strings.HasSuffix(g.Name, "client.allocations.terminal") {
+					terminal = g.Value
+				}
+			}
+			return pending == float32(0) && running == float32(0) &&
+				terminal == float32(numTasks), nil
+		}, func(err error) {
+			require.Fail("timed out waiting for metrics to converge",
+				"pending: %v, running: %v, terminal: %v", pending, running, terminal)
+		})
+	})
+}