diff --git a/.changelog/24166.txt b/.changelog/24166.txt new file mode 100644 index 00000000000..ebac6cfe36c --- /dev/null +++ b/.changelog/24166.txt @@ -0,0 +1,7 @@ +```release-note:bug +consul: Fixed a bug where broken Consul ACL tokens could block registration and deregistration of services and checks +``` + +```release-note:bug +consul: Fixed a bug where service deregistration could fail because Consul ACL tokens were revoked during allocation GC +``` diff --git a/command/agent/consul/service_client.go b/command/agent/consul/service_client.go index 973dd3649ef..29904608e05 100644 --- a/command/agent/consul/service_client.go +++ b/command/agent/consul/service_client.go @@ -22,6 +22,7 @@ import ( "github.com/armon/go-metrics" "github.com/hashicorp/consul/api" "github.com/hashicorp/go-hclog" + "github.com/hashicorp/go-multierror" "github.com/hashicorp/go-set/v3" "github.com/hashicorp/nomad/client/serviceregistration" "github.com/hashicorp/nomad/helper" @@ -973,8 +974,7 @@ func (c *ServiceClient) merge(ops *operations) { func (c *ServiceClient) sync(reason syncReason) error { c.logger.Trace("execute sync", "reason", reason) - sreg, creg, sdereg, cdereg := 0, 0, 0, 0 - var err error + sreg, creg, sdereg, cdereg, fails := 0, 0, 0, 0, 0 // Get the list of all namespaces created so we can iterate them. namespaces, err := c.namespacesClient.List() @@ -1001,8 +1001,10 @@ func (c *ServiceClient) sync(reason syncReason) error { // de-registering services. inProbation := time.Now().Before(c.deregisterProbationExpiry) + var mErr *multierror.Error // collect errors for individual services/checks + // Remove Nomad services in Consul but unknown to Nomad. - for id := range servicesInConsul { + for id, service := range servicesInConsul { if _, ok := c.services[id]; ok { // Known service, skip continue @@ -1027,38 +1029,16 @@ func (c *ServiceClient) sync(reason syncReason) error { continue } - // Get the Consul namespace this service is in. - ns := servicesInConsul[id].Namespace - - token := c.getServiceToken(id) - - // If this service has a sidecar, we need to remove the sidecar first, - // otherwise Consul will produce a warning and an error when removing - // the parent service. - // - // The sidecar is not tracked on the Nomad side; it was registered - // implicitly through the parent service. - if sidecar := getNomadSidecar(id, servicesInConsul); sidecar != nil { - if err := c.agentAPI.ServiceDeregisterOpts(sidecar.ID, - &api.QueryOptions{Namespace: ns, Token: token}, - ); err != nil { - metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) - return err - } - } - - // Remove the unwanted service. - if err := c.agentAPI.ServiceDeregisterOpts(id, - &api.QueryOptions{Namespace: ns, Token: token}, - ); err != nil { - if isOldNomadService(id) { - // Don't hard-fail on old entries. See #3620 - continue - } - + // Remove the service and any sidecar; this will return an error early + // if removing the sidecar fails + err := c.syncRemoveService(service.Namespace, id, servicesInConsul) + if err != nil { metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) - return err + mErr = multierror.Append(mErr, err) + fails++ + continue } + sdereg++ metrics.IncrCounter([]string{"client", "consul", "service_deregistrations"}, 1) } @@ -1078,7 +1058,9 @@ func (c *ServiceClient) sync(reason syncReason) error { Token: token, }); err != nil { metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) - return err + mErr = multierror.Append(mErr, err) + fails++ + continue } sreg++ metrics.IncrCounter([]string{"client", "consul", "service_registrations"}, 1) @@ -1092,7 +1074,13 @@ func (c *ServiceClient) sync(reason syncReason) error { nsChecks, err := c.agentAPI.ChecksWithFilterOpts("", &api.QueryOptions{Namespace: normalizeNamespace(namespace)}) if err != nil { metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) - return fmt.Errorf("failed to query Consul checks: %w", err) + err = fmt.Errorf("failed to query Consul checks: %w", err) + if mErr.Len() == 0 { + return err + } else { + mErr = multierror.Append(mErr, err) + return mErr.ErrorOrNil() + } } for k, v := range nsChecks { checksInConsul[k] = v @@ -1128,14 +1116,12 @@ func (c *ServiceClient) sync(reason syncReason) error { // Unknown Nomad managed check; remove. Note: this query has to use the // Nomad agent's own Consul token, because by definition we don't have // an associated workload for it - if err := c.agentAPI.CheckDeregisterOpts(id, &api.QueryOptions{Namespace: check.Namespace}); err != nil { - if isOldNomadService(check.ServiceID) { - // Don't hard-fail on old entries. - continue - } - + err := c.agentAPI.CheckDeregisterOpts(id, &api.QueryOptions{Namespace: check.Namespace}) + if err != nil { metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) - return err + mErr = multierror.Append(mErr, err) + fails++ + continue } cdereg++ metrics.IncrCounter([]string{"client", "consul", "check_deregistrations"}, 1) @@ -1152,16 +1138,39 @@ func (c *ServiceClient) sync(reason syncReason) error { } if err := c.agentAPI.CheckRegisterOpts(check, opts); err != nil { metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) - return err + mErr = multierror.Append(mErr, err) + fails++ + continue } creg++ metrics.IncrCounter([]string{"client", "consul", "check_registrations"}, 1) } // Only log if something was actually synced - if sreg > 0 || sdereg > 0 || creg > 0 || cdereg > 0 { + if sreg > 0 || sdereg > 0 || creg > 0 || cdereg > 0 || fails > 0 { c.logger.Debug("sync complete", "registered_services", sreg, "deregistered_services", sdereg, - "registered_checks", creg, "deregistered_checks", cdereg) + "registered_checks", creg, "deregistered_checks", cdereg, "failures", fails) + } + return mErr.ErrorOrNil() +} + +// syncRemoveService removes an unwanted service from Consul. If the service has +// a sidecar, we need to remove the sidecar first, otherwise Consul will produce +// a warning and an error when removing the parent service. So this returns +// early if the sidecar can't be removed. +func (c *ServiceClient) syncRemoveService(ns, id string, servicesInConsul map[string]*api.AgentService) error { + // The sidecar is not tracked on the Nomad side; it was registered + // implicitly through the parent service. + if sidecar := getNomadSidecar(id, servicesInConsul); sidecar != nil { + err := c.agentAPI.ServiceDeregisterOpts(sidecar.ID, &api.QueryOptions{Namespace: ns}) + if err != nil { + return err + } + } + + err := c.agentAPI.ServiceDeregisterOpts(id, &api.QueryOptions{Namespace: ns}) + if err != nil { + return err } return nil } @@ -1825,10 +1834,7 @@ func (c *ServiceClient) Shutdown() error { // Always attempt to deregister Nomad agent Consul entries, even if // deadline was reached for id := range c.agentServices.Items() { - opts := &api.QueryOptions{ - Token: c.getServiceToken(id), - } - if err := c.agentAPI.ServiceDeregisterOpts(id, opts); err != nil { + if err := c.agentAPI.ServiceDeregisterOpts(id, nil); err != nil { c.logger.Error("failed deregistering agent service", "service_id", id, "error", err) } } @@ -1865,10 +1871,8 @@ func (c *ServiceClient) Shutdown() error { if remainingChecks == nil || checkRemains(id) { check := remainingChecks[id] ns := check.Namespace - token := c.getServiceToken(check.ServiceID) - if err := c.agentAPI.CheckDeregisterOpts(id, - &api.QueryOptions{Namespace: ns, Token: token}); err != nil { + &api.QueryOptions{Namespace: ns}); err != nil { c.logger.Error("failed deregistering agent check", "check_id", id, "error", err) } } @@ -2043,7 +2047,7 @@ func isNomadAgent(id string) bool { // service (new or old formats). Agent services return false as independent // client and server agents may be running on the same machine. #2827 func isNomadService(id string) bool { - return strings.HasPrefix(id, nomadTaskPrefix) || isOldNomadService(id) + return strings.HasPrefix(id, nomadTaskPrefix) } // isNomadCheck returns true if the ID matches the pattern of a Nomad managed @@ -2052,18 +2056,6 @@ func isNomadCheck(id string) bool { return strings.HasPrefix(id, nomadCheckPrefix) } -// isOldNomadService returns true if the ID matches an old pattern managed by -// Nomad. -// -// Pre-0.7.1 task service IDs are of the form: -// -// {nomadServicePrefix}-executor-{ALLOC_ID}-{Service.Name}-{Service.Tags...} -// Example Service ID: _nomad-executor-1234-echo-http-tag1-tag2-tag3 -func isOldNomadService(id string) bool { - const prefix = nomadServicePrefix + "-executor" - return strings.HasPrefix(id, prefix) -} - const ( sidecarSuffix = "-sidecar-proxy" ) diff --git a/command/agent/consul/unit_test.go b/command/agent/consul/unit_test.go index 888a5d2235d..e2d53fce871 100644 --- a/command/agent/consul/unit_test.go +++ b/command/agent/consul/unit_test.go @@ -891,14 +891,7 @@ func TestIsNomadService(t *testing.T) { }{ {"_nomad-client-nomad-client-http", false}, {"_nomad-server-nomad-serf", false}, - - // Pre-0.7.1 style IDs still match - {"_nomad-executor-abc", true}, - {"_nomad-executor", true}, - - // Post-0.7.1 style IDs match {"_nomad-task-FBBK265QN4TMT25ND4EP42TJVMYJ3HR4", true}, - {"not-nomad", false}, {"_nomad", false}, } @@ -906,9 +899,7 @@ func TestIsNomadService(t *testing.T) { for _, test := range tests { t.Run(test.id, func(t *testing.T) { actual := isNomadService(test.id) - if actual != test.result { - t.Errorf("%q should be %t but found %t", test.id, test.result, actual) - } + must.Eq(t, test.result, actual) }) } }