From 03a6da7e3ce755763a1ca1af72379fe457588f3b Mon Sep 17 00:00:00 2001 From: ncabatoff Date: Wed, 4 Mar 2020 08:45:57 -0500 Subject: [PATCH] Reduce blocking in approle tidy by grabbing only one lock at a time. (#8418) (#8460) Revamp race test to make it aware of when tidies start/stop, and to generate dangling accessors that need cleaning up. Also run for longer but with slightly less concurrency to ensure the writes and the cleanup actually overlap. --- builtin/credential/approle/backend.go | 3 - .../credential/approle/path_tidy_user_id.go | 321 ++++++++++-------- .../approle/path_tidy_user_id_test.go | 53 ++- 3 files changed, 207 insertions(+), 170 deletions(-) diff --git a/builtin/credential/approle/backend.go b/builtin/credential/approle/backend.go index da3835802523..9612b65f7afd 100644 --- a/builtin/credential/approle/backend.go +++ b/builtin/credential/approle/backend.go @@ -3,7 +3,6 @@ package approle import ( "context" "sync" - "time" "github.com/hashicorp/vault/sdk/framework" "github.com/hashicorp/vault/sdk/helper/consts" @@ -57,8 +56,6 @@ type backend struct { // secretIDListingLock is a dedicated lock for listing SecretIDAccessors // for all the SecretIDs issued against an approle secretIDListingLock sync.RWMutex - - testTidyDelay time.Duration } func Factory(ctx context.Context, conf *logical.BackendConfig) (logical.Backend, error) { diff --git a/builtin/credential/approle/path_tidy_user_id.go b/builtin/credential/approle/path_tidy_user_id.go index 32c704587123..49a3e3c6a4dc 100644 --- a/builtin/credential/approle/path_tidy_user_id.go +++ b/builtin/credential/approle/path_tidy_user_id.go @@ -10,6 +10,7 @@ import ( "github.com/hashicorp/errwrap" "github.com/hashicorp/vault/sdk/framework" "github.com/hashicorp/vault/sdk/helper/consts" + "github.com/hashicorp/vault/sdk/helper/locksutil" "github.com/hashicorp/vault/sdk/logical" ) @@ -39,206 +40,224 @@ func (b *backend) tidySecretID(ctx context.Context, req *logical.Request) (*logi return resp, nil } - s := req.Storage + go b.tidySecretIDinternal(req.Storage) - go func() { - defer atomic.StoreUint32(b.tidySecretIDCASGuard, 0) + resp := &logical.Response{} + resp.AddWarning("Tidy operation successfully started. Any information from the operation will be printed to Vault's server logs.") + return logical.RespondWithStatusCode(resp, req, http.StatusAccepted) + +} - logger := b.Logger().Named("tidy") +type tidyHelperSecretIDAccessor struct { + secretIDAccessorStorageEntry + saltedSecretIDAccessor string +} - checkCount := 0 +func (b *backend) tidySecretIDinternal(s logical.Storage) { + defer atomic.StoreUint32(b.tidySecretIDCASGuard, 0) - defer func() { - if b.testTidyDelay > 0 { - logger.Trace("done checking entries", "num_entries", checkCount) - } - }() + logger := b.Logger().Named("tidy") + + checkCount := 0 + + defer func() { + logger.Trace("done checking entries", "num_entries", checkCount) + }() - // Don't cancel when the original client request goes away - ctx = context.Background() + // Don't cancel when the original client request goes away + ctx := context.Background() - tidyFunc := func(secretIDPrefixToUse, accessorIDPrefixToUse string) error { - logger.Trace("listing role HMACs", "prefix", secretIDPrefixToUse) + salt, err := b.Salt(ctx) + if err != nil { + logger.Error("error tidying secret IDs", "error", err) + return + } + + tidyFunc := func(secretIDPrefixToUse, accessorIDPrefixToUse string) error { + logger.Trace("listing accessors", "prefix", accessorIDPrefixToUse) - roleNameHMACs, err := s.List(ctx, secretIDPrefixToUse) + // List all the accessors and add them all to a map + // These hashes are the result of salting the accessor id. + accessorHashes, err := s.List(ctx, accessorIDPrefixToUse) + if err != nil { + return err + } + skipHashes := make(map[string]bool, len(accessorHashes)) + accHashesByLockID := make([][]tidyHelperSecretIDAccessor, 256) + for _, accessorHash := range accessorHashes { + var entry secretIDAccessorStorageEntry + entryIndex := accessorIDPrefixToUse + accessorHash + se, err := s.Get(ctx, entryIndex) + if err != nil { + return err + } + if se == nil { + continue + } + err = se.DecodeJSON(&entry) if err != nil { return err } - logger.Trace("listing accessors", "prefix", accessorIDPrefixToUse) + lockIdx := locksutil.LockIndexForKey(entry.SecretIDHMAC) + accHashesByLockID[lockIdx] = append(accHashesByLockID[lockIdx], tidyHelperSecretIDAccessor{ + secretIDAccessorStorageEntry: entry, + saltedSecretIDAccessor: accessorHash, + }) + } + + secretIDCleanupFunc := func(secretIDHMAC, roleNameHMAC, secretIDPrefixToUse string) error { + checkCount++ + lock := b.secretIDLock(secretIDHMAC) + lock.Lock() + defer lock.Unlock() - // List all the accessors and add them all to a map - accessorHashes, err := s.List(ctx, accessorIDPrefixToUse) + entryIndex := fmt.Sprintf("%s%s%s", secretIDPrefixToUse, roleNameHMAC, secretIDHMAC) + secretIDEntry, err := s.Get(ctx, entryIndex) if err != nil { - return err + return errwrap.Wrapf(fmt.Sprintf("error fetching SecretID %q: {{err}}", secretIDHMAC), err) } - accessorMap := make(map[string]bool, len(accessorHashes)) - for _, accessorHash := range accessorHashes { - accessorMap[accessorHash] = true + + if secretIDEntry == nil { + logger.Error("entry for secret id was nil", "secret_id_hmac", secretIDHMAC) + return nil } - time.Sleep(b.testTidyDelay) + if secretIDEntry.Value == nil || len(secretIDEntry.Value) == 0 { + return fmt.Errorf("found entry for SecretID %q but actual SecretID is empty", secretIDHMAC) + } - secretIDCleanupFunc := func(secretIDHMAC, roleNameHMAC, secretIDPrefixToUse string) error { - checkCount++ - lock := b.secretIDLock(secretIDHMAC) - lock.Lock() - defer lock.Unlock() + var result secretIDStorageEntry + if err := secretIDEntry.DecodeJSON(&result); err != nil { + return err + } - entryIndex := fmt.Sprintf("%s%s%s", secretIDPrefixToUse, roleNameHMAC, secretIDHMAC) - secretIDEntry, err := s.Get(ctx, entryIndex) - if err != nil { - return errwrap.Wrapf(fmt.Sprintf("error fetching SecretID %q: {{err}}", secretIDHMAC), err) + // If a secret ID entry does not have a corresponding accessor + // entry, revoke the secret ID immediately + accessorEntry, err := b.secretIDAccessorEntry(ctx, s, result.SecretIDAccessor, secretIDPrefixToUse) + if err != nil { + return errwrap.Wrapf("failed to read secret ID accessor entry: {{err}}", err) + } + if accessorEntry == nil { + logger.Trace("found nil accessor") + if err := s.Delete(ctx, entryIndex); err != nil { + return errwrap.Wrapf(fmt.Sprintf("error deleting secret ID %q from storage: {{err}}", secretIDHMAC), err) } + return nil + } - if secretIDEntry == nil { - logger.Error("entry for secret id was nil", "secret_id_hmac", secretIDHMAC) - return nil + // ExpirationTime not being set indicates non-expiring SecretIDs + if !result.ExpirationTime.IsZero() && time.Now().After(result.ExpirationTime) { + logger.Trace("found expired secret ID") + // Clean up the accessor of the secret ID first + err = b.deleteSecretIDAccessorEntry(ctx, s, result.SecretIDAccessor, secretIDPrefixToUse) + if err != nil { + return errwrap.Wrapf("failed to delete secret ID accessor entry: {{err}}", err) } - if secretIDEntry.Value == nil || len(secretIDEntry.Value) == 0 { - return fmt.Errorf("found entry for SecretID %q but actual SecretID is empty", secretIDHMAC) + if err := s.Delete(ctx, entryIndex); err != nil { + return errwrap.Wrapf(fmt.Sprintf("error deleting SecretID %q from storage: {{err}}", secretIDHMAC), err) } - var result secretIDStorageEntry - if err := secretIDEntry.DecodeJSON(&result); err != nil { - return err - } + return nil + } - // If a secret ID entry does not have a corresponding accessor - // entry, revoke the secret ID immediately - accessorEntry, err := b.secretIDAccessorEntry(ctx, s, result.SecretIDAccessor, secretIDPrefixToUse) - if err != nil { - return errwrap.Wrapf("failed to read secret ID accessor entry: {{err}}", err) - } - if accessorEntry == nil { - logger.Trace("found nil accessor") - if err := s.Delete(ctx, entryIndex); err != nil { - return errwrap.Wrapf(fmt.Sprintf("error deleting secret ID %q from storage: {{err}}", secretIDHMAC), err) - } - return nil - } + // At this point, the secret ID is not expired and is valid. Flag + // the corresponding accessor as not needing attention. + skipHashes[salt.SaltID(result.SecretIDAccessor)] = true - // ExpirationTime not being set indicates non-expiring SecretIDs - if !result.ExpirationTime.IsZero() && time.Now().After(result.ExpirationTime) { - logger.Trace("found expired secret ID") - // Clean up the accessor of the secret ID first - err = b.deleteSecretIDAccessorEntry(ctx, s, result.SecretIDAccessor, secretIDPrefixToUse) - if err != nil { - return errwrap.Wrapf("failed to delete secret ID accessor entry: {{err}}", err) - } + return nil + } - if err := s.Delete(ctx, entryIndex); err != nil { - return errwrap.Wrapf(fmt.Sprintf("error deleting SecretID %q from storage: {{err}}", secretIDHMAC), err) - } + logger.Trace("listing role HMACs", "prefix", secretIDPrefixToUse) - return nil - } + roleNameHMACs, err := s.List(ctx, secretIDPrefixToUse) + if err != nil { + return err + } - // At this point, the secret ID is not expired and is valid. Delete - // the corresponding accessor from the accessorMap. This will leave - // only the dangling accessors in the map which can then be cleaned - // up later. - salt, err := b.Salt(ctx) + for _, roleNameHMAC := range roleNameHMACs { + logger.Trace("listing secret ID HMACs", "role_hmac", roleNameHMAC) + secretIDHMACs, err := s.List(ctx, fmt.Sprintf("%s%s", secretIDPrefixToUse, roleNameHMAC)) + if err != nil { + return err + } + for _, secretIDHMAC := range secretIDHMACs { + err = secretIDCleanupFunc(secretIDHMAC, roleNameHMAC, secretIDPrefixToUse) if err != nil { return err } - delete(accessorMap, salt.SaltID(result.SecretIDAccessor)) - - return nil } + } + // Accessor indexes were not getting cleaned up until 0.9.3. This is a fix + // to clean up the dangling accessor entries. + if len(accessorHashes) > len(skipHashes) { + // There is some raciness here because we're querying secretids for + // roles without having a lock while doing so. Because + // accHashesByLockID was populated previously, at worst this may + // mean that we fail to clean up something we ought to. + var allSecretIDHMACs = make(map[string]struct{}) for _, roleNameHMAC := range roleNameHMACs { - logger.Trace("listing secret ID HMACs", "role_hmac", roleNameHMAC) - secretIDHMACs, err := s.List(ctx, fmt.Sprintf("%s%s", secretIDPrefixToUse, roleNameHMAC)) + secretIDHMACs, err := s.List(ctx, secretIDPrefixToUse+roleNameHMAC) if err != nil { return err } - for _, secretIDHMAC := range secretIDHMACs { - err = secretIDCleanupFunc(secretIDHMAC, roleNameHMAC, secretIDPrefixToUse) - if err != nil { - return err - } + for _, v := range secretIDHMACs { + allSecretIDHMACs[v] = struct{}{} } } - // Accessor indexes were not getting cleaned up until 0.9.3. This is a fix - // to clean up the dangling accessor entries. - if len(accessorMap) > 0 { - for _, lock := range b.secretIDLocks { - lock.Lock() - defer lock.Unlock() - } - for accessorHash, _ := range accessorMap { - logger.Trace("found dangling accessor, verifying") - // Ideally, locking on accessors should be performed here too - // but for that, accessors are required in plaintext, which are - // not available. The code above helps but it may still be - // racy. - // ... - // Look up the secret again now that we have all the locks. The - // lock is held when writing accessor/secret so if we have the - // lock we know we're not in a - // wrote-accessor-but-not-yet-secret case, which can be racy. - var entry secretIDAccessorStorageEntry - entryIndex := accessorIDPrefixToUse + accessorHash - se, err := s.Get(ctx, entryIndex) - if err != nil { - return err + tidyEntries := func(entries []tidyHelperSecretIDAccessor) error { + for _, entry := range entries { + // Don't clean up accessor index entry if secretid cleanup func + // determined that it should stay. + if _, ok := skipHashes[entry.saltedSecretIDAccessor]; ok { + continue } - if se != nil { - err = se.DecodeJSON(&entry) - if err != nil { - return err - } - - // The storage entry doesn't store the role ID, so we have - // to go about this the long way; fortunately we shouldn't - // actually hit this very often - var found bool - searchloop: - for _, roleNameHMAC := range roleNameHMACs { - secretIDHMACs, err := s.List(ctx, fmt.Sprintf("%s%s", secretIDPrefixToUse, roleNameHMAC)) - if err != nil { - return err - } - for _, v := range secretIDHMACs { - if v == entry.SecretIDHMAC { - found = true - logger.Trace("accessor verified, not removing") - break searchloop - } - } - } - if !found { - logger.Trace("could not verify dangling accessor, removing") - err = s.Delete(ctx, entryIndex) - if err != nil { - return err - } - } + + // Don't clean up accessor index entry if referenced in role. + if _, ok := allSecretIDHMACs[entry.SecretIDHMAC]; ok { + continue + } + + if err := s.Delete(context.Background(), accessorIDPrefixToUse+entry.saltedSecretIDAccessor); err != nil { + return err } } + return nil } - return nil + for lockIdx, entries := range accHashesByLockID { + // Ideally, locking on accessors should be performed here too + // but for that, accessors are required in plaintext, which are + // not available. + // ... + // The lock is held when writing accessor/secret so if we have + // the lock we know we're not in a + // wrote-accessor-but-not-yet-secret case, which can be racy. + b.secretIDLocks[lockIdx].Lock() + err = tidyEntries(entries) + b.secretIDLocks[lockIdx].Unlock() + if err != nil { + return err + } + } } - err := tidyFunc(secretIDPrefix, secretIDAccessorPrefix) - if err != nil { - logger.Error("error tidying global secret IDs", "error", err) - return - } - err = tidyFunc(secretIDLocalPrefix, secretIDAccessorLocalPrefix) - if err != nil { - logger.Error("error tidying local secret IDs", "error", err) - return - } - }() + return nil + } - resp := &logical.Response{} - resp.AddWarning("Tidy operation successfully started. Any information from the operation will be printed to Vault's server logs.") - return logical.RespondWithStatusCode(resp, req, http.StatusAccepted) + err = tidyFunc(secretIDPrefix, secretIDAccessorPrefix) + if err != nil { + logger.Error("error tidying global secret IDs", "error", err) + return + } + err = tidyFunc(secretIDLocalPrefix, secretIDAccessorLocalPrefix) + if err != nil { + logger.Error("error tidying local secret IDs", "error", err) + return + } } // pathTidySecretIDUpdate is used to delete the expired SecretID entries diff --git a/builtin/credential/approle/path_tidy_user_id_test.go b/builtin/credential/approle/path_tidy_user_id_test.go index b6b2b31cef13..c9d460155893 100644 --- a/builtin/credential/approle/path_tidy_user_id_test.go +++ b/builtin/credential/approle/path_tidy_user_id_test.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "sync" + "sync/atomic" "testing" "time" @@ -91,8 +92,6 @@ func TestAppRole_TidyDanglingAccessors_RaceTest(t *testing.T) { var err error b, storage := createBackendWithStorage(t) - b.testTidyDelay = 300 * time.Millisecond - // Create a role createRole(t, b, storage, "role1", "a,b,c") @@ -109,14 +108,9 @@ func TestAppRole_TidyDanglingAccessors_RaceTest(t *testing.T) { count := 1 wg := &sync.WaitGroup{} - now := time.Now() - started := false - for { - if time.Now().Sub(now) > 700*time.Millisecond { - break - } - if time.Now().Sub(now) > 100*time.Millisecond && !started { - started = true + start := time.Now() + for time.Now().Sub(start) < 10*time.Second { + if time.Now().Sub(start) > 100*time.Millisecond && atomic.LoadUint32(b.tidySecretIDCASGuard) == 0 { _, err = b.tidySecretID(context.Background(), &logical.Request{ Storage: storage, }) @@ -137,23 +131,50 @@ func TestAppRole_TidyDanglingAccessors_RaceTest(t *testing.T) { t.Fatalf("err:%v resp:%#v", err, resp) } }() + + entry, err := logical.StorageEntryJSON( + fmt.Sprintf("accessor/invalid%d", count), + &secretIDAccessorStorageEntry{ + SecretIDHMAC: "samplesecretidhmac", + }, + ) + err = storage.Put(context.Background(), entry) + if err != nil { + t.Fatal(err) + } + count++ + time.Sleep(100 * time.Microsecond) } - t.Logf("wrote %d entries", count) + logger := b.Logger().Named(t.Name()) + logger.Info("wrote entries", "count", count) wg.Wait() // Let tidy finish - time.Sleep(1 * time.Second) + for atomic.LoadUint32(b.tidySecretIDCASGuard) != 0 { + time.Sleep(100 * time.Millisecond) + } + + logger.Info("running tidy again") // Run tidy again - _, err = b.tidySecretID(context.Background(), &logical.Request{ + secret, err := b.tidySecretID(context.Background(), &logical.Request{ Storage: storage, }) - if err != nil { - t.Fatal(err) + if err != nil || len(secret.Warnings) > 0 { + t.Fatal(err, secret.Warnings) + } + + // Wait for tidy to start + for atomic.LoadUint32(b.tidySecretIDCASGuard) == 0 { + time.Sleep(100 * time.Millisecond) + } + + // Let tidy finish + for atomic.LoadUint32(b.tidySecretIDCASGuard) != 0 { + time.Sleep(100 * time.Millisecond) } - time.Sleep(2 * time.Second) accessorHashes, err := storage.List(context.Background(), "accessor/") if err != nil {