Skip to content

Commit

Permalink
fix: try recovering from gateway sync failure only on UpdateError
Browse files Browse the repository at this point in the history
  • Loading branch information
czeslavo committed Jun 25, 2024
1 parent 07fa4b1 commit 7459c39
Show file tree
Hide file tree
Showing 7 changed files with 407 additions and 83 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@ Adding a new version? You'll need three changes:
- Services using `Secret`s containing the same certificate as client certificates
by annotation `konghq.com/client-cert` can be correctly translated.
[#6228](https://github.com/Kong/kubernetes-ingress-controller/pull/6228)
- Do not try recovering from gateways synchronization errors with fallback configuration
(either generated or the last valid one) when an unexpected error (e.g. 5xx or network issue) occurs.
[#6237](https://github.com/Kong/kubernetes-ingress-controller/pull/6237)

## 3.2.0

Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ require (
github.com/google/uuid v1.6.0
github.com/jpillora/backoff v1.0.0
github.com/kong/go-database-reconciler v1.12.2
github.com/kong/go-kong v0.55.0
github.com/kong/go-kong v0.56.0
github.com/kong/kubernetes-telemetry v0.1.3
github.com/kong/kubernetes-testing-framework v0.47.1
github.com/lithammer/dedent v1.1.0
Expand Down
6 changes: 3 additions & 3 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDsl
github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE=
github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ=
github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0/go.mod h1:fyg7847qk6SyHyPtNmDHnmrv/HOrqktSC+C9fM+CJOE=
github.com/go-task/slim-sprig v2.20.0+incompatible h1:4Xh3bDzO29j4TWNOI+24ubc0vbVFMg2PMnXKxK54/CA=
github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI=
github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI=
github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8=
github.com/goccy/go-json v0.10.3 h1:KZ5WoDbxAIgm2HNbYckL0se1fHD6rz5j4ywS6ebzDqA=
Expand Down Expand Up @@ -240,8 +240,8 @@ github.com/klauspost/compress v1.17.0 h1:Rnbp4K9EjcDuVuHtd0dgA4qNuv9yKDYKK1ulpJw
github.com/klauspost/compress v1.17.0/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE=
github.com/kong/go-database-reconciler v1.12.2 h1:bnlvLCgP4OjgJOK5JYqq1MlIJ2F7erXERMj8n/LNU8M=
github.com/kong/go-database-reconciler v1.12.2/go.mod h1:bUPJkoeW//x4hzNxewQMoIkeoDzJzunI0stDMYJ3BkU=
github.com/kong/go-kong v0.55.0 h1:lonKRzsDGk12dh9E+y+pWnY2ThXhKuMHjzBHSpCvQLw=
github.com/kong/go-kong v0.55.0/go.mod h1:i1cMgTu6RYPHSyMpviShddRnc+DML/vlpgKC00hr8kU=
github.com/kong/go-kong v0.56.0 h1:/9qbnQJWAgrSAKzL2RViBhHMTYOEyG8N4ClkKnUwEW4=
github.com/kong/go-kong v0.56.0/go.mod h1:gyNwyP1fzztT6sX/0/ygMQ30OiRMIQ51b2jSfstMrcU=
github.com/kong/kubernetes-telemetry v0.1.3 h1:Hz2tkHGIIUqbn1x46QRDmmNjbEtJyxyOvHSPne3uPto=
github.com/kong/kubernetes-telemetry v0.1.3/go.mod h1:wB7o8dOKa5R396CyiU0sPa8am/g3c5DKd/qrn/Vmb+k=
github.com/kong/kubernetes-testing-framework v0.47.1 h1:BE2mQLC0Zj/NC5Y8B1Nm5VZymmrdVfu7cfwVKSobWtg=
Expand Down
78 changes: 37 additions & 41 deletions internal/dataplane/kong_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -180,9 +180,6 @@ type KongClient struct {
// While lastProcessedSnapshotHash keeps track of the last processed cache snapshot (the one kept in KongClient.cache),
// lastValidCacheSnapshot can also represent the fallback cache snapshot that was successfully synced with gateways.
lastValidCacheSnapshot *store.CacheStores

// brokenObjects is a list of the Kubernetes resources that failed to sync and triggered a fallback sync.
brokenObjects []fallback.ObjectHash
}

// NewKongClient provides a new KongClient object after connecting to the
Expand Down Expand Up @@ -453,6 +450,8 @@ func (c *KongClient) Update(ctx context.Context) error {
}
// Empty snapshot hash means that the cache hasn't changed since the last snapshot was taken. That optimization can be used
// in main code path to avoid unnecessary processing. TODO: https://github.com/Kong/kubernetes-ingress-controller/issues/6095
// TODO: We should short-circuit here only if all the Gateways were successfully synced with the current configuration.
// https://github.com/Kong/kubernetes-ingress-controller/issues/6219
if newSnapshotHash == store.SnapshotHashEmpty {
c.prometheusMetrics.RecordProcessedConfigSnapshotCacheHit()
c.logger.V(util.DebugLevel).Info("No configuration change; pushing config to gateway is not necessary, skipping")
Expand Down Expand Up @@ -493,7 +492,7 @@ func (c *KongClient) Update(ctx context.Context) error {

// In case of a failure in syncing configuration with Gateways, propagate the error.
if gatewaysSyncErr != nil {
if recoveringErr := c.tryRecoveringFromGatewaysSyncError(
if recoveringErr := c.maybeTryRecoveringFromGatewaysSyncError(
ctx,
cacheSnapshot,
gatewaysSyncErr,
Expand Down Expand Up @@ -532,28 +531,45 @@ func (c *KongClient) maybePreserveTheLastValidConfigCache(lastValidCache store.C
}
}

// tryRecoveringFromGatewaysSyncError tries to recover from a configuration rejection by:
// 1. Generating a fallback configuration and pushing it to the gateways if FallbackConfiguration feature is enabled.
// 2. Applying the last valid configuration to the gateways if FallbackConfiguration is disabled or fallback
// configuration generation fails.
func (c *KongClient) tryRecoveringFromGatewaysSyncError(
// maybeTryRecoveringFromGatewaysSyncError tries to recover from a configuration rejection if the error is of the expected
// UpdateError type. Otherwise, we assume the error is non-recoverable by means of fallback configuration generation
// or applying the last valid configuration, and we need to rely on the retry mechanism. It can happen in case of
// transient network issues, internal server errors, etc.
//
// The recovery can be handled in two ways:
// 1. Generating a fallback configuration and pushing it to the gateways if FallbackConfiguration feature is enabled and
// the UpdateError contains at least 1 broken object.
// 2. Applying the last valid configuration to the gateways if FallbackConfiguration is disabled, fallback
// configuration generation fails, or the UpdateError does not contain any broken objects (it can happen if a gateway
// returns 400 with no meaningful entities' errors).
func (c *KongClient) maybeTryRecoveringFromGatewaysSyncError(
ctx context.Context,
cacheSnapshot store.CacheStores,
gatewaysSyncErr error,
) error {
// If configuration was rejected by the gateways and FallbackConfiguration is enabled,
// we should generate a fallback configuration and push it to the gateways.
if c.kongConfig.FallbackConfiguration {
recoveringErr := c.tryRecoveringWithFallbackConfiguration(ctx, cacheSnapshot, gatewaysSyncErr)
// If the error is not of the expected UpdateError type, we should log it and skip the recovery.
updateErr := sendconfig.UpdateError{}
if !errors.As(gatewaysSyncErr, &updateErr) {
c.logger.V(util.DebugLevel).Info("Skipping recovery from gateways sync error - not enough details to recover",
"error", gatewaysSyncErr)
return nil
}

// If configuration was rejected by the gateways, we identified at least one broken object and FallbackConfiguration
// is enabled, we should generate a fallback configuration and push it to the gateways.
brokenObjects := extractBrokenObjectsFromUpdateError(updateErr)
if c.kongConfig.FallbackConfiguration && len(brokenObjects) > 0 {
recoveringErr := c.tryRecoveringWithFallbackConfiguration(ctx, cacheSnapshot, brokenObjects)
if recoveringErr == nil {
c.logger.Info("Successfully recovered from configuration rejection with fallback configuration")
return nil
}
// If we failed to recover using the fallback configuration, we should log the error and carry on.
// If we failed to recover using the fallback configuration, we should log the error and carry on with the last
// valid configuration.
c.logger.Error(recoveringErr, "Failed to recover from configuration rejection with fallback configuration")
}

// If FallbackConfiguration is disabled, or we failed to recover using the fallback configuration, we should
// If FallbackConfiguration is disabled, we skipped or failed to recover using the fallback configuration, we should
// apply the last valid configuration to the gateways.
if state, found := c.kongConfigFetcher.LastValidConfig(); found {
const isFallback = true
Expand All @@ -565,23 +581,13 @@ func (c *KongClient) tryRecoveringFromGatewaysSyncError(
return nil
}

func (c *KongClient) cacheBrokenObjectList(list []fallback.ObjectHash) {
c.brokenObjects = list
}

// tryRecoveringWithFallbackConfiguration tries to recover from a configuration rejection by generating a fallback
// configuration excluding affected objects from the cache.
func (c *KongClient) tryRecoveringWithFallbackConfiguration(
ctx context.Context,
currentCache store.CacheStores,
gatewaysSyncErr error,
brokenObjects []fallback.ObjectHash,
) error {
// Extract the broken objects from the update error and generate a fallback configuration excluding them.
brokenObjects, err := extractBrokenObjectsFromUpdateError(gatewaysSyncErr)
if err != nil {
return fmt.Errorf("failed to extract broken objects from update error: %w", err)
}

// Generate a fallback cache snapshot.
fallbackCache, generatedCacheMetadata, err := c.generateFallbackCache(currentCache, brokenObjects)
if err != nil {
Expand All @@ -607,8 +613,7 @@ func (c *KongClient) tryRecoveringWithFallbackConfiguration(
}

const isFallback = true
c.cacheBrokenObjectList(brokenObjects)
_, gatewaysSyncErr = c.sendOutToGatewayClients(ctx, fallbackParsingResult.KongState, c.kongConfig, isFallback)
_, gatewaysSyncErr := c.sendOutToGatewayClients(ctx, fallbackParsingResult.KongState, c.kongConfig, isFallback)
if gatewaysSyncErr != nil {
return fmt.Errorf("failed to sync fallback configuration with gateways: %w", gatewaysSyncErr)
}
Expand Down Expand Up @@ -647,24 +652,15 @@ func (c *KongClient) generateFallbackCache(
)
}

// extractBrokenObjectsFromUpdateError.
func extractBrokenObjectsFromUpdateError(err error) ([]fallback.ObjectHash, error) {
// extractBrokenObjectsFromUpdateError extracts broken objects from the UpdateError.
func extractBrokenObjectsFromUpdateError(err sendconfig.UpdateError) []fallback.ObjectHash {
var brokenObjects []client.Object

var updateErr sendconfig.UpdateError
if ok := errors.As(err, &updateErr); !ok {
return nil, fmt.Errorf("expected UpdateError, cannot extract broken objects from %T", err) //nolint:errorlint
}
for _, resourceFailure := range updateErr.ResourceFailures() {
for _, resourceFailure := range err.ResourceFailures() {
brokenObjects = append(brokenObjects, resourceFailure.CausingObjects()...)
}
if len(brokenObjects) == 0 {
return nil, fmt.Errorf("no broken objects found in UpdateError")
}

return lo.Map(brokenObjects, func(obj client.Object, _ int) fallback.ObjectHash {
return fallback.GetObjectHash(obj)
}), nil
})
}

// sendOutToGatewayClients will generate deck content (config) from the provided kong state
Expand Down
Loading

0 comments on commit 7459c39

Please sign in to comment.