From 7df8d3994b75991b5e49a65728ef5e4b24a85dde Mon Sep 17 00:00:00 2001 From: odubajDT <93584209+odubajDT@users.noreply.github.com> Date: Thu, 13 Jul 2023 16:09:18 +0200 Subject: [PATCH] feat(flagD): support zero downtime during upgrades (#731) ## This PR - implements graceful shutdown of flagD, which leads to zero-downtime -> this means disabling the readiness probes and sending a shutdown event to all connected SDKs - create example manifests for deploying flagD as a standalone Deployment - create Makefile entry to deploy flagD to cluster - create ZD test with README description how to run - create Makefile entry to run ZD test ### Related Issues Fixes #728 ### Follow-up Tasks - running ZD test as part of CI https://github.com/open-feature/flagd/issues/732 --------- Signed-off-by: odubajDT --- Makefile | 20 ++++- config/deployments/flagd/deployment.yaml | 74 +++++++++++++++++++ config/deployments/flagd/service.yaml | 10 +++ core/pkg/runtime/runtime.go | 7 ++ .../flag-evaluation/connect_service.go | 13 +++- .../flag-evaluation/connect_service_test.go | 40 ++++++++++ core/pkg/service/iservice.go | 2 + test/zero-downtime/README.md | 25 +++++++ test/zero-downtime/test-pod.yaml | 25 +++++++ test/zero-downtime/zd_test.sh | 44 +++++++++++ 10 files changed, 258 insertions(+), 2 deletions(-) create mode 100644 config/deployments/flagd/deployment.yaml create mode 100644 config/deployments/flagd/service.yaml create mode 100644 test/zero-downtime/README.md create mode 100644 test/zero-downtime/test-pod.yaml create mode 100755 test/zero-downtime/zd_test.sh diff --git a/Makefile b/Makefile index e6f6776b7..23bc6554b 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,10 @@ -IMG ?= flagd:latest PHONY: .docker-build .build .run .mockgen PREFIX=/usr/local ALL_GO_MOD_DIRS := $(shell find . 
-type f -name 'go.mod' -exec dirname {} \; | sort) +FLAGD_DEV_NAMESPACE ?= flagd-dev +ZD_TEST_NAMESPACE ?= flagd-zd-test + workspace-init: workspace-clean go work init $(foreach module, $(ALL_GO_MOD_DIRS), go work use $(module);) @@ -67,6 +69,22 @@ mockgen: install-mockgen generate-docs: cd flagd; go run ./cmd/doc/main.go +.PHONY: deploy-dev-env +export IMG?= ghcr.io/open-feature/flagd:latest +deploy-dev-env: undeploy-dev-env + kubectl create ns "$(FLAGD_DEV_NAMESPACE)" + envsubst '$${IMG}' < config/deployments/flagd/deployment.yaml | kubectl apply -f - -n "$(FLAGD_DEV_NAMESPACE)" + kubectl apply -f config/deployments/flagd/service.yaml -n "$(FLAGD_DEV_NAMESPACE)" + kubectl wait --for=condition=available deployment/flagd -n "$(FLAGD_DEV_NAMESPACE)" --timeout=300s + +undeploy-dev-env: + kubectl delete ns "$(FLAGD_DEV_NAMESPACE)" --ignore-not-found=true + +run-zd-test: + kubectl delete ns "$(ZD_TEST_NAMESPACE)" --ignore-not-found=true + kubectl create ns "$(ZD_TEST_NAMESPACE)" + ZD_TEST_NAMESPACE="$(ZD_TEST_NAMESPACE)" FLAGD_DEV_NAMESPACE=$(FLAGD_DEV_NAMESPACE) IMG="$(IMG)" IMG_ZD="$(IMG_ZD)" ./test/zero-downtime/zd_test.sh + # Markdown lint configuration # # - .markdownlintignore holds the configuration for files to be ignored diff --git a/config/deployments/flagd/deployment.yaml b/config/deployments/flagd/deployment.yaml new file mode 100644 index 000000000..0f7207430 --- /dev/null +++ b/config/deployments/flagd/deployment.yaml @@ -0,0 +1,74 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app: flagd + name: flagd +spec: + replicas: 1 + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + selector: + matchLabels: + app: flagd + template: + metadata: + labels: + app.kubernetes.io/name: flagd + app: flagd + spec: + containers: + - name: flagd + image: ${IMG} + volumeMounts: + - name: config-volume + mountPath: /etc/flagd + readinessProbe: + httpGet: + path: /readyz + port: 8014 + initialDelaySeconds: 5 + periodSeconds: 
5 + livenessProbe: + httpGet: + path: /healthz + port: 8014 + initialDelaySeconds: 5 + periodSeconds: 60 + ports: + - containerPort: 8013 + args: + - start + - --uri + - file:/etc/flagd/config.json + - --debug + volumes: + - name: config-volume + configMap: + name: open-feature-flags + items: + - key: flags + path: config.json +--- +# ConfigMap for Flagd OpenFeatuer provider +apiVersion: v1 +kind: ConfigMap +metadata: + name: open-feature-flags +data: + flags: | + { + "flags": { + "myStringFlag": { + "state": "ENABLED", + "variants": { + "key1": "val1", + "key2": "val2" + }, + "defaultVariant": "key1" + } + } + } diff --git a/config/deployments/flagd/service.yaml b/config/deployments/flagd/service.yaml new file mode 100644 index 000000000..6ed8db0cf --- /dev/null +++ b/config/deployments/flagd/service.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: Service +metadata: + name: flagd-svc +spec: + selector: + app.kubernetes.io/name: flagd + ports: + - port: 8013 + targetPort: 8013 diff --git a/core/pkg/runtime/runtime.go b/core/pkg/runtime/runtime.go index 72ec033e9..50bd51958 100644 --- a/core/pkg/runtime/runtime.go +++ b/core/pkg/runtime/runtime.go @@ -84,6 +84,13 @@ func (r *Runtime) Start() error { return nil }) } + + defer func() { + r.Logger.Info("Shutting down server...") + r.Service.Shutdown() + r.Logger.Info("Server successfully shutdown.") + }() + g.Go(func() error { // Readiness probe rely on the runtime r.ServiceConfig.ReadinessProbe = r.isReady diff --git a/core/pkg/service/flag-evaluation/connect_service.go b/core/pkg/service/flag-evaluation/connect_service.go index 68122bf7d..af993b089 100644 --- a/core/pkg/service/flag-evaluation/connect_service.go +++ b/core/pkg/service/flag-evaluation/connect_service.go @@ -37,6 +37,8 @@ type ConnectService struct { serverMtx sync.RWMutex metricsServerMtx sync.RWMutex + + readinessEnabled bool } // NewConnectService creates a ConnectService with provided parameters @@ -57,6 +59,7 @@ func NewConnectService( // Serve 
serves services with provided configuration options func (s *ConnectService) Serve(ctx context.Context, svcConf service.Configuration) error { g, gCtx := errgroup.WithContext(ctx) + s.readinessEnabled = true g.Go(func() error { return s.startServer(svcConf) @@ -152,6 +155,14 @@ func (s *ConnectService) AddMiddleware(mw middleware.IMiddleware) { s.server.Handler = mw.Handler(s.server.Handler) } +func (s *ConnectService) Shutdown() { + s.readinessEnabled = false + s.eventingConfiguration.emitToAll(service.Notification{ + Type: service.Shutdown, + Data: map[string]interface{}{}, + }) +} + func (s *ConnectService) startServer(svcConf service.Configuration) error { lis, err := s.setupServer(svcConf) if err != nil { @@ -189,7 +200,7 @@ func (s *ConnectService) startMetricsServer(svcConf service.Configuration) error case "/healthz": w.WriteHeader(http.StatusOK) case "/readyz": - if svcConf.ReadinessProbe() { + if s.readinessEnabled && svcConf.ReadinessProbe() { w.WriteHeader(http.StatusOK) } else { w.WriteHeader(http.StatusPreconditionFailed) diff --git a/core/pkg/service/flag-evaluation/connect_service_test.go b/core/pkg/service/flag-evaluation/connect_service_test.go index fe68f93cd..2ee11aed0 100644 --- a/core/pkg/service/flag-evaluation/connect_service_test.go +++ b/core/pkg/service/flag-evaluation/connect_service_test.go @@ -204,3 +204,43 @@ func TestConnectServiceNotify(t *testing.T) { t.Error("timeout while waiting for notifications") } } + +func TestConnectServiceShutdown(t *testing.T) { + // given + ctrl := gomock.NewController(t) + eval := mock.NewMockIEvaluator(ctrl) + + exp := metric.NewManualReader() + rs := resource.NewWithAttributes("testSchema") + metricRecorder := telemetry.NewOTelRecorder(exp, rs, "my-exporter") + + service := NewConnectService(logger.NewLogger(nil, false), eval, metricRecorder) + + sChan := make(chan iservice.Notification, 1) + eventing := service.eventingConfiguration + eventing.subs["key"] = sChan + + // notification type + ofType := 
iservice.Shutdown + + // emit notification in routine + go func() { + service.Notify(iservice.Notification{ + Type: ofType, + Data: map[string]interface{}{}, + }) + }() + + // wait for notification + timeout, cancelFunc := context.WithTimeout(context.Background(), 2*time.Second) + defer cancelFunc() + + require.False(t, service.readinessEnabled) + + select { + case n := <-sChan: + require.Equal(t, ofType, n.Type, "expected notification type: %s, but received %s", ofType, n.Type) + case <-timeout.Done(): + t.Error("timeout while waiting for notifications") + } +} diff --git a/core/pkg/service/iservice.go b/core/pkg/service/iservice.go index 69f4077b9..71951931e 100644 --- a/core/pkg/service/iservice.go +++ b/core/pkg/service/iservice.go @@ -10,6 +10,7 @@ type NotificationType string const ( ConfigurationChange NotificationType = "configuration_change" + Shutdown NotificationType = "provider_shutdown" ProviderReady NotificationType = "provider_ready" KeepAlive NotificationType = "keep_alive" ) @@ -40,6 +41,7 @@ which call the IEvaluator implementation. type IFlagEvaluationService interface { Serve(ctx context.Context, svcConf Configuration) error Notify(n Notification) + Shutdown() } /* diff --git a/test/zero-downtime/README.md b/test/zero-downtime/README.md new file mode 100644 index 000000000..a46d77493 --- /dev/null +++ b/test/zero-downtime/README.md @@ -0,0 +1,25 @@ +# FlagD Zero downtime test + +## How to run + +Clone this repository and run the following command to deploy a standalone flagD: + +```shell +IMG=your-flagd-image make deploy-dev-env +``` + +This will create a flagd deployment `flagd-dev` namespace. + +To run the test, execute: + +```shell +IMG=your-flagd-image IMG_ZD=your-flagd-image2 make run-zd-test +``` + +Please be aware, you need to build your two custom images with different tags for flagD first. + +To build your images using Docker execute: + +```shell +docker build . 
-t image-name:tag -f flagd/build.Dockerfile ``` diff --git a/test/zero-downtime/test-pod.yaml b/test/zero-downtime/test-pod.yaml new file mode 100644 index 000000000..0696b9094 --- /dev/null +++ b/test/zero-downtime/test-pod.yaml @@ -0,0 +1,25 @@ +apiVersion: v1 +kind: Pod +metadata: + name: test-zd +spec: + containers: + - name: test-zd + image: curlimages/curl:8.1.2 + # yamllint disable rule:line-length + command: + - 'sh' + - '-c' + - | + for i in $(seq 1 3000); do + curl -H 'Cache-Control: no-cache, no-store' -X POST flagd-svc.$FLAGD_DEV_NAMESPACE.svc.cluster.local:8013/schema.v1.Service/ResolveString?$RANDOM -d '{"flagKey":"myStringFlag","context":{}}' -H "Content-Type: application/json" > ~/out.txt + if ! grep -q "val1" ~/out.txt + then + cat ~/out.txt + echo "\n\nCannot fetch data from flagD, exiting...\n\n" + exit 1 + fi + sleep 1 + done + exit 0 + # yamllint enable rule:line-length diff --git a/test/zero-downtime/zd_test.sh b/test/zero-downtime/zd_test.sh new file mode 100755 index 000000000..6cb25e258 --- /dev/null +++ b/test/zero-downtime/zd_test.sh @@ -0,0 +1,44 @@ +#!/bin/sh + +set -eu + +# Store the flagD image to a helper variable +IMG_ORIGINAL=$IMG + +# Create pod requesting the values from flagD +envsubst < test/zero-downtime/test-pod.yaml | kubectl apply -f - -n $ZD_TEST_NAMESPACE + +for count in 1 2 3; +do + # Update the flagD deployment with the second image + IMG=$IMG_ZD + envsubst < config/deployments/flagd/deployment.yaml | kubectl apply -f - -n $FLAGD_DEV_NAMESPACE + kubectl wait --for=condition=available deployment/flagd -n $FLAGD_DEV_NAMESPACE --timeout=30s + + # Wait until the client pod executes curl requests against flagD + sleep 20 + + # Update the flagD deployment back to original image + IMG=$IMG_ORIGINAL + envsubst < config/deployments/flagd/deployment.yaml | kubectl apply -f - -n $FLAGD_DEV_NAMESPACE + kubectl wait --for=condition=available deployment/flagd -n $FLAGD_DEV_NAMESPACE --timeout=30s + + # Wait until the client pod 
executes curl requests against flagD + sleep 20 +done + +# Pod will fail only when it fails to get a proper response from curl (that means we do not have zero downtime) +# If it is still running, the last curl request was successful. +kubectl wait --for=condition=ready pod/test-zd -n $ZD_TEST_NAMESPACE --timeout=30s + +# If one curl request was not successful and a later curl request was, the pod might be in a ready state again. +# Therefore we need to check that the restart count is equal to zero -> this means every request provided valid data. +restart_count=$(kubectl get pods test-zd -o=jsonpath='{.status.containerStatuses[0].restartCount}' -n $ZD_TEST_NAMESPACE) +if [ "$restart_count" -ne 0 ]; then + echo "Restart count of the test-zd pod is not equal to zero." + exit 1 +fi + +# Cleanup only when the test passed +kubectl delete ns $ZD_TEST_NAMESPACE --ignore-not-found=true +