From 7df8d3994b75991b5e49a65728ef5e4b24a85dde Mon Sep 17 00:00:00 2001 From: odubajDT <93584209+odubajDT@users.noreply.github.com> Date: Thu, 13 Jul 2023 16:09:18 +0200 Subject: [PATCH] feat(flagD): support zero downtime during upgrades (#731) ## This PR - implements graceful shutdown of flagD, which leads to zero-downtime -> this means disabling the readiness probes and sending a shutdown event to all connected SDKs - create example manifests for deploying flagD as a standalone Deployment - create Makefile entry to deploy flagD to cluster - create ZD test with README description how to run - create Makefile entry to run ZD test ### Related Issues Fixes #728 ### Follow-up Tasks - running ZD test as part of CI https://github.com/open-feature/flagd/issues/732 --------- Signed-off-by: odubajDT --- Makefile | 20 ++++- config/deployments/flagd/deployment.yaml | 74 +++++++++++++++++++ config/deployments/flagd/service.yaml | 10 +++ core/pkg/runtime/runtime.go | 7 ++ .../flag-evaluation/connect_service.go | 13 +++- .../flag-evaluation/connect_service_test.go | 40 ++++++++++ core/pkg/service/iservice.go | 2 + test/zero-downtime/README.md | 25 +++++++ test/zero-downtime/test-pod.yaml | 25 +++++++ test/zero-downtime/zd_test.sh | 44 +++++++++++ 10 files changed, 258 insertions(+), 2 deletions(-) create mode 100644 config/deployments/flagd/deployment.yaml create mode 100644 config/deployments/flagd/service.yaml create mode 100644 test/zero-downtime/README.md create mode 100644 test/zero-downtime/test-pod.yaml create mode 100755 test/zero-downtime/zd_test.sh diff --git a/Makefile b/Makefile index e6f6776b7..23bc6554b 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,10 @@ -IMG ?= flagd:latest PHONY: .docker-build .build .run .mockgen PREFIX=/usr/local ALL_GO_MOD_DIRS := $(shell find . 
-type f -name 'go.mod' -exec dirname {} \; | sort) +FLAGD_DEV_NAMESPACE ?= flagd-dev +ZD_TEST_NAMESPACE ?= flagd-zd-test + workspace-init: workspace-clean go work init $(foreach module, $(ALL_GO_MOD_DIRS), go work use $(module);) @@ -67,6 +69,22 @@ mockgen: install-mockgen generate-docs: cd flagd; go run ./cmd/doc/main.go +.PHONY: deploy-dev-env +export IMG?= ghcr.io/open-feature/flagd:latest +deploy-dev-env: undeploy-dev-env + kubectl create ns "$(FLAGD_DEV_NAMESPACE)" + envsubst '$${IMG}' < config/deployments/flagd/deployment.yaml | kubectl apply -f - -n "$(FLAGD_DEV_NAMESPACE)" + kubectl apply -f config/deployments/flagd/service.yaml -n "$(FLAGD_DEV_NAMESPACE)" + kubectl wait --for=condition=available deployment/flagd -n "$(FLAGD_DEV_NAMESPACE)" --timeout=300s + +undeploy-dev-env: + kubectl delete ns "$(FLAGD_DEV_NAMESPACE)" --ignore-not-found=true + +run-zd-test: + kubectl delete ns "$(ZD_TEST_NAMESPACE)" --ignore-not-found=true + kubectl create ns "$(ZD_TEST_NAMESPACE)" + ZD_TEST_NAMESPACE="$(ZD_TEST_NAMESPACE)" FLAGD_DEV_NAMESPACE=$(FLAGD_DEV_NAMESPACE) IMG="$(IMG)" IMG_ZD="$(IMG_ZD)" ./test/zero-downtime/zd_test.sh + # Markdown lint configuration # # - .markdownlintignore holds the configuration for files to be ignored diff --git a/config/deployments/flagd/deployment.yaml b/config/deployments/flagd/deployment.yaml new file mode 100644 index 000000000..0f7207430 --- /dev/null +++ b/config/deployments/flagd/deployment.yaml @@ -0,0 +1,74 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app: flagd + name: flagd +spec: + replicas: 1 + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + selector: + matchLabels: + app: flagd + template: + metadata: + labels: + app.kubernetes.io/name: flagd + app: flagd + spec: + containers: + - name: flagd + image: ${IMG} + volumeMounts: + - name: config-volume + mountPath: /etc/flagd + readinessProbe: + httpGet: + path: /readyz + port: 8014 + initialDelaySeconds: 5 + periodSeconds: 
5 + livenessProbe: + httpGet: + path: /healthz + port: 8014 + initialDelaySeconds: 5 + periodSeconds: 60 + ports: + - containerPort: 8013 + args: + - start + - --uri + - file:/etc/flagd/config.json + - --debug + volumes: + - name: config-volume + configMap: + name: open-feature-flags + items: + - key: flags + path: config.json +--- +# ConfigMap for Flagd OpenFeatuer provider +apiVersion: v1 +kind: ConfigMap +metadata: + name: open-feature-flags +data: + flags: | + { + "flags": { + "myStringFlag": { + "state": "ENABLED", + "variants": { + "key1": "val1", + "key2": "val2" + }, + "defaultVariant": "key1" + } + } + } diff --git a/config/deployments/flagd/service.yaml b/config/deployments/flagd/service.yaml new file mode 100644 index 000000000..6ed8db0cf --- /dev/null +++ b/config/deployments/flagd/service.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: Service +metadata: + name: flagd-svc +spec: + selector: + app.kubernetes.io/name: flagd + ports: + - port: 8013 + targetPort: 8013 diff --git a/core/pkg/runtime/runtime.go b/core/pkg/runtime/runtime.go index 72ec033e9..50bd51958 100644 --- a/core/pkg/runtime/runtime.go +++ b/core/pkg/runtime/runtime.go @@ -84,6 +84,13 @@ func (r *Runtime) Start() error { return nil }) } + + defer func() { + r.Logger.Info("Shutting down server...") + r.Service.Shutdown() + r.Logger.Info("Server successfully shutdown.") + }() + g.Go(func() error { // Readiness probe rely on the runtime r.ServiceConfig.ReadinessProbe = r.isReady diff --git a/core/pkg/service/flag-evaluation/connect_service.go b/core/pkg/service/flag-evaluation/connect_service.go index 68122bf7d..af993b089 100644 --- a/core/pkg/service/flag-evaluation/connect_service.go +++ b/core/pkg/service/flag-evaluation/connect_service.go @@ -37,6 +37,8 @@ type ConnectService struct { serverMtx sync.RWMutex metricsServerMtx sync.RWMutex + + readinessEnabled bool } // NewConnectService creates a ConnectService with provided parameters @@ -57,6 +59,7 @@ func NewConnectService( // Serve 
serves services with provided configuration options func (s *ConnectService) Serve(ctx context.Context, svcConf service.Configuration) error { g, gCtx := errgroup.WithContext(ctx) + s.readinessEnabled = true g.Go(func() error { return s.startServer(svcConf) @@ -152,6 +155,14 @@ func (s *ConnectService) AddMiddleware(mw middleware.IMiddleware) { s.server.Handler = mw.Handler(s.server.Handler) } +func (s *ConnectService) Shutdown() { + s.readinessEnabled = false + s.eventingConfiguration.emitToAll(service.Notification{ + Type: service.Shutdown, + Data: map[string]interface{}{}, + }) +} + func (s *ConnectService) startServer(svcConf service.Configuration) error { lis, err := s.setupServer(svcConf) if err != nil { @@ -189,7 +200,7 @@ func (s *ConnectService) startMetricsServer(svcConf service.Configuration) error case "/healthz": w.WriteHeader(http.StatusOK) case "/readyz": - if svcConf.ReadinessProbe() { + if s.readinessEnabled && svcConf.ReadinessProbe() { w.WriteHeader(http.StatusOK) } else { w.WriteHeader(http.StatusPreconditionFailed) diff --git a/core/pkg/service/flag-evaluation/connect_service_test.go b/core/pkg/service/flag-evaluation/connect_service_test.go index fe68f93cd..2ee11aed0 100644 --- a/core/pkg/service/flag-evaluation/connect_service_test.go +++ b/core/pkg/service/flag-evaluation/connect_service_test.go @@ -204,3 +204,43 @@ func TestConnectServiceNotify(t *testing.T) { t.Error("timeout while waiting for notifications") } } + +func TestConnectServiceShutdown(t *testing.T) { + // given + ctrl := gomock.NewController(t) + eval := mock.NewMockIEvaluator(ctrl) + + exp := metric.NewManualReader() + rs := resource.NewWithAttributes("testSchema") + metricRecorder := telemetry.NewOTelRecorder(exp, rs, "my-exporter") + + service := NewConnectService(logger.NewLogger(nil, false), eval, metricRecorder) + + sChan := make(chan iservice.Notification, 1) + eventing := service.eventingConfiguration + eventing.subs["key"] = sChan + + // notification type + ofType := 
iservice.Shutdown + + // emit notification in routine + go func() { + service.Notify(iservice.Notification{ + Type: ofType, + Data: map[string]interface{}{}, + }) + }() + + // wait for notification + timeout, cancelFunc := context.WithTimeout(context.Background(), 2*time.Second) + defer cancelFunc() + + require.False(t, service.readinessEnabled) + + select { + case n := <-sChan: + require.Equal(t, ofType, n.Type, "expected notification type: %s, but received %s", ofType, n.Type) + case <-timeout.Done(): + t.Error("timeout while waiting for notifications") + } +} diff --git a/core/pkg/service/iservice.go b/core/pkg/service/iservice.go index 69f4077b9..71951931e 100644 --- a/core/pkg/service/iservice.go +++ b/core/pkg/service/iservice.go @@ -10,6 +10,7 @@ type NotificationType string const ( ConfigurationChange NotificationType = "configuration_change" + Shutdown NotificationType = "provider_shutdown" ProviderReady NotificationType = "provider_ready" KeepAlive NotificationType = "keep_alive" ) @@ -40,6 +41,7 @@ which call the IEvaluator implementation. type IFlagEvaluationService interface { Serve(ctx context.Context, svcConf Configuration) error Notify(n Notification) + Shutdown() } /* diff --git a/test/zero-downtime/README.md b/test/zero-downtime/README.md new file mode 100644 index 000000000..a46d77493 --- /dev/null +++ b/test/zero-downtime/README.md @@ -0,0 +1,25 @@ +# FlagD Zero downtime test + +## How to run + +Clone this repository and run the following command to deploy a standalone flagD: + +```shell +IMG=your-flagd-image make deploy-dev-env +``` + +This will create a flagd deployment `flagd-dev` namespace. + +To run the test, execute: + +```shell +IMG=your-flagd-image IMG_ZD=your-flagd-image2 make run-zd-test +``` + +Please be aware, you need to build your two custom images with different tags for flagD first. + +To build your images using Docker execute: + +```shell +docker build . 
-t image-name:tag -f flagd/build.Dockerfile ``` diff --git a/test/zero-downtime/test-pod.yaml b/test/zero-downtime/test-pod.yaml new file mode 100644 index 000000000..0696b9094 --- /dev/null +++ b/test/zero-downtime/test-pod.yaml @@ -0,0 +1,25 @@ +apiVersion: v1 +kind: Pod +metadata: + name: test-zd +spec: + containers: + - name: test-zd + image: curlimages/curl:8.1.2 + # yamllint disable rule:line-length + command: + - 'sh' + - '-c' + - | + for i in $(seq 1 3000); do + curl -H 'Cache-Control: no-cache, no-store' -X POST flagd-svc.$FLAGD_DEV_NAMESPACE.svc.cluster.local:8013/schema.v1.Service/ResolveString?$RANDOM -d '{"flagKey":"myStringFlag","context":{}}' -H "Content-Type: application/json" > ~/out.txt + if ! grep -q "val1" ~/out.txt + then + cat ~/out.txt + echo "\n\nCannot fetch data from flagD, exiting...\n\n" + exit 1 + fi + sleep 1 + done + exit 0 + # yamllint enable rule:line-length diff --git a/test/zero-downtime/zd_test.sh b/test/zero-downtime/zd_test.sh new file mode 100755 index 000000000..6cb25e258 --- /dev/null +++ b/test/zero-downtime/zd_test.sh @@ -0,0 +1,44 @@ +#!/bin/sh + +set -eu + +# Store the flagD image to a helper variable +IMG_ORIGINAL=$IMG + +# Create pod requesting the values from flagD +envsubst < test/zero-downtime/test-pod.yaml | kubectl apply -f - -n $ZD_TEST_NAMESPACE + +for count in 1 2 3; +do + # Update the flagD deployment with the second image + IMG=$IMG_ZD + envsubst < config/deployments/flagd/deployment.yaml | kubectl apply -f - -n $FLAGD_DEV_NAMESPACE + kubectl wait --for=condition=available deployment/flagd -n $FLAGD_DEV_NAMESPACE --timeout=30s + + # Wait until the client pod executes curl requests against flagD + sleep 20 + + # Update the flagD deployment back to original image + IMG=$IMG_ORIGINAL + envsubst < config/deployments/flagd/deployment.yaml | kubectl apply -f - -n $FLAGD_DEV_NAMESPACE + kubectl wait --for=condition=available deployment/flagd -n $FLAGD_DEV_NAMESPACE --timeout=30s + + # Wait until the client pod 
executes curl requests against flagD + sleep 20 +done + +# Pod will fail only when it fails to get a proper response from curl (that means we do not have zero downtime) +# If it is still running, the last curl request was successful. +kubectl wait --for=condition=ready pod/test-zd -n $ZD_TEST_NAMESPACE --timeout=30s + +# If one curl request was not successful and a later curl request was, the pod might be in a ready state again. +# Therefore we need to check that the restart count is equal to zero -> this means every request provided valid data. +restart_count=$(kubectl get pods test-zd -o=jsonpath='{.status.containerStatuses[0].restartCount}' -n $ZD_TEST_NAMESPACE) +if [ "$restart_count" -ne 0 ]; then + echo "Restart count of the test-zd pod is not equal to zero." + exit 1 +fi + +# Cleanup only when the test passed +kubectl delete ns $ZD_TEST_NAMESPACE --ignore-not-found=true +