From 2c6d5fea81c5ce8950e46c7d472ff1f3ea9694c3 Mon Sep 17 00:00:00 2001 From: Alex Katsman Date: Thu, 19 Jan 2023 16:49:17 +0100 Subject: [PATCH] Add delay for retrying NEG polling for unhealthy pods --- pkg/neg/readiness/poller.go | 7 +++- pkg/neg/readiness/poller_test.go | 59 +++++++++++++++++--------------- 2 files changed, 38 insertions(+), 28 deletions(-) diff --git a/pkg/neg/readiness/poller.go b/pkg/neg/readiness/poller.go index b367d544ce..677f6d7870 100644 --- a/pkg/neg/readiness/poller.go +++ b/pkg/neg/readiness/poller.go @@ -41,7 +41,8 @@ const ( // GCE NEG API RPS quota is rate limited per every 100 seconds. // Make this retry delay to match the rate limiting interval. // More detail: https://cloud.google.com/compute/docs/api-rate-limits - retryDelay = 100 * time.Second + retryDelay = 100 * time.Second + hcRetryDelay = time.Second ) // negMeta references a GCE NEG resource @@ -255,6 +256,10 @@ func (p *poller) processHealthStatus(key negMeta, healthStatuses []*composite.Ne } } + if retry { + <-p.clock.After(hcRetryDelay) + } + // If we didn't patch all of the endpoints, we must keep polling for health status return retry, utilerrors.NewAggregate(errList) } diff --git a/pkg/neg/readiness/poller_test.go b/pkg/neg/readiness/poller_test.go index 312efae367..c5847be903 100644 --- a/pkg/neg/readiness/poller_test.go +++ b/pkg/neg/readiness/poller_test.go @@ -19,17 +19,18 @@ package readiness import ( "context" "fmt" - "github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud" - "github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud/filter" - "google.golang.org/api/compute/v1" - "k8s.io/apimachinery/pkg/util/clock" - "k8s.io/legacy-cloud-providers/gce" "net" "reflect" "strconv" "testing" "time" + "github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud" + "github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud/filter" + "google.golang.org/api/compute/v1" + "k8s.io/apimachinery/pkg/util/clock" + "k8s.io/legacy-cloud-providers/gce" + "github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud/meta" "k8s.io/apimachinery/pkg/types" "k8s.io/ingress-gce/pkg/composite" @@ -387,11 +388,15 @@ func TestPoll(t *testing.T) { }, } - pollAndValidate := func(desc string, expectErr bool, expectRetry bool, expectPatchCount int, stepClock bool) { + pollAndValidate := func(desc string, expectErr bool, expectRetry bool, expectPatchCount int, stepClock bool, healthStatusDelay bool) { if stepClock { go func() { time.Sleep(2 * time.Second) - fakeClock.Step(retryDelay) + delay := retryDelay + if healthStatusDelay { + delay = hcRetryDelay + } + fakeClock.Step(delay) }() } retry, err := poller.Poll(key) @@ -416,19 +421,19 @@ func TestPoll(t *testing.T) { polling: true, } - pollAndValidate(step, false, true, 0, false) - pollAndValidate(step, false, true, 0, false) + pollAndValidate(step, false, true, 0, false, false) + pollAndValidate(step, false, true, 0, false, false) step = "unmark polling" poller.pollMap[key].polling = false - pollAndValidate(step, true, true, 0, true) - pollAndValidate(step, true, true, 0, true) + pollAndValidate(step, true, true, 0, true, false) + pollAndValidate(step, true, true, 0, true, false) step = "NEG exists, but with no endpoint" // create NEG, but with no endpoint negCloud.CreateNetworkEndpointGroup(&composite.NetworkEndpointGroup{Name: negName, Zone: zone, Version: meta.VersionGA}, zone) - pollAndValidate(step, false, true, 0, false) - pollAndValidate(step, false, true, 0, false) + pollAndValidate(step, false, true, 0, true, true) + pollAndValidate(step, false, true, 0, true, true) step = "NE added to the NEG, but NE health status is empty" ne := &composite.NetworkEndpoint{ @@ -446,8 +451,8 @@ func TestPoll(t *testing.T) { }, }) - pollAndValidate(step, false, false, 1, false) - pollAndValidate(step, false, false, 2, false) + pollAndValidate(step, false, false, 1, false, false) + pollAndValidate(step, false, false, 2, false, false) patcherTester.Eval(t, fmt.Sprintf("%v/%v", ns, podName), meta.ZonalKey(negName, zone), nil) step = "NE health status is empty and there are other endpoint with health status in NEG" @@ -458,8 +463,8 @@ func TestPoll(t *testing.T) { Healths: []*composite.HealthStatusForNetworkEndpoint{}, }, }) - pollAndValidate(step, false, true, 2, false) - pollAndValidate(step, false, true, 2, false) + pollAndValidate(step, false, true, 2, true, true) + pollAndValidate(step, false, true, 2, true, true) step = "NE has nonhealthy status" negtypes.GetNetworkEndpointStore(negCloud).AddNetworkEndpointHealthStatus(*meta.ZonalKey(negName, zone), []negtypes.NetworkEndpointEntry{ @@ -475,8 +480,8 @@ func TestPoll(t *testing.T) { }, }, }) - pollAndValidate(step, false, true, 2, false) - pollAndValidate(step, false, true, 2, false) + pollAndValidate(step, false, true, 2, true, true) + pollAndValidate(step, false, true, 2, true, true) step = "NE has nonhealthy status with irrelevant entry" negtypes.GetNetworkEndpointStore(negCloud).AddNetworkEndpointHealthStatus(*meta.ZonalKey(negName, zone), []negtypes.NetworkEndpointEntry{ @@ -493,8 +498,8 @@ func TestPoll(t *testing.T) { }, }, }) - pollAndValidate(step, false, true, 2, false) - pollAndValidate(step, false, true, 2, false) + pollAndValidate(step, false, true, 2, true, true) + pollAndValidate(step, false, true, 2, true, true) step = "NE has unsupported health" negtypes.GetNetworkEndpointStore(negCloud).AddNetworkEndpointHealthStatus(*meta.ZonalKey(negName, zone), []negtypes.NetworkEndpointEntry{ @@ -510,8 +515,8 @@ func TestPoll(t *testing.T) { }, }, }) - pollAndValidate(step, false, false, 3, false) - pollAndValidate(step, false, false, 4, false) + pollAndValidate(step, false, false, 3, false, false) + pollAndValidate(step, false, false, 4, false, false) step = "NE has healthy status" bsName := "bar" @@ -530,9 +535,9 @@ func TestPoll(t *testing.T) { }, irrelevantEntry, }) - pollAndValidate(step, false, false, 5, false) + pollAndValidate(step, false, false, 5, false, false) patcherTester.Eval(t, fmt.Sprintf("%v/%v", ns, podName), meta.ZonalKey(negName, zone), meta.GlobalKey(bsName)) - pollAndValidate(step, false, false, 6, false) + pollAndValidate(step, false, false, 6, false, false) patcherTester.Eval(t, fmt.Sprintf("%v/%v", ns, podName), meta.ZonalKey(negName, zone), meta.GlobalKey(bsName)) step = "ListNetworkEndpoint return error response" @@ -542,8 +547,8 @@ func TestPoll(t *testing.T) { return nil, fmt.Errorf("random error from GCE") } poller.negCloud = negtypes.NewAdapter(fakeGCE) - pollAndValidate(step, true, true, 6, true) - pollAndValidate(step, true, true, 6, true) + pollAndValidate(step, true, true, 6, true, false) + pollAndValidate(step, true, true, 6, true, false) } func TestProcessHealthStatus(t *testing.T) {