From 51e7f86e0040d9dc3d00702c5b76f69c5c2076ba Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Mon, 3 Mar 2025 15:42:59 -0800 Subject: [PATCH] ci: Fix E2E test flakes (#7827) --- pkg/controllers/nodeclass/validation.go | 24 +++++++++---------- test/pkg/debug/pod.go | 2 +- test/pkg/environment/common/expectations.go | 4 ++-- test/suites/consolidation/suite_test.go | 6 ++--- test/suites/drift/suite_test.go | 4 ++-- .../integration/instance_profile_test.go | 4 ++-- test/suites/integration/nodeclass_test.go | 22 ++++++++--------- test/suites/scheduling/suite_test.go | 4 ++-- 8 files changed, 33 insertions(+), 37 deletions(-) diff --git a/pkg/controllers/nodeclass/validation.go b/pkg/controllers/nodeclass/validation.go index f07e1402af9c..0266b4737d9b 100644 --- a/pkg/controllers/nodeclass/validation.go +++ b/pkg/controllers/nodeclass/validation.go @@ -22,10 +22,8 @@ import ( "github.com/mitchellh/hashstructure/v2" "github.com/patrickmn/go-cache" "github.com/samber/lo" - "sigs.k8s.io/controller-runtime/pkg/reconcile" - "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/service/ec2" ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" corev1 "k8s.io/api/core/v1" @@ -153,8 +151,8 @@ func (v *Validation) validateCreateFleetAuthorization( _ *karpv1.NodeClaim, tags map[string]string, ) (reason string, err error) { - createFleetInput := instance.GetCreateFleetInput(nodeClass, string(karpv1.CapacityTypeOnDemand), tags, mockLaunchTemplateConfig()) - createFleetInput.DryRun = aws.Bool(true) + createFleetInput := instance.GetCreateFleetInput(nodeClass, karpv1.CapacityTypeOnDemand, tags, mockLaunchTemplateConfig()) + createFleetInput.DryRun = lo.ToPtr(true) if _, err := v.ec2api.CreateFleet(ctx, createFleetInput); awserrors.IgnoreDryRunError(err) != nil { if awserrors.IgnoreUnauthorizedOperationError(err) != nil { // Dry run should only ever return UnauthorizedOperation or DryRunOperation so if we receive any other error @@ -173,7 +171,7 @@ func (v *Validation) validateCreateLaunchTemplateAuthorization( tags map[string]string, ) (reason string, err error) { createLaunchTemplateInput := launchtemplate.GetCreateLaunchTemplateInput(ctx, mockOptions(*nodeClaim, nodeClass, tags), corev1.IPv4Protocol, "") - createLaunchTemplateInput.DryRun = aws.Bool(true) + createLaunchTemplateInput.DryRun = lo.ToPtr(true) if _, err := v.ec2api.CreateLaunchTemplate(ctx, createLaunchTemplateInput); awserrors.IgnoreDryRunError(err) != nil { if awserrors.IgnoreUnauthorizedOperationError(err) != nil { // Dry run should only ever return UnauthorizedOperation or DryRunOperation so if we receive any other error @@ -207,8 +205,8 @@ func (v *Validation) validateRunInstancesAuthorization( runInstancesInput := &ec2.RunInstancesInput{ DryRun: lo.ToPtr(true), - MaxCount: aws.Int32(1), - MinCount: aws.Int32(1), + MaxCount: lo.ToPtr[int32](1), + MinCount: lo.ToPtr[int32](1), InstanceType: instanceType, MetadataOptions: &ec2types.InstanceMetadataOptionsRequest{ HttpEndpoint: ec2types.InstanceMetadataEndpointState(lo.FromPtr(nodeClass.Spec.MetadataOptions.HTTPEndpoint)), @@ -216,7 +214,7 @@ func (v *Validation) validateRunInstancesAuthorization( HttpProtocolIpv6: ec2types.InstanceMetadataProtocolState(lo.FromPtr(nodeClass.Spec.MetadataOptions.HTTPProtocolIPv6)), //aws sdk v2 changed this type to *int32 instead of *int64 //nolint: gosec - HttpPutResponseHopLimit: aws.Int32(int32(lo.FromPtr(nodeClass.Spec.MetadataOptions.HTTPPutResponseHopLimit))), + HttpPutResponseHopLimit: lo.ToPtr(int32(lo.FromPtr(nodeClass.Spec.MetadataOptions.HTTPPutResponseHopLimit))), }, TagSpecifications: []ec2types.TagSpecification{ { @@ -290,18 +288,18 @@ func mockLaunchTemplateConfig() []ec2types.FleetLaunchTemplateConfigRequest { return []ec2types.FleetLaunchTemplateConfigRequest{ { LaunchTemplateSpecification: &ec2types.FleetLaunchTemplateSpecificationRequest{ - LaunchTemplateName: aws.String("mock-lt-name"), - LaunchTemplateId: aws.String("lt-1234567890abcdef0"), - Version: aws.String("1"), + LaunchTemplateName: lo.ToPtr("mock-lt-name"), + LaunchTemplateId: lo.ToPtr("lt-1234567890abcdef0"), + Version: lo.ToPtr("1"), }, Overrides: []ec2types.FleetLaunchTemplateOverridesRequest{ { InstanceType: ec2types.InstanceTypeT3Micro, - SubnetId: aws.String("subnet-1234567890abcdef0"), + SubnetId: lo.ToPtr("subnet-1234567890abcdef0"), }, { InstanceType: ec2types.InstanceTypeT3Small, - SubnetId: aws.String("subnet-1234567890abcdef1"), + SubnetId: lo.ToPtr("subnet-1234567890abcdef1"), }, }, }, diff --git a/test/pkg/debug/pod.go b/test/pkg/debug/pod.go index d1cb2e58bf19..715f28dc5921 100644 --- a/test/pkg/debug/pod.go +++ b/test/pkg/debug/pod.go @@ -81,7 +81,7 @@ func (c *PodController) Register(_ context.Context, m manager.Manager) error { }, }, predicate.NewPredicateFuncs(func(o client.Object) bool { - return o.GetNamespace() != "kube-system" + return o.GetNamespace() != "kube-system" && o.GetNamespace() != "prometheus" }), )). WithOptions(controller.Options{MaxConcurrentReconciles: 10, SkipNameValidation: lo.ToPtr(true)}). diff --git a/test/pkg/environment/common/expectations.go b/test/pkg/environment/common/expectations.go index 52aa5d4016b0..7dd5b0e7e9c3 100644 --- a/test/pkg/environment/common/expectations.go +++ b/test/pkg/environment/common/expectations.go @@ -691,13 +691,13 @@ func (env *Environment) EventuallyExpectNodesUntaintedWithTimeout(timeout time.D }).WithTimeout(timeout).Should(Succeed()) } -func (env *Environment) EventuallyExpectNodeClaimCount(comparator string, count int) []*karpv1.NodeClaim { +func (env *Environment) EventuallyExpectLaunchedNodeClaimCount(comparator string, count int) []*karpv1.NodeClaim { GinkgoHelper() By(fmt.Sprintf("waiting for nodes to be %s to %d", comparator, count)) nodeClaimList := &karpv1.NodeClaimList{} Eventually(func(g Gomega) { g.Expect(env.Client.List(env, nodeClaimList, client.HasLabels{test.DiscoveryLabel})).To(Succeed()) - g.Expect(len(nodeClaimList.Items)).To(BeNumerically(comparator, count), + g.Expect(lo.CountBy(nodeClaimList.Items, func(nc karpv1.NodeClaim) bool { return nc.StatusConditions().IsTrue(karpv1.ConditionTypeLaunched) })).To(BeNumerically(comparator, count), fmt.Sprintf("expected %d nodeclaims, had %d (%v)", count, len(nodeClaimList.Items), NodeClaimNames(lo.ToSlicePtr(nodeClaimList.Items)))) }).Should(Succeed()) return lo.ToSlicePtr(nodeClaimList.Items) diff --git a/test/suites/consolidation/suite_test.go b/test/suites/consolidation/suite_test.go index cba85ba2c181..725f660c8c34 100644 --- a/test/suites/consolidation/suite_test.go +++ b/test/suites/consolidation/suite_test.go @@ -398,7 +398,7 @@ var _ = Describe("Consolidation", Ordered, func() { // Ensure that we get three nodes tainted, and they have overlap during the consolidation env.EventuallyExpectTaintedNodeCount("==", 3) - env.EventuallyExpectNodeClaimCount("==", 8) + env.EventuallyExpectLaunchedNodeClaimCount("==", 8) env.EventuallyExpectNodeCount("==", 8) env.ConsistentlyExpectDisruptionsUntilNoneLeft(5, 3, 10*time.Minute) @@ -925,7 +925,7 @@ var _ = Describe("Consolidation", Ordered, func() { Replicas: 1, }) env.ExpectCreated(nodePool, nodeClass, dep) - env.EventuallyExpectNodeClaimsReady(env.EventuallyExpectNodeClaimCount("==", 1)...) + env.EventuallyExpectNodeClaimsReady(env.EventuallyExpectLaunchedNodeClaimCount("==", 1)...) n := env.EventuallyExpectNodeCount("==", int(1))[0] Expect(n.Labels).To(HaveKeyWithValue(corev1.LabelInstanceTypeStable, string(ec2types.InstanceTypeM5Large))) Expect(n.Labels).To(HaveKeyWithValue(karpv1.CapacityTypeLabelKey, karpv1.CapacityTypeOnDemand)) @@ -969,7 +969,7 @@ var _ = Describe("Consolidation", Ordered, func() { // Start by only enabling the m5.xlarge capacity reservation, ensuring it's provisioned nodeClass.Spec.CapacityReservationSelectorTerms = []v1.CapacityReservationSelectorTerm{{ID: xlargeCapacityReservationID}} env.ExpectCreated(nodePool, nodeClass, dep) - env.EventuallyExpectNodeClaimsReady(env.EventuallyExpectNodeClaimCount("==", 1)...) + env.EventuallyExpectNodeClaimsReady(env.EventuallyExpectLaunchedNodeClaimCount("==", 1)...) n := env.EventuallyExpectNodeCount("==", int(1))[0] Expect(n.Labels).To(HaveKeyWithValue(corev1.LabelInstanceTypeStable, string(ec2types.InstanceTypeM5Xlarge))) Expect(n.Labels).To(HaveKeyWithValue(karpv1.CapacityTypeLabelKey, karpv1.CapacityTypeReserved)) diff --git a/test/suites/drift/suite_test.go b/test/suites/drift/suite_test.go index af176867cbd5..5d90bb7d2a51 100644 --- a/test/suites/drift/suite_test.go +++ b/test/suites/drift/suite_test.go @@ -1000,7 +1000,7 @@ var _ = Describe("Drift", Ordered, func() { }, }) env.ExpectCreated(nodePool, nodeClass, pod) - nc := env.EventuallyExpectNodeClaimCount("==", 1)[0] + nc := env.EventuallyExpectLaunchedNodeClaimCount("==", 1)[0] env.EventuallyExpectNodeClaimsReady(nc) n := env.EventuallyExpectCreatedNodeCount("==", 1)[0] Expect(n.Labels).To(HaveKeyWithValue(corev1.LabelInstanceTypeStable, string(ec2types.InstanceTypeM5Large))) @@ -1037,7 +1037,7 @@ var _ = Describe("Drift", Ordered, func() { }) env.ExpectCreated(nodePool, nodeClass, pod) - nc := env.EventuallyExpectNodeClaimCount("==", 1)[0] + nc := env.EventuallyExpectLaunchedNodeClaimCount("==", 1)[0] req, ok := lo.Find(nc.Spec.Requirements, func(req karpv1.NodeSelectorRequirementWithMinValues) bool { return req.Key == v1.LabelCapacityReservationID }) diff --git a/test/suites/integration/instance_profile_test.go b/test/suites/integration/instance_profile_test.go index f26ca6570ed8..d5b52fd1806a 100644 --- a/test/suites/integration/instance_profile_test.go +++ b/test/suites/integration/instance_profile_test.go @@ -94,7 +94,7 @@ var _ = Describe("InstanceProfile Generation", func() { nodeClass.Spec.Role = fmt.Sprintf("KarpenterNodeRole-%s", "invalidRole") env.ExpectCreated(nodeClass) ExpectStatusConditions(env, env.Client, 1*time.Minute, nodeClass, status.Condition{Type: v1.ConditionTypeInstanceProfileReady, Status: metav1.ConditionUnknown}) - ExpectStatusConditions(env, env.Client, 1*time.Minute, nodeClass, status.Condition{Type: v1.ConditionTypeValidationSucceeded, Status: metav1.ConditionFalse}) - ExpectStatusConditions(env, env.Client, 1*time.Minute, nodeClass, status.Condition{Type: status.ConditionReady, Status: metav1.ConditionFalse}) + ExpectStatusConditions(env, env.Client, 1*time.Minute, nodeClass, status.Condition{Type: v1.ConditionTypeValidationSucceeded, Status: metav1.ConditionUnknown}) + ExpectStatusConditions(env, env.Client, 1*time.Minute, nodeClass, status.Condition{Type: status.ConditionReady, Status: metav1.ConditionUnknown}) }) }) diff --git a/test/suites/integration/nodeclass_test.go b/test/suites/integration/nodeclass_test.go index a95ca5a9f0bd..1fe6eacbdf47 100644 --- a/test/suites/integration/nodeclass_test.go +++ b/test/suites/integration/nodeclass_test.go @@ -27,6 +27,7 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" v1 "github.com/aws/karpenter-provider-aws/pkg/apis/v1" @@ -54,29 +55,26 @@ var _ = Describe("NodeClass IAM Permissions", func() { }`, action) deployment := &appsv1.Deployment{} - err := env.Client.Get(env.Context, types.NamespacedName{ + Expect(env.Client.Get(env.Context, types.NamespacedName{ Namespace: "kube-system", Name: "karpenter", - }, deployment) - Expect(err).To(BeNil()) + }, deployment)).To(Succeed()) sa := &corev1.ServiceAccount{} - err = env.Client.Get(env.Context, types.NamespacedName{ + Expect(env.Client.Get(env.Context, types.NamespacedName{ Namespace: "kube-system", Name: deployment.Spec.Template.Spec.ServiceAccountName, - }, sa) - Expect(err).To(BeNil()) + }, sa)).To(Succeed()) roleName = strings.Split(sa.Annotations["eks.amazonaws.com/role-arn"], "/")[1] policyName = fmt.Sprintf("TestPolicy-%s", uuid.New().String()) - _, err = env.IAMAPI.PutRolePolicy(env.Context, &iam.PutRolePolicyInput{ + _, err := env.IAMAPI.PutRolePolicy(env.Context, &iam.PutRolePolicyInput{ RoleName: aws.String(roleName), PolicyName: aws.String(policyName), PolicyDocument: aws.String(policyDoc), }) Expect(err).To(BeNil()) - DeferCleanup(func() { _, err := env.IAMAPI.DeleteRolePolicy(env.Context, &iam.DeleteRolePolicyInput{ RoleName: aws.String(roleName), @@ -87,10 +85,10 @@ var _ = Describe("NodeClass IAM Permissions", func() { env.ExpectCreated(nodeClass) Eventually(func(g Gomega) { - env.ExpectUpdated(nodeClass) + g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodeClass), nodeClass)).To(Succeed()) g.Expect(nodeClass.StatusConditions().Get(v1.ConditionTypeValidationSucceeded).IsFalse()).To(BeTrue()) g.Expect(nodeClass.StatusConditions().Get(v1.ConditionTypeValidationSucceeded).Reason).To(Equal(expectedMessage)) - }, "240s", "5s").Should(Succeed()) + }).Should(Succeed()) ExpectStatusConditions(env, env.Client, 1*time.Minute, nodeClass, status.Condition{Type: status.ConditionReady, Status: metav1.ConditionFalse, Message: "ValidationSucceeded=False"}) }, Entry("should fail when CreateFleet is denied", @@ -107,8 +105,8 @@ var _ = Describe("NodeClass IAM Permissions", func() { It("should succeed with all required permissions", func() { env.ExpectCreated(nodeClass) Eventually(func(g Gomega) { - env.ExpectUpdated(nodeClass) + g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodeClass), nodeClass)).To(Succeed()) g.Expect(nodeClass.StatusConditions().Get(v1.ConditionTypeValidationSucceeded).IsTrue()).To(BeTrue()) - }, "60s", "5s").Should(Succeed()) + }).Should(Succeed()) }) }) diff --git a/test/suites/scheduling/suite_test.go b/test/suites/scheduling/suite_test.go index 8ca777df9a7c..c5e74278707a 100644 --- a/test/suites/scheduling/suite_test.go +++ b/test/suites/scheduling/suite_test.go @@ -771,7 +771,7 @@ var _ = Describe("Scheduling", Ordered, ContinueOnFailure, func() { }) env.ExpectCreated(nodePool, nodeClass, pod) - nc := env.EventuallyExpectNodeClaimCount("==", 1)[0] + nc := env.EventuallyExpectLaunchedNodeClaimCount("==", 1)[0] req, ok := lo.Find(nc.Spec.Requirements, func(req karpv1.NodeSelectorRequirementWithMinValues) bool { return req.Key == v1.LabelCapacityReservationID }) @@ -808,7 +808,7 @@ var _ = Describe("Scheduling", Ordered, ContinueOnFailure, func() { env.ExpectCreated(nodePool, nodeClass, pods[0], pods[1]) reservedCount := 0 - for _, nc := range env.EventuallyExpectNodeClaimCount("==", 2) { + for _, nc := range env.EventuallyExpectLaunchedNodeClaimCount("==", 2) { req, ok := lo.Find(nc.Spec.Requirements, func(req karpv1.NodeSelectorRequirementWithMinValues) bool { return req.Key == v1.LabelCapacityReservationID })