VPA: prune stale container aggregates, split recommendations over true number of containers #6745

Open · wants to merge 12 commits into master
31 changes: 31 additions & 0 deletions vertical-pod-autoscaler/e2e/v1/common.go
@@ -352,6 +352,14 @@ func PatchVpaRecommendation(f *framework.Framework, vpa *vpa_types.VerticalPodAu
gomega.Expect(err).NotTo(gomega.HaveOccurred(), "Failed to patch VPA.")
}

// PatchDeployment patches a deployment with a given patch.
func PatchDeployment(f *framework.Framework, deployment *appsv1.Deployment, patch *patchRecord) {
patchBytes, err := json.Marshal([]patchRecord{*patch})
gomega.Expect(err).NotTo(gomega.HaveOccurred())
_, err = f.ClientSet.AppsV1().Deployments(f.Namespace.Name).Patch(context.TODO(), deployment.Name, types.JSONPatchType, patchBytes, metav1.PatchOptions{})
gomega.Expect(err).NotTo(gomega.HaveOccurred(), "unexpected error patching deployment")
}

// AnnotatePod adds annotation for an existing pod.
func AnnotatePod(f *framework.Framework, podName, annotationName, annotationValue string) {
bytes, err := json.Marshal([]patchRecord{{
Expand Down Expand Up @@ -498,6 +506,29 @@ func WaitForUncappedCPURecommendationAbove(c vpa_clientset.Interface, vpa *vpa_t
})
}

// WaitForNumberOfCheckpoints polls until the specified number of VerticalPodAutoscalerCheckpoints is present.
// Returns the list of checkpoints, or an error on timeout.
func WaitForNumberOfCheckpoints(c vpa_clientset.Interface, namespace string, count int) (*vpa_types.VerticalPodAutoscalerCheckpointList, error) {
var checkpoints *vpa_types.VerticalPodAutoscalerCheckpointList
err := wait.PollUntilContextTimeout(context.Background(), pollInterval, pollTimeout, true, func(ctx context.Context) (done bool, err error) {
checkpoints, err = c.AutoscalingV1().VerticalPodAutoscalerCheckpoints(namespace).List(ctx, metav1.ListOptions{})
if err != nil {
return false, err
}

if len(checkpoints.Items) == count {
return true, nil
}

return false, nil
})

if err != nil {
return nil, fmt.Errorf("error waiting for %d checkpoints: %w", count, err)
}
return checkpoints, nil
}

func installLimitRange(f *framework.Framework, minCpuLimit, minMemoryLimit, maxCpuLimit, maxMemoryLimit *resource.Quantity, lrType apiv1.LimitType) {
lr := &apiv1.LimitRange{
ObjectMeta: metav1.ObjectMeta{
178 changes: 164 additions & 14 deletions vertical-pod-autoscaler/e2e/v1/recommender.go
@@ -29,6 +29,7 @@ import (
vpa_types "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1"
vpa_clientset "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/client/clientset/versioned"
"k8s.io/autoscaler/vertical-pod-autoscaler/pkg/utils/test"
vpa_api_util "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/utils/vpa"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/cache"
klog "k8s.io/klog/v2"
@@ -141,21 +142,9 @@ var _ = RecommenderE2eDescribe("Checkpoints", func() {
gomega.Expect(err).NotTo(gomega.HaveOccurred())

klog.InfoS("Sleeping for up to 15 minutes...")

maxRetries := 90
retryDelay := 10 * time.Second
for i := 0; i < maxRetries; i++ {
list, err := vpaClientSet.AutoscalingV1().VerticalPodAutoscalerCheckpoints(ns).List(context.TODO(), metav1.ListOptions{})
if err == nil && len(list.Items) == 0 {
break
}
klog.InfoS("Still waiting...")
time.Sleep(retryDelay)
}

list, err := vpaClientSet.AutoscalingV1().VerticalPodAutoscalerCheckpoints(ns).List(context.TODO(), metav1.ListOptions{})
checkpoints, err := WaitForNumberOfCheckpoints(vpaClientSet, ns, 0)
gomega.Expect(err).NotTo(gomega.HaveOccurred())
gomega.Expect(list.Items).To(gomega.BeEmpty())
gomega.Expect(checkpoints.Items).To(gomega.BeEmpty())
})
})

@@ -411,6 +400,167 @@ var _ = RecommenderE2eDescribe("VPA CRD object", func() {
})
})

const recommendationLoopInterval = 1 * time.Minute

var _ = RecommenderE2eDescribe("VPA CRD object", func() {
f := framework.NewDefaultFramework("vertical-pod-autoscaling")
f.NamespacePodSecurityEnforceLevel = podsecurity.LevelBaseline

var vpaClientSet vpa_clientset.Interface

ginkgo.BeforeEach(func() {
vpaClientSet = getVpaClientSet(f)
})

ginkgo.It("only provides recommendation to containers that exist when renaming a container", func() {
ginkgo.By("Setting up a hamster deployment")
d := NewNHamstersDeployment(f, 1 /*number of containers*/)
_ = startDeploymentPods(f, d)

ginkgo.By("Setting up VPA CRD")
vpaCRD := test.VerticalPodAutoscaler().
WithName("hamster-vpa").
WithNamespace(f.Namespace.Name).
WithTargetRef(hamsterTargetRef).
WithContainer("*").
WithAnnotations(map[string]string{
vpa_api_util.VpaPruningGracePeriodAnnotation: "0",
}).
Get()

InstallVPA(f, vpaCRD)

ginkgo.By("Waiting for recommendation to be filled for the container")
vpa, err := WaitForRecommendationPresent(vpaClientSet, vpaCRD)
gomega.Expect(err).NotTo(gomega.HaveOccurred())
gomega.Expect(vpa.Status.Recommendation.ContainerRecommendations).Should(gomega.HaveLen(1))
gomega.Expect(vpa.Status.Recommendation.ContainerRecommendations[0].ContainerName).To(gomega.Equal(GetHamsterContainerNameByIndex(0)))

ginkgo.By("Renaming the container")
newContainerName := "renamed-container"
patchRecord := &patchRecord{
Op: "replace",
Path: "/spec/template/spec/containers/0/name",
Value: newContainerName,
}
PatchDeployment(f, d, patchRecord)

ginkgo.By("Waiting for recommendation to be filled for the renamed container and only the renamed container")
time.Sleep(recommendationLoopInterval)
vpa, err = WaitForRecommendationPresent(vpaClientSet, vpaCRD)

gomega.Expect(err).NotTo(gomega.HaveOccurred())
errMsg := fmt.Sprintf("%s is the only container in the VPA CR. There should not be any recommendations for %s",
newContainerName,
GetHamsterContainerNameByIndex(0))
gomega.Expect(vpa.Status.Recommendation.ContainerRecommendations).Should(gomega.HaveLen(1), errMsg)
gomega.Expect(vpa.Status.Recommendation.ContainerRecommendations[0].ContainerName).To(gomega.Equal(newContainerName), errMsg)
})

ginkgo.It("only provides recommendation to containers that exist when removing a container + deletes stale checkpoint", func() {
ginkgo.By("Setting up a hamster deployment")
d := NewNHamstersDeployment(f, 2 /*number of containers*/)
_ = startDeploymentPods(f, d)
vpaName := "hamster-vpa"

ginkgo.By("Setting up VPA CRD")
vpaCRD := test.VerticalPodAutoscaler().
WithName(vpaName).
WithNamespace(f.Namespace.Name).
WithTargetRef(hamsterTargetRef).
WithContainer("*").
WithAnnotations(map[string]string{
vpa_api_util.VpaPruningGracePeriodAnnotation: "0",
}).
Get()

InstallVPA(f, vpaCRD)

ginkgo.By("Waiting for recommendation to be filled for both containers")
vpa, err := WaitForRecommendationPresent(vpaClientSet, vpaCRD)
gomega.Expect(err).NotTo(gomega.HaveOccurred())
gomega.Expect(vpa.Status.Recommendation.ContainerRecommendations).Should(gomega.HaveLen(2))

ginkgo.By("Waiting for VPA checkpoints for each container (2 checkpoints), sleeping for up to 15 minutes...")
checkpoints, err := WaitForNumberOfCheckpoints(vpaClientSet, f.Namespace.Name, 2)
gomega.Expect(err).NotTo(gomega.HaveOccurred())
gomega.Expect(checkpoints.Items).To(gomega.HaveLen(2))

ginkgo.By("Removing the second container")
patchRecord := &patchRecord{
Op: "remove",
Path: "/spec/template/spec/containers/1",
}
PatchDeployment(f, d, patchRecord)

ginkgo.By("Waiting for recommendation to be filled for just one container")
time.Sleep(recommendationLoopInterval)
vpa, err = WaitForRecommendationPresent(vpaClientSet, vpaCRD)
gomega.Expect(err).NotTo(gomega.HaveOccurred())
errMsg := fmt.Sprintf("%s is now the only container in the VPA CR. There should not be any recommendations for %s",
GetHamsterContainerNameByIndex(0),
GetHamsterContainerNameByIndex(1))
gomega.Expect(vpa.Status.Recommendation.ContainerRecommendations).Should(gomega.HaveLen(1), errMsg)
gomega.Expect(vpa.Status.Recommendation.ContainerRecommendations[0].ContainerName).To(gomega.Equal(GetHamsterContainerNameByIndex(0)), errMsg)

ginkgo.By("Waiting for the garbage collection of the stale checkpoint, sleeping for up to 15 minutes...")
checkpoints, err = WaitForNumberOfCheckpoints(vpaClientSet, f.Namespace.Name, 1)
expectedCheckpointName := fmt.Sprintf("%s-%s", vpaName, GetHamsterContainerNameByIndex(0))
gomega.Expect(err).NotTo(gomega.HaveOccurred())
gomega.Expect(checkpoints.Items).To(gomega.HaveLen(1))
gomega.Expect(checkpoints.Items[0].Name).To(gomega.Equal(expectedCheckpointName),
fmt.Sprintf("Expected checkpoint name to be %s, got %s", expectedCheckpointName, checkpoints.Items[0].Name))
})

ginkgo.It("only removes a recommendation until after the pruning grace period", func() {
ginkgo.By("Setting up a hamster deployment")
d := NewNHamstersDeployment(f, 2 /*number of containers*/)
_ = startDeploymentPods(f, d)

ginkgo.By("Setting up VPA CRD")
vpaCRD := test.VerticalPodAutoscaler().
WithName("hamster-vpa").
WithNamespace(f.Namespace.Name).
WithTargetRef(hamsterTargetRef).
WithContainer("*").
WithAnnotations(map[string]string{
vpa_api_util.VpaPruningGracePeriodAnnotation: "3m",
}).
Get()

InstallVPA(f, vpaCRD)

ginkgo.By("Waiting for recommendation to be filled for the container")
vpa, err := WaitForRecommendationPresent(vpaClientSet, vpaCRD)
gomega.Expect(err).NotTo(gomega.HaveOccurred())
gomega.Expect(vpa.Status.Recommendation.ContainerRecommendations).Should(gomega.HaveLen(2))

ginkgo.By("Removing the second container")
patchRecord := &patchRecord{
Op: "remove",
Path: "/spec/template/spec/containers/1",
}
PatchDeployment(f, d, patchRecord)

ginkgo.By("Waiting the duration of the grace period, hoping the recommendation is still there, sleeping for 3 minutes...")
vpa, err = WaitForRecommendationPresent(vpaClientSet, vpaCRD)
gomega.Expect(err).NotTo(gomega.HaveOccurred())
gomega.Expect(vpa.Status.Recommendation.ContainerRecommendations).Should(gomega.HaveLen(2))

ginkgo.By("Waiting for recommendation to be filled for just one container")
vpa, err = WaitForVPAMatch(vpaClientSet, vpaCRD, func(thisVpa *vpa_types.VerticalPodAutoscaler) bool {
return thisVpa.Status.Recommendation != nil && len(thisVpa.Status.Recommendation.ContainerRecommendations) == 1
})
gomega.Expect(err).NotTo(gomega.HaveOccurred())
errMsg := fmt.Sprintf("%s is now the only container in the VPA CR. There should not be any recommendations for %s",
GetHamsterContainerNameByIndex(0),
GetHamsterContainerNameByIndex(1))
gomega.Expect(vpa.Status.Recommendation.ContainerRecommendations).Should(gomega.HaveLen(1), errMsg)
gomega.Expect(vpa.Status.Recommendation.ContainerRecommendations[0].ContainerName).To(gomega.Equal(GetHamsterContainerNameByIndex(0)), errMsg)
})

})

func deleteRecommender(c clientset.Interface) error {
namespace := "kube-system"
listOptions := metav1.ListOptions{}
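Editor's note: the grace-period behavior exercised above is driven entirely by the VpaPruningGracePeriodAnnotation on the VPA object. A minimal sketch of what that looks like on a user-facing VPA follows; the name and namespace are placeholders, and the builder calls in the tests above produce an equivalent object.

import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	vpa_types "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1"
	vpa_api_util "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/utils/vpa"
)

// buildVPAWithGracePeriod sketches a VPA whose stale container aggregates are
// kept for 3 minutes before pruning; "hamster-vpa" is a placeholder name.
func buildVPAWithGracePeriod(namespace string) *vpa_types.VerticalPodAutoscaler {
	return &vpa_types.VerticalPodAutoscaler{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "hamster-vpa",
			Namespace: namespace,
			Annotations: map[string]string{
				vpa_api_util.VpaPruningGracePeriodAnnotation: "3m",
			},
		},
	}
}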
70 changes: 70 additions & 0 deletions vertical-pod-autoscaler/pkg/recommender/input/cluster_feeder.go
@@ -73,6 +73,12 @@ type ClusterStateFeeder interface {

// GarbageCollectCheckpoints removes historical checkpoints that don't have a matching VPA.
GarbageCollectCheckpoints()

// MarkAggregates marks all aggregates in all VPAs as not under their VPA.
MarkAggregates()

// SweepAggregates garbage collects all aggregates in all VPAs' aggregate lists that are no longer under a VPA.
SweepAggregates()
}

// ClusterStateFeederFactory makes instances of ClusterStateFeeder.
@@ -208,6 +214,7 @@ func (feeder *clusterStateFeeder) InitFromHistoryProvider(historyProvider histor
}
for podID, podHistory := range clusterHistory {
klog.V(4).InfoS("Adding pod with labels", "pod", podID, "labels", podHistory.LastLabels)
_, existedBefore := feeder.clusterState.Pods[podID]
feeder.clusterState.AddOrUpdatePod(podID, podHistory.LastLabels, apiv1.PodUnknown)
for containerName, sampleList := range podHistory.Samples {
containerID := model.ContainerID{
@@ -228,6 +235,14 @@
}
}
}
// If the pod never existed before, AddOrUpdatePod did not set VPAContainersPerPod because podState.Containers
// had not yet been initialized by AddOrUpdateContainer. So we explicitly set it here the first time we see the pod.
if !existedBefore {
podState, podExists := feeder.clusterState.Pods[podID]
if podExists && len(podHistory.Samples) > 1 {
feeder.clusterState.SetVPAContainersPerPod(podState, false)
}
}
}
}

@@ -316,6 +331,20 @@ func (feeder *clusterStateFeeder) GarbageCollectCheckpoints() {
klog.ErrorS(err, "Orphaned VPA checkpoint cleanup - error deleting", "checkpoint", klog.KRef(namespace, checkpoint.Name))
}
}
// Also clean up a checkpoint if the VPA is still there but the container is gone. AggregateStateByContainerName
// merges in the initial aggregates, so we can use it to check both lists (initial and regular aggregates) at once.
vpa, vpaExists := feeder.clusterState.Vpas[vpaID]
if vpaExists {
_, aggregateExists := vpa.AggregateStateByContainerName()[checkpoint.Spec.ContainerName]
if !aggregateExists {
err = feeder.vpaCheckpointClient.VerticalPodAutoscalerCheckpoints(namespace).Delete(context.TODO(), checkpoint.Name, metav1.DeleteOptions{})
if err == nil {
klog.V(3).InfoS("Orphaned VPA checkpoint cleanup - deleting", "checkpoint", klog.KRef(namespace, checkpoint.Name))
} else {
klog.ErrorS(err, "Orphaned VPA checkpoint cleanup - error deleting", "checkpoint", klog.KRef(namespace, checkpoint.Name))
}
}
}
}
}
}
@@ -413,6 +442,38 @@ func (feeder *clusterStateFeeder) LoadVPAs(ctx context.Context) {
feeder.clusterState.ObservedVpas = vpaCRDs
}

// MarkAggregates marks all aggregates with IsUnderVPA=false, so that when we go
// through LoadPods(), the valid ones are marked back to true and the remaining
// false ones can be garbage collected from the VPAs' aggregate lists.
func (feeder *clusterStateFeeder) MarkAggregates() {
for _, vpa := range feeder.clusterState.Vpas {
for _, container := range vpa.AggregateContainerStates() {
container.IsUnderVPA = false
}
for _, container := range vpa.ContainersInitialAggregateState {
container.IsUnderVPA = false
}
}
}

// SweepAggregates prunes all aggregates and initial aggregates from a VPA
// when the containers they refer to are no longer present.
func (feeder *clusterStateFeeder) SweepAggregates() {
now := time.Now()
for _, vpa := range feeder.clusterState.Vpas {
// Use the merged aggregate state to check both the initial and regular aggregates.
for containerName, container := range vpa.AggregateStateByContainerName() {
if !container.IsUnderVPA && container.IsAggregateStale(now) {
klog.V(4).InfoS("Deleting stale aggregate container states; container no longer present",
"namespace", vpa.ID.Namespace,
"vpaName", vpa.ID.VpaName,
"containerName", containerName)
vpa.DeleteAllAggregatesByContainerName(containerName)
}
}
}
}

// LoadPods loads pods into the cluster state.
func (feeder *clusterStateFeeder) LoadPods() {
podSpecs, err := feeder.specClient.GetPodSpecs()
@@ -433,12 +494,21 @@
if feeder.memorySaveMode && !feeder.matchesVPA(pod) {
continue
}
_, existedBefore := feeder.clusterState.Pods[pod.ID]
feeder.clusterState.AddOrUpdatePod(pod.ID, pod.PodLabels, pod.Phase)
for _, container := range pod.Containers {
if err = feeder.clusterState.AddOrUpdateContainer(container.ID, container.Request); err != nil {
klog.V(0).InfoS("Failed to add container", "container", container.ID, "error", err)
}
}
// If the pod never existed before, AddOrUpdatePod did not set VPAContainersPerPod because podState.Containers
// had not yet been initialized by AddOrUpdateContainer. So we explicitly set it here the first time we see the pod.
if !existedBefore {
podState, podExists := feeder.clusterState.Pods[pod.ID]
if podExists && len(pod.Containers) > 1 {
feeder.clusterState.SetVPAContainersPerPod(podState, false)
}
}
}
}

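Editor's note: taken together, the two new feeder hooks implement a mark-and-sweep pass over container aggregates. Below is a minimal sketch of how one recommendation cycle might sequence them; the function name and call order are assumptions for illustration, since the actual call sites live in the recommender's main loop and are not part of this diff.

import (
	"context"

	"k8s.io/autoscaler/vertical-pod-autoscaler/pkg/recommender/input"
)

// runOnce sketches one recommendation cycle using only methods shown in this diff.
func runOnce(ctx context.Context, feeder input.ClusterStateFeeder) {
	feeder.LoadVPAs(ctx)               // refresh the set of VPA objects
	feeder.MarkAggregates()            // mark: every aggregate starts the cycle with IsUnderVPA=false
	feeder.LoadPods()                  // containers that still exist flip their aggregates back to true
	feeder.SweepAggregates()           // sweep: prune aggregates that stayed false and are past their grace period
	feeder.GarbageCollectCheckpoints() // also removes checkpoints whose container aggregate is gone
}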
8 changes: 5 additions & 3 deletions vertical-pod-autoscaler/pkg/recommender/logic/recommender.go
@@ -22,6 +22,7 @@ import (

vpa_types "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1"
"k8s.io/autoscaler/vertical-pod-autoscaler/pkg/recommender/model"
"k8s.io/klog/v2"
)

var (
@@ -39,7 +40,7 @@

// PodResourceRecommender computes resource recommendation for a Vpa object.
type PodResourceRecommender interface {
GetRecommendedPodResources(containerNameToAggregateStateMap model.ContainerNameToAggregateStateMap) RecommendedPodResources
GetRecommendedPodResources(containerNameToAggregateStateMap model.ContainerNameToAggregateStateMap, containersPerPod int) RecommendedPodResources
}

// RecommendedPodResources is a Map from container name to recommended resources.
@@ -65,13 +66,14 @@ type podResourceRecommender struct {
upperBoundMemory MemoryEstimator
}

func (r *podResourceRecommender) GetRecommendedPodResources(containerNameToAggregateStateMap model.ContainerNameToAggregateStateMap) RecommendedPodResources {
func (r *podResourceRecommender) GetRecommendedPodResources(containerNameToAggregateStateMap model.ContainerNameToAggregateStateMap, containersPerPod int) RecommendedPodResources {
var recommendation = make(RecommendedPodResources)
if len(containerNameToAggregateStateMap) == 0 {
return recommendation
}

fraction := 1.0 / float64(len(containerNameToAggregateStateMap))
fraction := 1.0 / float64(containersPerPod)
klog.V(5).InfoS("Spreading recommendation across containers", "containerCount", containersPerPod, "fraction", fraction)
minCPU := model.ScaleResource(model.CPUAmountFromCores(*podMinCPUMillicores*0.001), fraction)
minMemory := model.ScaleResource(model.MemoryAmountFromBytes(*podMinMemoryMb*1024*1024), fraction)

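Editor's note: the one-line fraction change above is the "split recommendations over true number of containers" half of this PR. Per-pod minimums are now divided by the pod spec's actual container count instead of by the size of the aggregate map, which can transiently contain stale entries. A toy illustration with made-up counts:

package main

import "fmt"

func main() {
	// A container was just renamed, so the aggregate map briefly holds both
	// the old and the new name while the pod spec has a single container.
	aggregatesInMap := 2
	containersPerPod := 1

	oldFraction := 1.0 / float64(aggregatesInMap)  // 0.5: minimums diluted by the stale aggregate
	newFraction := 1.0 / float64(containersPerPod) // 1.0: matches what is actually running

	fmt.Printf("old=%.2f new=%.2f\n", oldFraction, newFraction)
}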