Skip to content

Commit

Permalink
fix cloneSet controller block caused by scale expectation leakage
Browse files Browse the repository at this point in the history
Signed-off-by: liheng.zms <liheng.zms@alibaba-inc.com>
  • Loading branch information
zmberg committed Nov 19, 2024
1 parent fa139cb commit 4f32307
Showing 1 changed file with 35 additions and 1 deletion.
36 changes: 35 additions & 1 deletion pkg/controller/cloneset/cloneset_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (

appsv1alpha1 "github.com/openkruise/kruise/apis/apps/v1alpha1"
kruiseclient "github.com/openkruise/kruise/pkg/client"
kubeClient "github.com/openkruise/kruise/pkg/client"
clonesetcore "github.com/openkruise/kruise/pkg/controller/cloneset/core"
revisioncontrol "github.com/openkruise/kruise/pkg/controller/cloneset/revision"
synccontrol "github.com/openkruise/kruise/pkg/controller/cloneset/sync"
Expand All @@ -41,6 +42,7 @@ import (
"github.com/openkruise/kruise/pkg/util/refmanager"
"github.com/openkruise/kruise/pkg/util/volumeclaimtemplate"

"github.com/prometheus/client_golang/prometheus"
apps "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
Expand All @@ -59,13 +61,16 @@ import (
"sigs.k8s.io/controller-runtime/pkg/event"
"sigs.k8s.io/controller-runtime/pkg/handler"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/metrics"
"sigs.k8s.io/controller-runtime/pkg/predicate"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
"sigs.k8s.io/controller-runtime/pkg/source"
)

// init wires up this package's process-wide hooks: the command-line flag that
// sets the CloneSet controller's worker count, and the scale-expectation
// leakage counter on the controller-runtime metrics registry.
func init() {
	const workersFlag = "cloneset-workers"

	// The current value of concurrentReconciles doubles as the flag default.
	flag.IntVar(
		&concurrentReconciles,
		workersFlag,
		concurrentReconciles,
		"Max concurrent workers for CloneSet controller.",
	)

	// MustRegister panics if the collector cannot be registered.
	metrics.Registry.MustRegister(CloneSetScaleExpectationLeakageMetrics)
}

var (
Expand All @@ -75,6 +80,16 @@ var (
minimumReplicasToPreDownloadImage int32 = 3
)

// CloneSetScaleExpectationLeakageMetrics is a counter vector labeled by the
// CloneSet's namespace and name. doReconcile bumps it by one when a scale
// expectation has stayed unsatisfied for at least expectations.ExpectationTimeout.
var CloneSetScaleExpectationLeakageMetrics = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Name: "cloneset_scale_expectation_leakage",
		Help: "CloneSet Scale Expectation Leakage Metrics",
	},
	// Label names, in the order callers must supply values: CloneSet namespace, then name.
	[]string{"namespace", "name"},
)

// Add creates a new CloneSet Controller and adds it to the Manager with default RBAC. The Manager will set fields on the Controller
// and Start it when the Manager is Started.
func Add(mgr manager.Manager) error {
Expand Down Expand Up @@ -229,8 +244,27 @@ func (r *ReconcileCloneSet) doReconcile(request reconcile.Request) (res reconcil
// If scaling expectations have not satisfied yet, just skip this reconcile.
if scaleSatisfied, unsatisfiedDuration, scaleDirtyPods := clonesetutils.ScaleExpectations.SatisfiedExpectations(request.String()); !scaleSatisfied {
if unsatisfiedDuration >= expectations.ExpectationTimeout {
// In some extreme scenarios, if the Pod is created and then quickly deleted, there may be event loss.
// Therefore, a timeout mechanism is needed to ensure that CloneSets can continue to work.

Check warning on line 248 in pkg/controller/cloneset/cloneset_controller.go

View check run for this annotation

Codecov / codecov/patch

pkg/controller/cloneset/cloneset_controller.go#L247-L248

Added lines #L247 - L248 were not covered by tests
klog.InfoS("Expectation unsatisfied overtime", "cloneSet", request, "scaleDirtyPods", scaleDirtyPods, "overTime", unsatisfiedDuration)
return reconcile.Result{}, nil
CloneSetScaleExpectationLeakageMetrics.WithLabelValues(request.Namespace, request.Name).Add(1)
for _, pods := range scaleDirtyPods {
for _, name := range pods {
_, err = kubeClient.GetGenericClient().KubeClient.CoreV1().Pods(request.Namespace).Get(context.TODO(), name, metav1.GetOptions{})
if err == nil {
klog.Warningf("CloneSet(%s/%s) ScaleExpectations leakage, but Pod(%s) already exist", request.Namespace, request.Name, name)
return reconcile.Result{RequeueAfter: 30 * time.Second}, nil
} else if !errors.IsNotFound(err) {
klog.ErrorS(err, "Failed to get Pod", "cloneSet", request, "pod", name)
return reconcile.Result{RequeueAfter: 3 * time.Second}, nil
}

Check warning on line 260 in pkg/controller/cloneset/cloneset_controller.go

View check run for this annotation

Codecov / codecov/patch

pkg/controller/cloneset/cloneset_controller.go#L250-L260

Added lines #L250 - L260 were not covered by tests
}
}
klog.InfoS("CloneSet ScaleExpectation DirtyPods no longer exists, and delete ScaleExpectation", "cloneSet", request)
clonesetutils.ScaleExpectations.DeleteExpectations(request.String())
// Requeue explicitly: after the scale expectation times out, there may be no subsequent
// Pod or CloneSet event, which would otherwise leave the CloneSet without another reconcile.
return reconcile.Result{RequeueAfter: 10 * time.Second}, nil

Check warning on line 267 in pkg/controller/cloneset/cloneset_controller.go

View check run for this annotation

Codecov / codecov/patch

pkg/controller/cloneset/cloneset_controller.go#L263-L267

Added lines #L263 - L267 were not covered by tests
}
klog.V(4).InfoS("Not satisfied scale", "cloneSet", request, "scaleDirtyPods", scaleDirtyPods)
return reconcile.Result{RequeueAfter: expectations.ExpectationTimeout - unsatisfiedDuration}, nil
Expand Down

0 comments on commit 4f32307

Please sign in to comment.