Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

optimize: add probe for cluster status #346

Merged
merged 1 commit into from
Dec 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions cmd/controller-manager/app/options/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ type Options struct {

NSAutoPropExcludeRegexp string
ClusterJoinTimeout time.Duration
ClusterStatusThreshold time.Duration
MemberObjectEnqueueDelay time.Duration

MaxPodListers int64
Expand Down Expand Up @@ -174,6 +175,13 @@ func (o *Options) AddFlags(flags *pflag.FlagSet, allControllers []string, disabl
time.Second*30,
"The period of health check for member clusters. The minimum value is "+MinClusterHealthCheckPeriod.String()+".",
)

flags.DurationVar(
&o.ClusterStatusThreshold,
"cluster-status-threshold",
time.Second*100,
"The threshold of member clusters status change.",
)
}

func (o *Options) addKlogFlags(flags *pflag.FlagSet) {
Expand Down
1 change: 1 addition & 0 deletions cmd/controller-manager/app/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ func getComponentConfig(opts *options.Options) (*controllercontext.ComponentConf
MemberObjectEnqueueDelay: opts.MemberObjectEnqueueDelay,
EnableKatalystSupport: opts.EnableKatalystSupport,
ClusterHealthCheckPeriod: opts.ClusterHealthCheckPeriod,
ClusterStatusThreshold: opts.ClusterStatusThreshold,
}

if opts.ClusterHealthCheckPeriod < options.MinClusterHealthCheckPeriod {
Expand Down
1 change: 1 addition & 0 deletions pkg/controllers/context/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,4 +85,5 @@ type ComponentConfig struct {
ResourceAggregationNodeFilter []labels.Selector
EnableKatalystSupport bool
ClusterHealthCheckPeriod time.Duration
ClusterStatusThreshold time.Duration
}
103 changes: 103 additions & 0 deletions pkg/controllers/federatedcluster/cluster_status_cache.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

This file may have been modified by The KubeAdmiral Authors
("KubeAdmiral Modifications"). All KubeAdmiral Modifications
are Copyright 2023 The KubeAdmiral Authors.
*/

package federatedcluster

import (
"context"
"sync"
"time"

"k8s.io/klog/v2"

fedcorev1a1 "github.com/kubewharf/kubeadmiral/pkg/apis/core/v1alpha1"
)

// clusterStatusStore keeps, per cluster name, the most recently observed
// status conditions, plus the duration used to hold back status changes.
type clusterStatusStore struct {
	// clusterStatusThreshold is how long a changed status must be held back
	// before thresholdAdjustedStatusCondition lets it through.
	clusterStatusThreshold time.Duration

	// clusterStatusData maps a cluster name to its cached
	// *clusterStatusConditionData entry.
	clusterStatusData sync.Map
}

// clusterStatusConditionData is one cache entry of clusterStatusStore: the
// last observed offline/ready conditions of a cluster and the time at which
// that observation was recorded.
type clusterStatusConditionData struct {
	// offlineCondition and readyCondition are the most recently cached
	// observations of the cluster's offline and ready conditions.
	offlineCondition, readyCondition fedcorev1a1.ClusterCondition

	// probeTimestamp is set when an observation with a different status
	// replaces the cached one; it stays at the zero value for the first entry
	// stored for a cluster.
	probeTimestamp time.Time
}

// thresholdAdjustedStatusCondition damps changes of a cluster's offline/ready
// conditions. When the freshly observed statuses differ from the ones currently
// recorded in cluster.Status, the recorded conditions keep being returned until
// the observed statuses have stayed unchanged for clusterStatusThreshold; after
// that the observed conditions are returned.
//
// The observed conditions are returned unchanged when there is no cached entry
// for the cluster yet, when cluster.Status lacks either condition, or when the
// observed statuses match the recorded ones.
//
// It returns the (offline, ready) condition pair the caller should apply.
func (c *clusterStatusStore) thresholdAdjustedStatusCondition(
ctx context.Context,
cluster *fedcorev1a1.FederatedCluster,
observedOfflineCondition fedcorev1a1.ClusterCondition,
observedReadyCondition fedcorev1a1.ClusterCondition,
) (fedcorev1a1.ClusterCondition, fedcorev1a1.ClusterCondition) {
logger := klog.FromContext(ctx)

// Look up the observation cached by a previous call for this cluster.
saved := c.get(cluster.Name)
if saved == nil {
// No cached entry yet (the original author notes: the cluster has just
// joined). Seed the cache and report the observation as-is; probeTimestamp
// is left at its zero value here.
c.update(cluster.Name, &clusterStatusConditionData{
offlineCondition: observedOfflineCondition,
readyCondition: observedReadyCondition,
})
return observedOfflineCondition, observedReadyCondition
}
// Conditions currently recorded on the cluster object; these are what gets
// returned while a change is being held back.
curOfflineCondition := getClusterCondition(&cluster.Status, fedcorev1a1.ClusterOffline)
curReadyCondition := getClusterCondition(&cluster.Status, fedcorev1a1.ClusterReady)
if curOfflineCondition == nil || curReadyCondition == nil {
// Either condition is missing from cluster.Status, so there is no complete
// old pair to fall back to: pass the observation through.
return observedOfflineCondition, observedReadyCondition
}

Check warning on line 65 in pkg/controllers/federatedcluster/cluster_status_cache.go

View check run for this annotation

Codecov / codecov/patch

pkg/controllers/federatedcluster/cluster_status_cache.go#L49-L65

Added lines #L49 - L65 were not covered by tests

now := time.Now()
if saved.offlineCondition.Status != observedOfflineCondition.Status || saved.readyCondition.Status != observedReadyCondition.Status {
// The observed status differs from the cached observation: replace the
// cache entry and restart the threshold window from now.
saved = &clusterStatusConditionData{
offlineCondition: observedOfflineCondition,
readyCondition: observedReadyCondition,
probeTimestamp: now,
}
c.update(cluster.Name, saved)
}

Check warning on line 76 in pkg/controllers/federatedcluster/cluster_status_cache.go

View check run for this annotation

Codecov / codecov/patch

pkg/controllers/federatedcluster/cluster_status_cache.go#L67-L76

Added lines #L67 - L76 were not covered by tests

if curOfflineCondition.Status != observedOfflineCondition.Status || curReadyCondition.Status != observedReadyCondition.Status {
// The observation disagrees with what cluster.Status records. While the
// threshold window that started at saved.probeTimestamp is still open,
// keep returning the recorded conditions.
if now.Before(saved.probeTimestamp.Add(c.clusterStatusThreshold)) {
logger.V(3).WithValues("offline", curOfflineCondition.Status, "ready", curReadyCondition.Status).
Info("Threshold not exceeded, return the old status condition")
return *curOfflineCondition, *curReadyCondition
}

Check warning on line 84 in pkg/controllers/federatedcluster/cluster_status_cache.go

View check run for this annotation

Codecov / codecov/patch

pkg/controllers/federatedcluster/cluster_status_cache.go#L78-L84

Added lines #L78 - L84 were not covered by tests

// The window has elapsed: log the change and fall through to return the
// observed conditions.
logger.V(3).WithValues("offline", observedOfflineCondition.Status, "ready", observedReadyCondition.Status).
Info("Cluster status condition changed")

Check warning on line 87 in pkg/controllers/federatedcluster/cluster_status_cache.go

View check run for this annotation

Codecov / codecov/patch

pkg/controllers/federatedcluster/cluster_status_cache.go#L86-L87

Added lines #L86 - L87 were not covered by tests
}

return observedOfflineCondition, observedReadyCondition

Check warning on line 90 in pkg/controllers/federatedcluster/cluster_status_cache.go

View check run for this annotation

Codecov / codecov/patch

pkg/controllers/federatedcluster/cluster_status_cache.go#L90

Added line #L90 was not covered by tests
}

// get returns the cached condition data stored under the given cluster name,
// or nil when nothing is stored under that name.
func (c *clusterStatusStore) get(cluster string) *clusterStatusConditionData {
condition, ok := c.clusterStatusData.Load(cluster)
if !ok {
return nil
}
// Unchecked type assertion: update, the only writer visible in this file,
// always stores *clusterStatusConditionData values.
return condition.(*clusterStatusConditionData)

Check warning on line 98 in pkg/controllers/federatedcluster/cluster_status_cache.go

View check run for this annotation

Codecov / codecov/patch

pkg/controllers/federatedcluster/cluster_status_cache.go#L93-L98

Added lines #L93 - L98 were not covered by tests
}

// update stores data as the cache entry for the given cluster name, replacing
// any entry previously stored under that name.
func (c *clusterStatusStore) update(cluster string, data *clusterStatusConditionData) {
c.clusterStatusData.Store(cluster, data)

Check warning on line 102 in pkg/controllers/federatedcluster/cluster_status_cache.go

View check run for this annotation

Codecov / codecov/patch

pkg/controllers/federatedcluster/cluster_status_cache.go#L101-L102

Added lines #L101 - L102 were not covered by tests
}
5 changes: 4 additions & 1 deletion pkg/controllers/federatedcluster/clusterstatus.go
Original file line number Diff line number Diff line change
Expand Up @@ -298,8 +298,11 @@
}

offlineCondition := getNewClusterOfflineCondition(offlineStatus, conditionTime)
setClusterCondition(&cluster.Status, &offlineCondition)
readyCondition := getNewClusterReadyCondition(readyStatus, readyReason, readyMessage, conditionTime)

offlineCondition, readyCondition = c.clusterStatusCache.thresholdAdjustedStatusCondition(ctx, cluster, offlineCondition, readyCondition)

setClusterCondition(&cluster.Status, &offlineCondition)

Check warning on line 305 in pkg/controllers/federatedcluster/clusterstatus.go

View check run for this annotation

Codecov / codecov/patch

pkg/controllers/federatedcluster/clusterstatus.go#L302-L305

Added lines #L302 - L305 were not covered by tests
setClusterCondition(&cluster.Status, &readyCondition)

if err := retry.RetryOnConflict(retry.DefaultBackoff, func() error {
Expand Down
5 changes: 5 additions & 0 deletions pkg/controllers/federatedcluster/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
clusterHealthCheckConfig *ClusterHealthCheckConfig
clusterJoinTimeout time.Duration
resourceAggregationNodeFilter []labels.Selector
clusterStatusCache clusterStatusStore

lock sync.Mutex
clusterConnectionHashes map[string]string
Expand Down Expand Up @@ -120,6 +121,10 @@
clusterHealthCheckConfig: &ClusterHealthCheckConfig{
Period: componentConfig.ClusterHealthCheckPeriod,
},
clusterStatusCache: clusterStatusStore{
clusterStatusData: sync.Map{},
clusterStatusThreshold: componentConfig.ClusterStatusThreshold,
},

Check warning on line 127 in pkg/controllers/federatedcluster/controller.go

View check run for this annotation

Codecov / codecov/patch

pkg/controllers/federatedcluster/controller.go#L124-L127

Added lines #L124 - L127 were not covered by tests

lock: sync.Mutex{},
clusterConnectionHashes: map[string]string{},
Expand Down
Loading