-
Notifications
You must be signed in to change notification settings - Fork 407
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix: pool-coordinator #1126
Fix: pool-coordinator #1126
Changes from 2 commits
aa4964f
74d0f69
81413c6
c942e0c
144557a
02397dc
85a5e18
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -251,7 +251,7 @@ func (ycm *yurtHubCertManager) initCaCert() error { | |
klog.Infof("%s file not exists, so create it", caFile) | ||
} | ||
|
||
insecureRestConfig, err := createInsecureRestClientConfig(ycm.remoteServers[0]) | ||
insecureRestConfig, err := CreateInsecureRestClientConfig(ycm.remoteServers[0]) | ||
if err != nil { | ||
klog.Errorf("could not create insecure rest config, %v", err) | ||
return err | ||
|
@@ -524,8 +524,8 @@ func createBasic(apiServerAddr string, caCert []byte) *clientcmdapi.Config { | |
} | ||
} | ||
|
||
// createInsecureRestClientConfig create insecure rest client config. | ||
func createInsecureRestClientConfig(remoteServer *url.URL) (*restclient.Config, error) { | ||
// CreateInsecureRestClientConfig create insecure rest client config. | ||
func CreateInsecureRestClientConfig(remoteServer *url.URL) (*restclient.Config, error) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we expose the function? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'll fix it. |
||
if remoteServer == nil { | ||
return nil, fmt.Errorf("no healthy remote server") | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -43,6 +43,7 @@ var supportedResourceAndVerbsForFilter = map[string]map[string]sets.String{ | |
}, | ||
ServiceTopologyFilterName: { | ||
"endpoints": sets.NewString("list", "watch"), | ||
"pods": sets.NewString("list", "watch"), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why to add this entry? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'll fix it. |
||
"endpointslices": sets.NewString("list", "watch"), | ||
}, | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -161,7 +161,7 @@ func NewCoordinator( | |
poolScopedCacheSyncManager := &poolScopedCacheSyncManager{ | ||
ctx: ctx, | ||
proxiedClient: proxiedClient, | ||
coordinatorClient: cfg.CoordinatorClient, | ||
coordinatorClient: coordinatorClient, | ||
nodeName: cfg.NodeName, | ||
getEtcdStore: coordinator.getEtcdStore, | ||
} | ||
|
@@ -175,12 +175,10 @@ func NewCoordinator( | |
|
||
func (coordinator *coordinator) Run() { | ||
for { | ||
var poolCacheManager cachemanager.CacheManager | ||
var cancelEtcdStorage func() | ||
var cancelEtcdStorage = func() {} | ||
var needUploadLocalCache bool | ||
var needCancelEtcdStorage bool | ||
var isPoolCacheSynced bool | ||
var etcdStorage storage.Store | ||
var err error | ||
|
||
select { | ||
|
@@ -203,10 +201,9 @@ func (coordinator *coordinator) Run() { | |
needUploadLocalCache = true | ||
needCancelEtcdStorage = true | ||
isPoolCacheSynced = false | ||
etcdStorage = nil | ||
poolCacheManager = nil | ||
coordinator.poolCacheManager = nil | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do not update the coordinator field if we do not acquire lock, since these fields are used and exposed by For example, we assume that coordinator is healthy when However, if we update There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'll add lock here |
||
case LeaderHub: | ||
poolCacheManager, etcdStorage, cancelEtcdStorage, err = coordinator.buildPoolCacheStore() | ||
coordinator.poolCacheManager, coordinator.etcdStorage, cancelEtcdStorage, err = coordinator.buildPoolCacheStore() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ditto |
||
if err != nil { | ||
klog.Errorf("failed to create pool scoped cache store and manager, %v", err) | ||
continue | ||
|
@@ -249,14 +246,14 @@ func (coordinator *coordinator) Run() { | |
}) | ||
|
||
if coordinator.needUploadLocalCache { | ||
if err := coordinator.uploadLocalCache(etcdStorage); err != nil { | ||
if err := coordinator.uploadLocalCache(coordinator.etcdStorage); err != nil { | ||
klog.Errorf("failed to upload local cache when yurthub becomes leader, %v", err) | ||
} else { | ||
needUploadLocalCache = false | ||
} | ||
} | ||
case FollowerHub: | ||
poolCacheManager, etcdStorage, cancelEtcdStorage, err = coordinator.buildPoolCacheStore() | ||
coordinator.poolCacheManager, coordinator.etcdStorage, cancelEtcdStorage, err = coordinator.buildPoolCacheStore() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ditto |
||
if err != nil { | ||
klog.Errorf("failed to create pool scoped cache store and manager, %v", err) | ||
continue | ||
|
@@ -280,7 +277,7 @@ func (coordinator *coordinator) Run() { | |
}) | ||
|
||
if coordinator.needUploadLocalCache { | ||
if err := coordinator.uploadLocalCache(etcdStorage); err != nil { | ||
if err := coordinator.uploadLocalCache(coordinator.etcdStorage); err != nil { | ||
klog.Errorf("failed to upload local cache when yurthub becomes follower, %v", err) | ||
} else { | ||
needUploadLocalCache = false | ||
|
@@ -293,11 +290,9 @@ func (coordinator *coordinator) Run() { | |
// Because the caller of IsReady() may be concurrent. | ||
coordinator.Lock() | ||
if needCancelEtcdStorage { | ||
coordinator.cancelEtcdStorage() | ||
cancelEtcdStorage() | ||
} | ||
coordinator.electStatus = electorStatus | ||
coordinator.poolCacheManager = poolCacheManager | ||
coordinator.etcdStorage = etcdStorage | ||
coordinator.cancelEtcdStorage = cancelEtcdStorage | ||
coordinator.needUploadLocalCache = needUploadLocalCache | ||
coordinator.isPoolCacheSynced = isPoolCacheSynced | ||
|
@@ -316,7 +311,8 @@ func (coordinator *coordinator) IsReady() (cachemanager.CacheManager, bool) { | |
// If electStatus is not PendingHub, it means pool-coordinator is healthy. | ||
coordinator.Lock() | ||
defer coordinator.Unlock() | ||
if coordinator.electStatus != PendingHub && coordinator.isPoolCacheSynced && !coordinator.needUploadLocalCache { | ||
// fixme: coordinator.isPoolCacheSynced now is not considered | ||
if coordinator.electStatus != PendingHub && !coordinator.needUploadLocalCache { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why to remove the isPoolCacheSynced? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Now isPoolCacheSynced is never set to 'true'. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Acctually we need There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed by #1131 |
||
return coordinator.poolCacheManager, true | ||
} | ||
return nil, false | ||
|
@@ -430,7 +426,8 @@ type poolScopedCacheSyncManager struct { | |
|
||
func (p *poolScopedCacheSyncManager) EnsureStart() error { | ||
if !p.isRunning { | ||
if err := p.coordinatorClient.CoordinationV1().Leases(namespaceInformerLease).Delete(p.ctx, nameInformerLease, metav1.DeleteOptions{}); err != nil { | ||
err := p.coordinatorClient.CoordinationV1().Leases(namespaceInformerLease).Delete(p.ctx, nameInformerLease, metav1.DeleteOptions{}) | ||
if err != nil && !apierrors.IsNotFound(err) { | ||
return fmt.Errorf("failed to delete informer sync lease, %v", err) | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -55,7 +55,7 @@ func NewHubElector( | |
coordinatorClient: coordinatorClient, | ||
coordinatorHealthChecker: coordinatorHealthChecker, | ||
cloudAPIServerHealthChecker: cloudAPIServerHealthyChecker, | ||
electorStatus: make(chan int32), | ||
electorStatus: make(chan int32, 1), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why to set the capability of electorStatus as 1? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The capability set to 0 would cause deadlock in line: https://github.com/openyurtio/openyurt/blob/pool-coordinator-dev/pkg/yurthub/poolcoordinator/leader_election.go#L91 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about moving |
||
} | ||
|
||
rl, err := resourcelock.New(cfg.LeaderElection.ResourceLock, | ||
|
@@ -75,7 +75,7 @@ func NewHubElector( | |
RetryPeriod: cfg.LeaderElection.RetryPeriod.Duration, | ||
Callbacks: leaderelection.LeaderCallbacks{ | ||
OnStartedLeading: func(ctx context.Context) { | ||
klog.Infof("yurthub of %s became lease", cfg.NodeName) | ||
klog.Infof("yurthub of %s became leader", cfg.NodeName) | ||
he.electorStatus <- LeaderHub | ||
}, | ||
OnStoppedLeading: func() { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It seems not a good way to use
time.Sleep
to sync.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You are right. I'll change it to notification.