From 214dd0dba773af3b3c79b2a74595ba6dfe44822f Mon Sep 17 00:00:00 2001
From: James Munson
Date: Tue, 10 Dec 2024 14:41:10 -0700
Subject: [PATCH] fix: clear fast-failover status when node goes not ready

Signed-off-by: James Munson
---
 controller/node_controller.go   | 39 +++++++++++++++++++++++++++++++++
 controller/volume_controller.go |  2 +-
 datastore/kubernetes.go         |  6 ++++-
 3 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/controller/node_controller.go b/controller/node_controller.go
index 7f776bbc61..a2db253d7b 100644
--- a/controller/node_controller.go
+++ b/controller/node_controller.go
@@ -434,6 +434,13 @@ func (nc *NodeController) syncNode(key string) (err error) {
 		return err
 	}
 
+	// When RWX fast failover is enabled and this node is down or deleted, clear the delinquent state of the leases it holds.
+	// The delinquent marker has no further use once the node is known to be not ready.
+	if err = nc.clearDelinquentLeasesIfNodeNotReady(node); err != nil {
+		log.WithError(err).Warnf("Failed to clear delinquent leases for node %v", node.Name)
+		return err
+	}
+
 	node.Status.Region, node.Status.Zone = types.GetRegionAndZone(kubeNode.Labels)
 
 	if nc.controllerID != node.Name {
@@ -2218,3 +2225,35 @@ func (nc *NodeController) SetSchedulableCondition(node *longhorn.Node, kubeNode
 				corev1.EventTypeNormal)
 	}
 }
+
+func (nc *NodeController) clearDelinquentLeasesIfNodeNotReady(node *longhorn.Node) error {
+	enabled, err := nc.ds.GetSettingAsBool(types.SettingNameRWXVolumeFastFailover)
+	if err != nil {
+		return errors.Wrapf(err, "failed to get setting %v", types.SettingNameRWXVolumeFastFailover)
+	}
+	if !enabled {
+		return nil
+	}
+
+	isDownOrDeleted, err := nc.ds.IsNodeDownOrDeleted(node.Name)
+	if err != nil {
+		return errors.Wrapf(err, "failed to check IsNodeDownOrDeleted, node=%v", node.Name)
+	}
+	if !isDownOrDeleted {
+		return nil
+	}
+
+	sms, err := nc.ds.ListShareManagersRO()
+	if err != nil {
+		return errors.Wrap(err, "failed to list share managers")
+	}
+	for _, sm := range sms {
+		// A share manager, its volume and the volume's lease all use the same name, so sm.Name identifies the lease.
+		err = nc.ds.ClearDelinquentAndStaleStateIfVolumeIsDelinquent(sm.Name, node.Name)
+		if err != nil {
+			return errors.Wrapf(err, "failed to clear delinquent lease for volume %v, node %v", sm.Name, node.Name)
+		}
+	}
+
+	return nil
+}
diff --git a/controller/volume_controller.go b/controller/volume_controller.go
index 2251dcac73..cd98ae6165 100644
--- a/controller/volume_controller.go
+++ b/controller/volume_controller.go
@@ -1480,7 +1480,7 @@ func (c *VolumeController) handleDelinquentAndStaleStateForFaultedRWXVolume(v *l
 	if !isRegularRWXVolume(v) {
 		return nil
 	}
-	return c.ds.ClearDelinquentAndStaleStateIfVolumeIsDelinquent(v.Name)
+	return c.ds.ClearDelinquentAndStaleStateIfVolumeIsDelinquent(v.Name, "")
 }
 
 func (c *VolumeController) requestRemountIfFileSystemReadOnly(v *longhorn.Volume, e *longhorn.Engine) {
diff --git a/datastore/kubernetes.go b/datastore/kubernetes.go
index 2142c40501..fdf5f230f4 100644
--- a/datastore/kubernetes.go
+++ b/datastore/kubernetes.go
@@ -282,7 +282,7 @@ func (s *DataStore) UpdateLease(lease *coordinationv1.Lease) (*coordinationv1.Le
 	return s.kubeClient.CoordinationV1().Leases(s.namespace).Update(context.TODO(), lease, metav1.UpdateOptions{})
 }
 
-func (s *DataStore) ClearDelinquentAndStaleStateIfVolumeIsDelinquent(volumeName string) (err error) {
+func (s *DataStore) ClearDelinquentAndStaleStateIfVolumeIsDelinquent(volumeName string, nodeName string) (err error) {
 	defer func() {
 		err = errors.Wrapf(err, "failed to ClearDelinquentAndStaleStateIfVolumeIsDelinquent")
 	}()
@@ -301,6 +301,10 @@ func (s *DataStore) ClearDelinquentAndStaleStateIfVolumeIsDelinquent(volumeName
 		// Ref: IsRWXVolumeDelinquent() function
 		return nil
 	}
+	if nodeName != "" && nodeName != holder {
+		// A non-empty nodeName limits clearing to leases whose holder matches it; an empty nodeName applies no holder filter.
+		return nil
+	}
 	if !(lease.Spec.AcquireTime).IsZero() {
 		// Non Zero lease.Spec.AcquireTime means not delinquent.
 		// Ref: IsRWXVolumeDelinquent() function